/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/filter.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
#include <linux/syscalls.h>
#include <linux/error-injection.h>

#include "trace_probe.h"
#include "trace.h"

u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

/**
 * trace_call_bpf - invoke BPF program
 * @call: tracepoint event
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * the kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 */
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
	unsigned int ret;

	if (in_nmi()) /* not supported yet */
		return 1;

	preempt_disable();

	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/*
		 * since some bpf program is already running on this cpu,
		 * don't call into another bpf program (same or different)
		 * and don't send kprobe event into ring-buffer,
		 * so return zero here
		 */
		ret = 0;
		goto out;
	}

	/*
	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
	 * to all call sites, we did a bpf_prog_array_valid() there to check
	 * whether call->prog_array is empty or not, which is
	 * a heuristic to speed up execution.
	 *
	 * If the prog_array fetched by bpf_prog_array_valid() was
	 * non-NULL, we go into trace_call_bpf() and do the actual
	 * proper rcu_dereference() under the RCU lock.
	 * If it turns out that prog_array is NULL, we bail out.
	 * Conversely, if the pointer fetched by bpf_prog_array_valid()
	 * was NULL, we skip the prog_array and risk missing out on
	 * events if it was updated between that check and the
	 * rcu_dereference(), which is an accepted risk.
	 */
	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);

 out:
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
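
/*
 * Illustrative sketch only (not part of this file): a perf kprobe handler
 * is expected to consume the return value above roughly as
 *
 *	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 *		return;		// program returned 0, drop the event
 *	// otherwise record the kprobe event as usual
 *
 * so the early "return 1" on the NMI path keeps the event, while the
 * recursion path (ret = 0) drops it.
 */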

#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
	regs_set_return_value(regs, rc);
	override_function_with_return(regs);
	return 0;
}

static const struct bpf_func_proto bpf_override_return_proto = {
	.func = bpf_override_return,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_ANYTHING,
};
#endif

BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
	int ret;

	ret = probe_kernel_read(dst, unsafe_ptr, size);
	if (unlikely(ret < 0))
		memset(dst, 0, size);

	return ret;
}

static const struct bpf_func_proto bpf_probe_read_proto = {
	.func = bpf_probe_read,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_UNINIT_MEM,
	.arg2_type = ARG_CONST_SIZE_OR_ZERO,
	.arg3_type = ARG_ANYTHING,
};

BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
	   u32, size)
{
	/*
	 * Ensure we're in user context which is safe for the helper to
	 * run. This helper has no business in a kthread.
	 *
	 * access_ok() should prevent writing to non-user memory, but in
	 * some situations (nommu, temporary switch, etc) access_ok() does
	 * not provide enough validation, hence the check on KERNEL_DS.
	 */

	if (unlikely(in_interrupt() ||
		     current->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;
	if (unlikely(uaccess_kernel()))
		return -EPERM;
	if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
		return -EPERM;

	return probe_kernel_write(unsafe_ptr, src, size);
}

static const struct bpf_func_proto bpf_probe_write_user_proto = {
	.func = bpf_probe_write_user,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_ANYTHING,
	.arg2_type = ARG_PTR_TO_MEM,
	.arg3_type = ARG_CONST_SIZE,
};

static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
			    current->comm, task_pid_nr(current));

	return &bpf_probe_write_user_proto;
}
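
/*
 * Illustrative sketch only (BPF program side, not part of this file):
 *
 *	long val = 1;
 *	bpf_probe_write_user((void *)user_ptr, &val, sizeof(val));
 *
 * where user_ptr is a hypothetical user-space address taken from the
 * probed context. Any program load requesting this helper triggers the
 * (ratelimited) warning above.
 */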

/*
 * Only limited trace_printk() conversion specifiers allowed:
 * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
 */
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
	   u64, arg2, u64, arg3)
{
	bool str_seen = false;
	int mod[3] = {};
	int fmt_cnt = 0;
	u64 unsafe_addr;
	char buf[64];
	int i;

	/*
	 * bpf_check()->check_func_arg()->check_stack_boundary()
	 * guarantees that fmt points to bpf program stack,
	 * fmt_size bytes of it were initialized and fmt_size > 0
	 */
	if (fmt[--fmt_size] != 0)
		return -EINVAL;

	/* check format string for allowed specifiers */
	for (i = 0; i < fmt_size; i++) {
		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
			return -EINVAL;

		if (fmt[i] != '%')
			continue;

		if (fmt_cnt >= 3)
			return -EINVAL;

		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
		i++;
		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
		} else if (fmt[i] == 'p' || fmt[i] == 's') {
			mod[fmt_cnt]++;
			i++;
			if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
				return -EINVAL;
			fmt_cnt++;
			if (fmt[i - 1] == 's') {
				if (str_seen)
					/* allow only one '%s' per fmt string */
					return -EINVAL;
				str_seen = true;

				switch (fmt_cnt) {
				case 1:
					unsafe_addr = arg1;
					arg1 = (long) buf;
					break;
				case 2:
					unsafe_addr = arg2;
					arg2 = (long) buf;
					break;
				case 3:
					unsafe_addr = arg3;
					arg3 = (long) buf;
					break;
				}
				buf[0] = 0;
				strncpy_from_unsafe(buf,
						    (void *) (long) unsafe_addr,
						    sizeof(buf));
			}
			continue;
		}

		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
		}

		if (fmt[i] != 'i' && fmt[i] != 'd' &&
		    fmt[i] != 'u' && fmt[i] != 'x')
			return -EINVAL;
		fmt_cnt++;
	}

/* Horrid workaround for getting va_list handling working with different
 * argument type combinations generically for 32 and 64 bit archs.
 */
#define __BPF_TP_EMIT()	__BPF_ARG3_TP()
#define __BPF_TP(...)						\
	__trace_printk(0 /* Fake ip */,				\
		       fmt, ##__VA_ARGS__)

#define __BPF_ARG1_TP(...)						\
	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
	  ? __BPF_TP(arg1, ##__VA_ARGS__)				\
	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
	      ? __BPF_TP((long)arg1, ##__VA_ARGS__)			\
	      : __BPF_TP((u32)arg1, ##__VA_ARGS__)))

#define __BPF_ARG2_TP(...)						\
	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
	  ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__)				\
	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
	      ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__)		\
	      : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))

#define __BPF_ARG3_TP(...)						\
	((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64))	\
	  ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__)				\
	  : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32))	\
	      ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__)		\
	      : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))

	return __BPF_TP_EMIT();
}
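
/*
 * Illustrative sketch only (BPF program side, not part of this file):
 *
 *	char fmt[] = "comm %s pid %d\n";
 *	bpf_trace_printk(fmt, sizeof(fmt), comm, pid);
 *
 * The format string must live on the BPF stack, may use at most three
 * conversions and only the specifiers listed above, with a single %s.
 * Output appears in the tracing trace_pipe.
 */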

static const struct bpf_func_proto bpf_trace_printk_proto = {
	.func = bpf_trace_printk,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_MEM,
	.arg2_type = ARG_CONST_SIZE,
};

const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
	/*
	 * this program might be calling bpf_trace_printk,
	 * so allocate per-cpu printk buffers
	 */
	trace_printk_init_buffers();

	return &bpf_trace_printk_proto;
}

static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
		     u64 *value, u64 *enabled, u64 *running)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	unsigned int cpu = smp_processor_id();
	u64 index = flags & BPF_F_INDEX_MASK;
	struct bpf_event_entry *ee;

	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (index == BPF_F_CURRENT_CPU)
		index = cpu;
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

	ee = READ_ONCE(array->ptrs[index]);
	if (!ee)
		return -ENOENT;

	return perf_event_read_local(ee->event, value, enabled, running);
}

BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
	u64 value = 0;
	int err;

	err = get_map_perf_counter(map, flags, &value, NULL, NULL);
	/*
	 * this api is ugly since we miss [-22..-2] range of valid
	 * counter values, but that's uapi
	 */
	if (err)
		return err;
	return value;
}

static const struct bpf_func_proto bpf_perf_event_read_proto = {
	.func = bpf_perf_event_read,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
	   struct bpf_perf_event_value *, buf, u32, size)
{
	int err = -EINVAL;

	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
		goto clear;
	err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
				   &buf->running);
	if (unlikely(err))
		goto clear;
	return 0;
clear:
	memset(buf, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
	.func = bpf_perf_event_read_value,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_PTR_TO_UNINIT_MEM,
	.arg4_type = ARG_CONST_SIZE,
};

static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);

static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
			u64 flags, struct perf_sample_data *sd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	unsigned int cpu = smp_processor_id();
	u64 index = flags & BPF_F_INDEX_MASK;
	struct bpf_event_entry *ee;
	struct perf_event *event;

	if (index == BPF_F_CURRENT_CPU)
		index = cpu;
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

	ee = READ_ONCE(array->ptrs[index]);
	if (!ee)
		return -ENOENT;

	event = ee->event;
	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
		return -EINVAL;

	if (unlikely(event->oncpu != cpu))
		return -EOPNOTSUPP;

	perf_event_output(event, sd, regs);
	return 0;
}
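
/*
 * Illustrative sketch only (BPF program side): samples are typically
 * emitted into a BPF_MAP_TYPE_PERF_EVENT_ARRAY on the current CPU via
 *
 *	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
 *			      &data, sizeof(data));
 *
 * where "events" is a hypothetical map name; BPF_F_CURRENT_CPU selects
 * this CPU's entry in the array, as handled above.
 */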

BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
	struct perf_raw_record raw = {
		.frag = {
			.size = size,
			.data = data,
		},
	};

	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;

	perf_sample_data_init(sd, 0, 0);
	sd->raw = &raw;

	return __bpf_perf_event_output(regs, map, flags, sd);
}

static const struct bpf_func_proto bpf_perf_event_output_proto = {
	.func = bpf_perf_event_output,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_MEM,
	.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};

static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);

u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
{
	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
	struct perf_raw_frag frag = {
		.copy = ctx_copy,
		.size = ctx_size,
		.data = ctx,
	};
	struct perf_raw_record raw = {
		.frag = {
			{
				.next = ctx_size ? &frag : NULL,
			},
			.size = meta_size,
			.data = meta,
		},
	};

	perf_fetch_caller_regs(regs);
	perf_sample_data_init(sd, 0, 0);
	sd->raw = &raw;

	return __bpf_perf_event_output(regs, map, flags, sd);
}

BPF_CALL_0(bpf_get_current_task)
{
	return (long) current;
}

static const struct bpf_func_proto bpf_get_current_task_proto = {
	.func = bpf_get_current_task,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
};

BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct cgroup *cgrp;

	if (unlikely(idx >= array->map.max_entries))
		return -E2BIG;

	cgrp = READ_ONCE(array->ptrs[idx]);
	if (unlikely(!cgrp))
		return -EAGAIN;

	return task_under_cgroup_hierarchy(current, cgrp);
}

static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
	.func = bpf_current_task_under_cgroup,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	int ret;

	/*
	 * The strncpy_from_unsafe() call will likely not fill the entire
	 * buffer, but that's okay in this circumstance as we're probing
	 * arbitrary memory anyway similar to bpf_probe_read() and might
	 * as well probe the stack. Thus, memory is explicitly cleared
	 * only in error case, so that improper users ignoring return
	 * code altogether don't copy garbage; otherwise length of string
	 * is returned that can be used for bpf_perf_event_output() et al.
	 */
	ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
	if (unlikely(ret < 0))
		memset(dst, 0, size);

	return ret;
}
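
/*
 * Illustrative sketch only: the returned length (including the trailing
 * NUL) lets a program forward just the string bytes, e.g.
 *
 *	int len = bpf_probe_read_str(buf, sizeof(buf), unsafe_ptr);
 *	if (len > 0)
 *		bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU,
 *				      buf, len);
 *
 * "events" is again a hypothetical perf event array map, and len is
 * subject to the verifier's usual bounds checks.
 */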

static const struct bpf_func_proto bpf_probe_read_str_proto = {
	.func = bpf_probe_read_str,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_UNINIT_MEM,
	.arg2_type = ARG_CONST_SIZE_OR_ZERO,
	.arg3_type = ARG_ANYTHING,
};

static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_probe_read:
		return &bpf_probe_read_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_get_current_pid_tgid:
		return &bpf_get_current_pid_tgid_proto;
	case BPF_FUNC_get_current_task:
		return &bpf_get_current_task_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_current_comm:
		return &bpf_get_current_comm_proto;
	case BPF_FUNC_trace_printk:
		return bpf_get_trace_printk_proto();
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_get_numa_node_id:
		return &bpf_get_numa_node_id_proto;
	case BPF_FUNC_perf_event_read:
		return &bpf_perf_event_read_proto;
	case BPF_FUNC_probe_write_user:
		return bpf_get_probe_write_proto();
	case BPF_FUNC_current_task_under_cgroup:
		return &bpf_current_task_under_cgroup_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	case BPF_FUNC_probe_read_str:
		return &bpf_probe_read_str_proto;
#ifdef CONFIG_CGROUPS
	case BPF_FUNC_get_current_cgroup_id:
		return &bpf_get_current_cgroup_id_proto;
#endif
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto;
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto;
	case BPF_FUNC_perf_event_read_value:
		return &bpf_perf_event_read_value_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
	case BPF_FUNC_override_return:
		return &bpf_override_return_proto;
#endif
	default:
		return tracing_func_proto(func_id, prog);
	}
}

/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
					const struct bpf_prog *prog,
					struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= sizeof(struct pt_regs))
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	/*
	 * Assertion for 32 bit to make sure last 8 byte access
	 * (BPF_DW) to the last 4 byte member is disallowed.
	 */
	if (off + size > sizeof(struct pt_regs))
		return false;

	return true;
}
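
/*
 * Illustrative sketch only: a kprobe program sees its context as
 * struct pt_regs *, so it reads probed-function arguments straight from
 * the register file, e.g. on x86_64
 *
 *	struct pt_regs *regs = ctx;
 *	unsigned long arg1 = regs->di;
 *
 * (commonly wrapped by the PT_REGS_PARM*() convenience macros shipped
 * with the BPF selftests/libbpf headers).
 */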

const struct bpf_verifier_ops kprobe_verifier_ops = {
	.get_func_proto = kprobe_prog_func_proto,
	.is_valid_access = kprobe_prog_is_valid_access,
};

const struct bpf_prog_ops kprobe_prog_ops = {
};

BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
{
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;

	/*
	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
	 * from there and call the same bpf_perf_event_output() helper inline.
	 */
	return ____bpf_perf_event_output(regs, map, flags, data, size);
}

static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
	.func = bpf_perf_event_output_tp,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_MEM,
	.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};

BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags)
{
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;

	/*
	 * Same comment as in bpf_perf_event_output_tp(), only that this time
	 * the other helper's function body cannot be inlined due to being
	 * external, thus we need to call raw helper function.
	 */
	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
			       flags, 0, 0);
}

static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
	.func = bpf_get_stackid_tp,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
	   u64, flags)
{
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;

	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
			     (unsigned long) size, flags, 0);
}

static const struct bpf_func_proto bpf_get_stack_proto_tp = {
	.func = bpf_get_stack_tp,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_PTR_TO_UNINIT_MEM,
	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
	.arg4_type = ARG_ANYTHING,
};

static const struct bpf_func_proto *
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto_tp;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto_tp;
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto_tp;
	default:
		return tracing_func_proto(func_id, prog);
	}
}

static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				    const struct bpf_prog *prog,
				    struct bpf_insn_access_aux *info)
{
	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
	return true;
}

const struct bpf_verifier_ops tracepoint_verifier_ops = {
	.get_func_proto = tp_prog_func_proto,
	.is_valid_access = tp_prog_is_valid_access,
};

const struct bpf_prog_ops tracepoint_prog_ops = {
};
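
/*
 * Illustrative sketch only: a BPF_PROG_TYPE_PERF_EVENT program typically
 * uses the helper below to scale counters that were time-multiplexed,
 * roughly as
 *
 *	struct bpf_perf_event_value v;
 *	if (!bpf_perf_prog_read_value(ctx, &v, sizeof(v)))
 *		scaled = v.counter * v.enabled / v.running;
 *
 * where "scaled" is a hypothetical local variable.
 */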

BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
	   struct bpf_perf_event_value *, buf, u32, size)
{
	int err = -EINVAL;

	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
		goto clear;
	err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
				    &buf->running);
	if (unlikely(err))
		goto clear;
	return 0;
clear:
	memset(buf, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
	.func = bpf_perf_prog_read_value,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_PTR_TO_UNINIT_MEM,
	.arg3_type = ARG_CONST_SIZE,
};

static const struct bpf_func_proto *
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto_tp;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto_tp;
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto_tp;
	case BPF_FUNC_perf_prog_read_value:
		return &bpf_perf_prog_read_value_proto;
	default:
		return tracing_func_proto(func_id, prog);
	}
}

/*
 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
 * to avoid potential recursive reuse issue when/if tracepoints are added
 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
 */
static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);

	perf_fetch_caller_regs(regs);
	return ____bpf_perf_event_output(regs, map, flags, data, size);
}

static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
	.func = bpf_perf_event_output_raw_tp,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_MEM,
	.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};

BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   struct bpf_map *, map, u64, flags)
{
	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);

	perf_fetch_caller_regs(regs);
	/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
			       flags, 0, 0);
}

static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
	.func = bpf_get_stackid_raw_tp,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   void *, buf, u32, size, u64, flags)
{
	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);

	perf_fetch_caller_regs(regs);
	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
			     (unsigned long) size, flags, 0);
}

static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
	.func = bpf_get_stack_raw_tp,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_PTR_TO_MEM,
	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
	.arg4_type = ARG_ANYTHING,
};

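/*
 * Illustrative sketch only: a raw_tracepoint program receives the
 * tracepoint arguments as untranslated u64 values, e.g. for sched_switch
 * roughly
 *
 *	struct bpf_raw_tracepoint_args *args = ctx;
 *	struct task_struct *next = (void *)args->args[2];
 *
 * which is why the helpers above have to reconstruct pt_regs with
 * perf_fetch_caller_regs() instead of taking them from the context.
 */
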
static const struct bpf_func_proto *
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto_raw_tp;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto_raw_tp;
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto_raw_tp;
	default:
		return tracing_func_proto(func_id, prog);
	}
}

static bool raw_tp_prog_is_valid_access(int off, int size,
					enum bpf_access_type type,
					const struct bpf_prog *prog,
					struct bpf_insn_access_aux *info)
{
	/* largest tracepoint in the kernel has 12 args */
	if (off < 0 || off >= sizeof(__u64) * 12)
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	return true;
}

const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
	.get_func_proto = raw_tp_prog_func_proto,
	.is_valid_access = raw_tp_prog_is_valid_access,
};

const struct bpf_prog_ops raw_tracepoint_prog_ops = {
};

static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
				    const struct bpf_prog *prog,
				    struct bpf_insn_access_aux *info)
{
	const int size_u64 = sizeof(u64);

	if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0) {
		if (sizeof(unsigned long) != 4)
			return false;
		if (size != 8)
			return false;
		if (off % size != 4)
			return false;
	}

	switch (off) {
	case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
		bpf_ctx_record_field_size(info, size_u64);
		if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
			return false;
		break;
	case bpf_ctx_range(struct bpf_perf_event_data, addr):
		bpf_ctx_record_field_size(info, size_u64);
		if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
			return false;
		break;
	default:
		if (size != sizeof(long))
			return false;
	}

	return true;
}

static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
				      const struct bpf_insn *si,
				      struct bpf_insn *insn_buf,
				      struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_perf_event_data, sample_period):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       data), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_perf_event_data_kern, data));
		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct perf_sample_data, period, 8,
						     target_size));
		break;
	case offsetof(struct bpf_perf_event_data, addr):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       data), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_perf_event_data_kern, data));
		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct perf_sample_data, addr, 8,
						     target_size));
		break;
	default:
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       regs), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_perf_event_data_kern, regs));
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
				      si->off);
		break;
	}

	return insn - insn_buf;
}

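/*
 * Conceptually (illustrative only), the rewrite above turns a program's
 * load of ctx->sample_period into two dependent loads:
 *
 *	struct bpf_perf_event_data_kern *kctx = ctx;
 *	u64 period = kctx->data->period;
 *
 * and any pt_regs field access into kctx->regs-><field>, so the uapi
 * struct bpf_perf_event_data never has to exist in kernel memory.
 */
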
const struct bpf_verifier_ops perf_event_verifier_ops = {
	.get_func_proto = pe_prog_func_proto,
	.is_valid_access = pe_prog_is_valid_access,
	.convert_ctx_access = pe_prog_convert_ctx_access,
};

const struct bpf_prog_ops perf_event_prog_ops = {
};

static DEFINE_MUTEX(bpf_event_mutex);

#define BPF_TRACE_MAX_PROGS 64

int perf_event_attach_bpf_prog(struct perf_event *event,
			       struct bpf_prog *prog)
{
	struct bpf_prog_array __rcu *old_array;
	struct bpf_prog_array *new_array;
	int ret = -EEXIST;

	/*
	 * Kprobe override only works if the probe is on the function entry,
	 * and only if the function is on the error injection opt-in list.
	 */
	if (prog->kprobe_override &&
	    (!trace_kprobe_on_func_entry(event->tp_event) ||
	     !trace_kprobe_error_injectable(event->tp_event)))
		return -EINVAL;

	mutex_lock(&bpf_event_mutex);

	if (event->prog)
		goto unlock;

	old_array = event->tp_event->prog_array;
	if (old_array &&
	    bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
		ret = -E2BIG;
		goto unlock;
	}

	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
	if (ret < 0)
		goto unlock;

	/* set the new array to event->tp_event and set event->prog */
	event->prog = prog;
	rcu_assign_pointer(event->tp_event->prog_array, new_array);
	bpf_prog_array_free(old_array);

unlock:
	mutex_unlock(&bpf_event_mutex);
	return ret;
}

void perf_event_detach_bpf_prog(struct perf_event *event)
{
	struct bpf_prog_array __rcu *old_array;
	struct bpf_prog_array *new_array;
	int ret;

	mutex_lock(&bpf_event_mutex);

	if (!event->prog)
		goto unlock;

	old_array = event->tp_event->prog_array;
	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
	if (ret == -ENOENT)
		goto unlock;
	if (ret < 0) {
		bpf_prog_array_delete_safe(old_array, event->prog);
	} else {
		rcu_assign_pointer(event->tp_event->prog_array, new_array);
		bpf_prog_array_free(old_array);
	}

	bpf_prog_put(event->prog);
	event->prog = NULL;

unlock:
	mutex_unlock(&bpf_event_mutex);
}

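/*
 * Illustrative note (assumption about the user-space entry point): the
 * query below is reached via the PERF_EVENT_IOC_QUERY_BPF ioctl on a
 * tracepoint perf event fd; the caller passes a struct
 * perf_event_query_bpf sized for ids_len ids and gets back prog_cnt
 * plus up to ids_len attached program ids.
 */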
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
{
	struct perf_event_query_bpf __user *uquery = info;
	struct perf_event_query_bpf query = {};
	u32 *ids, prog_cnt, ids_len;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (event->attr.type != PERF_TYPE_TRACEPOINT)
		return -EINVAL;
	if (copy_from_user(&query, uquery, sizeof(query)))
		return -EFAULT;

	ids_len = query.ids_len;
	if (ids_len > BPF_TRACE_MAX_PROGS)
		return -E2BIG;
	ids = kcalloc(ids_len, sizeof(u32), GFP_USER | __GFP_NOWARN);
	if (!ids)
		return -ENOMEM;
	/*
	 * The above kcalloc returns ZERO_SIZE_PTR when ids_len = 0, which
	 * is required when user only wants to check for uquery->prog_cnt.
	 * There is no need to check for it since the case is handled
	 * gracefully in bpf_prog_array_copy_info.
	 */

	mutex_lock(&bpf_event_mutex);
	ret = bpf_prog_array_copy_info(event->tp_event->prog_array,
				       ids,
				       ids_len,
				       &prog_cnt);
	mutex_unlock(&bpf_event_mutex);

	if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) ||
	    copy_to_user(uquery->ids, ids, ids_len * sizeof(u32)))
		ret = -EFAULT;

	kfree(ids);
	return ret;
}

extern struct bpf_raw_event_map __start__bpf_raw_tp[];
extern struct bpf_raw_event_map __stop__bpf_raw_tp[];

struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
{
	struct bpf_raw_event_map *btp = __start__bpf_raw_tp;

	for (; btp < __stop__bpf_raw_tp; btp++) {
		if (!strcmp(btp->tp->name, name))
			return btp;
	}
	return NULL;
}

static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
	rcu_read_lock();
	preempt_disable();
	(void) BPF_PROG_RUN(prog, args);
	preempt_enable();
	rcu_read_unlock();
}

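/*
 * The macro block below stamps out bpf_trace_run1() ... bpf_trace_run12().
 * Illustratively, BPF_TRACE_DEFN_x(2) expands to roughly:
 *
 *	void bpf_trace_run2(struct bpf_prog *prog, u64 arg0, u64 arg1)
 *	{
 *		u64 args[2];
 *		args[0] = arg0; args[1] = arg1;
 *		__bpf_trace_run(prog, args);
 *	}
 *	EXPORT_SYMBOL_GPL(bpf_trace_run2);
 *
 * one runner per possible raw tracepoint arity.
 */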
#define UNPACK(...)			__VA_ARGS__
#define REPEAT_1(FN, DL, X, ...)	FN(X)
#define REPEAT_2(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__)
#define REPEAT_3(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__)
#define REPEAT_4(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__)
#define REPEAT_5(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__)
#define REPEAT_6(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__)
#define REPEAT_7(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__)
#define REPEAT_8(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__)
#define REPEAT_9(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__)
#define REPEAT_10(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__)
#define REPEAT_11(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__)
#define REPEAT_12(FN, DL, X, ...)	FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__)
#define REPEAT(X, FN, DL, ...)		REPEAT_##X(FN, DL, __VA_ARGS__)

#define SARG(X)		u64 arg##X
#define COPY(X)		args[X] = arg##X

#define __DL_COM	(,)
#define __DL_SEM	(;)

#define __SEQ_0_11	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11

#define BPF_TRACE_DEFN_x(x)						\
	void bpf_trace_run##x(struct bpf_prog *prog,			\
			      REPEAT(x, SARG, __DL_COM, __SEQ_0_11))	\
	{								\
		u64 args[x];						\
		REPEAT(x, COPY, __DL_SEM, __SEQ_0_11);			\
		__bpf_trace_run(prog, args);				\
	}								\
	EXPORT_SYMBOL_GPL(bpf_trace_run##x)
BPF_TRACE_DEFN_x(1);
BPF_TRACE_DEFN_x(2);
BPF_TRACE_DEFN_x(3);
BPF_TRACE_DEFN_x(4);
BPF_TRACE_DEFN_x(5);
BPF_TRACE_DEFN_x(6);
BPF_TRACE_DEFN_x(7);
BPF_TRACE_DEFN_x(8);
BPF_TRACE_DEFN_x(9);
BPF_TRACE_DEFN_x(10);
BPF_TRACE_DEFN_x(11);
BPF_TRACE_DEFN_x(12);

static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
	struct tracepoint *tp = btp->tp;

	/*
	 * check that program doesn't access arguments beyond what's
	 * available in this tracepoint
	 */
	if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64))
		return -EINVAL;

	return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog);
}

int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
	int err;

	mutex_lock(&bpf_event_mutex);
	err = __bpf_probe_register(btp, prog);
	mutex_unlock(&bpf_event_mutex);
	return err;
}

int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
{
	int err;

	mutex_lock(&bpf_event_mutex);
	err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog);
	mutex_unlock(&bpf_event_mutex);
	return err;
}

int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
			    u32 *fd_type, const char **buf,
			    u64 *probe_offset, u64 *probe_addr)
{
	bool is_tracepoint, is_syscall_tp;
	struct bpf_prog *prog;
	int flags, err = 0;

	prog = event->prog;
	if (!prog)
		return -ENOENT;

	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
		return -EOPNOTSUPP;

	*prog_id = prog->aux->id;
	flags = event->tp_event->flags;
	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
	is_syscall_tp = is_syscall_trace_event(event->tp_event);

	if (is_tracepoint || is_syscall_tp) {
		*buf = is_tracepoint ? event->tp_event->tp->name
				     : event->tp_event->name;
		*fd_type = BPF_FD_TYPE_TRACEPOINT;
		*probe_offset = 0x0;
		*probe_addr = 0x0;
	} else {
		/* kprobe/uprobe */
		err = -EOPNOTSUPP;
#ifdef CONFIG_KPROBE_EVENTS
		if (flags & TRACE_EVENT_FL_KPROBE)
			err = bpf_get_kprobe_info(event, fd_type, buf,
						  probe_offset, probe_addr,
						  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
#ifdef CONFIG_UPROBE_EVENTS
		if (flags & TRACE_EVENT_FL_UPROBE)
			err = bpf_get_uprobe_info(event, fd_type, buf,
						  probe_offset,
						  event->attr.type == PERF_TYPE_TRACEPOINT);
#endif
	}

	return err;
}