// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>

/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
};
const struct bpf_prog_ops bpf_extension_prog_ops = {
};

/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)

static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];

/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);

void *bpf_jit_alloc_exec_page(void)
{
	void *image;

	image = bpf_jit_alloc_exec(PAGE_SIZE);
	if (!image)
		return NULL;

	set_vm_flush_reset_perms(image);
	/* Keep image as writeable. The alternative is to keep flipping ro/rw
	 * every time a new program is attached or detached.
	 */
	set_memory_x((long)image, 1);
	return image;
}

void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
	ksym->start = (unsigned long) data;
	ksym->end = ksym->start + PAGE_SIZE;
	bpf_ksym_add(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, false, ksym->name);
}

void bpf_image_ksym_del(struct bpf_ksym *ksym)
{
	bpf_ksym_del(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, true, ksym->name);
}

static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
	struct bpf_trampoline *tr;
	struct hlist_head *head;
	int i;

	mutex_lock(&trampoline_mutex);
	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
	hlist_for_each_entry(tr, head, hlist) {
		if (tr->key == key) {
			refcount_inc(&tr->refcnt);
			goto out;
		}
	}
	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
	if (!tr)
		goto out;

	tr->key = key;
	INIT_HLIST_NODE(&tr->hlist);
	hlist_add_head(&tr->hlist, head);
	refcount_set(&tr->refcnt, 1);
	mutex_init(&tr->mutex);
	for (i = 0; i < BPF_TRAMP_MAX; i++)
		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
out:
	mutex_unlock(&trampoline_mutex);
	return tr;
}

static int is_ftrace_location(void *ip)
{
	long addr;

	addr = ftrace_location((long)ip);
	if (!addr)
		return 0;
	if (WARN_ON_ONCE(addr != (long)ip))
		return -EFAULT;
	return 1;
}

static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
	return ret;
}

static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
	return ret;
}

/* first time registering */
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
	void *ip = tr->func.addr;
	int ret;

	ret = is_ftrace_location(ip);
	if (ret < 0)
		return ret;
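	/* ret is 1 when ftrace manages this ip (so the ftrace direct call API
	 * must be used below) and 0 when the ip can be patched directly with
	 * bpf_arch_text_poke().
	 */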
	tr->func.ftrace_managed = ret;

	if (tr->func.ftrace_managed)
		ret = register_ftrace_direct((long)ip, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
	return ret;
}

static struct bpf_tramp_progs *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
{
	const struct bpf_prog_aux *aux;
	struct bpf_tramp_progs *tprogs;
	struct bpf_prog **progs;
	int kind;

	*total = 0;
	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
	if (!tprogs)
		return ERR_PTR(-ENOMEM);

	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
		tprogs[kind].nr_progs = tr->progs_cnt[kind];
		*total += tr->progs_cnt[kind];
		progs = tprogs[kind].progs;

		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
			*progs++ = aux->prog;
	}
	return tprogs;
}

static void __bpf_tramp_image_put_deferred(struct work_struct *work)
{
	struct bpf_tramp_image *im;

	im = container_of(work, struct bpf_tramp_image, work);
	bpf_image_ksym_del(&im->ksym);
	bpf_jit_free_exec(im->image);
	bpf_jit_uncharge_modmem(1);
	percpu_ref_exit(&im->pcref);
	kfree_rcu(im, rcu);
}

/* callback, fexit step 3 or fentry step 2 */
static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
{
	struct bpf_tramp_image *im;

	im = container_of(rcu, struct bpf_tramp_image, rcu);
	INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
	schedule_work(&im->work);
}

/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
static void __bpf_tramp_image_release(struct percpu_ref *pcref)
{
	struct bpf_tramp_image *im;

	im = container_of(pcref, struct bpf_tramp_image, pcref);
	call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

/* callback, fexit or fentry step 1 */
static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
{
	struct bpf_tramp_image *im;

	im = container_of(rcu, struct bpf_tramp_image, rcu);
	if (im->ip_after_call)
		/* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
		percpu_ref_kill(&im->pcref);
	else
		/* the case of fentry trampoline */
		call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

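/* Start tearing down a trampoline image. Depending on which program types
 * were attached, the teardown goes through the "step 1/2/3" callbacks above
 * before the image memory is finally freed in __bpf_tramp_image_put_deferred().
 */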
static void bpf_tramp_image_put(struct bpf_tramp_image *im)
{
	/* The trampoline image that calls original function is using:
	 * rcu_read_lock_trace to protect sleepable bpf progs
	 * rcu_read_lock to protect normal bpf progs
	 * percpu_ref to protect trampoline itself
	 * rcu tasks to protect trampoline asm not covered by percpu_ref
	 * (which are few asm insns before __bpf_tramp_enter and
	 * after __bpf_tramp_exit)
	 *
	 * The trampoline is unreachable before bpf_tramp_image_put().
	 *
	 * First, patch the trampoline to avoid calling into fexit progs.
	 * The progs will be freed even if the original function is still
	 * executing or sleeping.
	 * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
	 * first few asm instructions to execute and call into
	 * __bpf_tramp_enter->percpu_ref_get.
	 * Then use percpu_ref_kill to wait for the trampoline and the original
	 * function to finish.
	 * Then use call_rcu_tasks() to make sure few asm insns in
	 * the trampoline epilogue are done as well.
	 *
	 * In !PREEMPT case the task that got interrupted in the first asm
	 * insns won't go through an RCU quiescent state which the
	 * percpu_ref_kill will be waiting for. Hence the first
	 * call_rcu_tasks() is not necessary.
	 */
	if (im->ip_after_call) {
		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
					     NULL, im->ip_epilogue);
		WARN_ON(err);
		if (IS_ENABLED(CONFIG_PREEMPTION))
			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
		else
			percpu_ref_kill(&im->pcref);
		return;
	}

	/* The trampoline without fexit and fmod_ret progs doesn't call original
	 * function and doesn't use percpu_ref.
	 * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * Then use call_rcu_tasks() to wait for the rest of trampoline asm
	 * and normal progs.
	 */
	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}

static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
{
	struct bpf_tramp_image *im;
	struct bpf_ksym *ksym;
	void *image;
	int err = -ENOMEM;

	im = kzalloc(sizeof(*im), GFP_KERNEL);
	if (!im)
		goto out;

	err = bpf_jit_charge_modmem(1);
	if (err)
		goto out_free_im;

	err = -ENOMEM;
	im->image = image = bpf_jit_alloc_exec_page();
	if (!image)
		goto out_uncharge;

	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
	if (err)
		goto out_free_image;

	ksym = &im->ksym;
	INIT_LIST_HEAD_RCU(&ksym->lnode);
	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
	bpf_image_ksym_add(image, ksym);
	return im;

out_free_image:
	bpf_jit_free_exec(im->image);
out_uncharge:
	bpf_jit_uncharge_modmem(1);
out_free_im:
	kfree(im);
out:
	return ERR_PTR(err);
}

static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
	struct bpf_tramp_image *im;
	struct bpf_tramp_progs *tprogs;
	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
	int err, total;

	tprogs = bpf_trampoline_get_progs(tr, &total);
	if (IS_ERR(tprogs))
		return PTR_ERR(tprogs);

	if (total == 0) {
		err = unregister_fentry(tr, tr->cur_image->image);
		bpf_tramp_image_put(tr->cur_image);
		tr->cur_image = NULL;
		tr->selector = 0;
		goto out;
	}

	im = bpf_tramp_image_alloc(tr->key, tr->selector);
	if (IS_ERR(im)) {
		err = PTR_ERR(im);
		goto out;
	}

	if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;

	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
					  &tr->func.model, flags, tprogs,
					  tr->func.addr);
	if (err < 0)
		goto out;

	WARN_ON(tr->cur_image && tr->selector == 0);
	WARN_ON(!tr->cur_image && tr->selector);
	if (tr->cur_image)
		/* progs already running at this address */
		err = modify_fentry(tr, tr->cur_image->image, im->image);
	else
		/* first time registering */
		err = register_fentry(tr, im->image);
	if (err)
		goto out;
	if (tr->cur_image)
		bpf_tramp_image_put(tr->cur_image);
	tr->cur_image = im;
	tr->selector++;
out:
	kfree(tprogs);
	return err;
}

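/* Map the program's expected_attach_type to the trampoline stage it will run
 * in. Anything other than fentry/fexit/fmod_ret/LSM is an extension program
 * that replaces the target, i.e. BPF_TRAMP_REPLACE.
 */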
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
{
	switch (prog->expected_attach_type) {
	case BPF_TRACE_FENTRY:
		return BPF_TRAMP_FENTRY;
	case BPF_MODIFY_RETURN:
		return BPF_TRAMP_MODIFY_RETURN;
	case BPF_TRACE_FEXIT:
		return BPF_TRAMP_FEXIT;
	case BPF_LSM_MAC:
		if (!prog->aux->attach_func_proto->type)
			/* The function returns void, we cannot modify its
			 * return value.
			 */
			return BPF_TRAMP_FEXIT;
		else
			return BPF_TRAMP_MODIFY_RETURN;
	default:
		return BPF_TRAMP_REPLACE;
	}
}

int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
	enum bpf_tramp_prog_type kind;
	int err = 0;
	int cnt;

	kind = bpf_attach_type_to_tramp(prog);
	mutex_lock(&tr->mutex);
	if (tr->extension_prog) {
		/* cannot attach fentry/fexit if extension prog is attached.
		 * cannot overwrite extension prog either.
		 */
		err = -EBUSY;
		goto out;
	}
	cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT];
	if (kind == BPF_TRAMP_REPLACE) {
		/* Cannot attach extension if fentry/fexit are in use. */
		if (cnt) {
			err = -EBUSY;
			goto out;
		}
		tr->extension_prog = prog;
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
					 prog->bpf_func);
		goto out;
	}
	if (cnt >= BPF_MAX_TRAMP_PROGS) {
		err = -E2BIG;
		goto out;
	}
	if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
		/* prog already linked */
		err = -EBUSY;
		goto out;
	}
	hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
	tr->progs_cnt[kind]++;
	err = bpf_trampoline_update(tr);
	if (err) {
		hlist_del(&prog->aux->tramp_hlist);
		tr->progs_cnt[kind]--;
	}
out:
	mutex_unlock(&tr->mutex);
	return err;
}

/* bpf_trampoline_unlink_prog() should never fail. */
int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
	enum bpf_tramp_prog_type kind;
	int err;

	kind = bpf_attach_type_to_tramp(prog);
	mutex_lock(&tr->mutex);
	if (kind == BPF_TRAMP_REPLACE) {
		WARN_ON_ONCE(!tr->extension_prog);
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
					 tr->extension_prog->bpf_func, NULL);
		tr->extension_prog = NULL;
		goto out;
	}
	hlist_del(&prog->aux->tramp_hlist);
	tr->progs_cnt[kind]--;
	err = bpf_trampoline_update(tr);
out:
	mutex_unlock(&tr->mutex);
	return err;
}

struct bpf_trampoline *bpf_trampoline_get(u64 key,
					  struct bpf_attach_target_info *tgt_info)
{
	struct bpf_trampoline *tr;

	tr = bpf_trampoline_lookup(key);
	if (!tr)
		return NULL;

	mutex_lock(&tr->mutex);
	if (tr->func.addr)
		goto out;

	memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
	tr->func.addr = (void *)tgt_info->tgt_addr;
out:
	mutex_unlock(&tr->mutex);
	return tr;
}

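/* Drop a reference on @tr. When the last reference goes away the trampoline
 * is removed from trampoline_table and freed; all programs must already have
 * been unlinked from it by then.
 */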
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
	if (!tr)
		return;
	mutex_lock(&trampoline_mutex);
	if (!refcount_dec_and_test(&tr->refcnt))
		goto out;
	WARN_ON_ONCE(mutex_is_locked(&tr->mutex));
	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY])))
		goto out;
	if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
		goto out;
	/* This code will be executed even when the last bpf_tramp_image
	 * is alive. All progs are detached from the trampoline and the
	 * trampoline image is patched with jmp into epilogue to skip
	 * fexit progs. The fentry-only trampoline will be freed via
	 * multiple rcu callbacks.
	 */
	hlist_del(&tr->hlist);
	kfree(tr);
out:
	mutex_unlock(&trampoline_mutex);
}

#define NO_START_TIME 1
static u64 notrace bpf_prog_start_time(void)
{
	u64 start = NO_START_TIME;

	if (static_branch_unlikely(&bpf_stats_enabled_key)) {
		start = sched_clock();
		if (unlikely(!start))
			start = NO_START_TIME;
	}
	return start;
}

static void notrace inc_misses_counter(struct bpf_prog *prog)
{
	struct bpf_prog_stats *stats;

	stats = this_cpu_ptr(prog->stats);
	u64_stats_update_begin(&stats->syncp);
	stats->misses++;
	u64_stats_update_end(&stats->syncp);
}

/* The logic is similar to BPF_PROG_RUN, but with an explicit
 * rcu_read_lock() and migrate_disable() which are required
 * for the trampoline. The macro is split into
 * call __bpf_prog_enter
 * call prog->bpf_func
 * call __bpf_prog_exit
 *
 * __bpf_prog_enter returns:
 * 0 - skip execution of the bpf prog
 * 1 - execute bpf prog
 * [2..MAX_U64] - execute bpf prog and record execution time.
 *     This is start time.
 */
u64 notrace __bpf_prog_enter(struct bpf_prog *prog)
	__acquires(RCU)
{
	rcu_read_lock();
	migrate_disable();
	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}
	return bpf_prog_start_time();
}

static void notrace update_prog_stats(struct bpf_prog *prog,
				      u64 start)
{
	struct bpf_prog_stats *stats;

	if (static_branch_unlikely(&bpf_stats_enabled_key) &&
	    /* static_key could be enabled in __bpf_prog_enter*
	     * and disabled in __bpf_prog_exit*.
	     * And vice versa.
	     * Hence check that 'start' is valid.
	     */
	    start > NO_START_TIME) {
		stats = this_cpu_ptr(prog->stats);
		u64_stats_update_begin(&stats->syncp);
		stats->cnt++;
		stats->nsecs += sched_clock() - start;
		u64_stats_update_end(&stats->syncp);
	}
}

void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
	__releases(RCU)
{
	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock();
}

u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog)
{
	rcu_read_lock_trace();
	migrate_disable();
	might_fault();
	if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) {
		inc_misses_counter(prog);
		return 0;
	}
	return bpf_prog_start_time();
}

void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
{
	update_prog_stats(prog, start);
	__this_cpu_dec(*(prog->active));
	migrate_enable();
	rcu_read_unlock_trace();
}

void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
	percpu_ref_get(&tr->pcref);
}

void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
{
	percpu_ref_put(&tr->pcref);
}

int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
			    const struct btf_func_model *m, u32 flags,
			    struct bpf_tramp_progs *tprogs,
			    void *orig_call)
{
	return -ENOTSUPP;
}

static int __init init_trampolines(void)
{
	int i;

	for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&trampoline_table[i]);
	return 0;
}
late_initcall(init_trampolines);