1 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com 2 * 3 * This program is free software; you can redistribute it and/or 4 * modify it under the terms of version 2 of the GNU General Public 5 * License as published by the Free Software Foundation. 6 * 7 * This program is distributed in the hope that it will be useful, but 8 * WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 10 * General Public License for more details. 11 */ 12 #include <linux/bpf.h> 13 #include <linux/syscalls.h> 14 #include <linux/slab.h> 15 #include <linux/anon_inodes.h> 16 #include <linux/file.h> 17 #include <linux/license.h> 18 #include <linux/filter.h> 19 #include <linux/version.h> 20 21 int sysctl_unprivileged_bpf_disabled __read_mostly; 22 23 static LIST_HEAD(bpf_map_types); 24 25 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 26 { 27 struct bpf_map_type_list *tl; 28 struct bpf_map *map; 29 30 list_for_each_entry(tl, &bpf_map_types, list_node) { 31 if (tl->type == attr->map_type) { 32 map = tl->ops->map_alloc(attr); 33 if (IS_ERR(map)) 34 return map; 35 map->ops = tl->ops; 36 map->map_type = attr->map_type; 37 return map; 38 } 39 } 40 return ERR_PTR(-EINVAL); 41 } 42 43 /* boot time registration of different map implementations */ 44 void bpf_register_map_type(struct bpf_map_type_list *tl) 45 { 46 list_add(&tl->list_node, &bpf_map_types); 47 } 48 49 static int bpf_map_charge_memlock(struct bpf_map *map) 50 { 51 struct user_struct *user = get_current_user(); 52 unsigned long memlock_limit; 53 54 memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 55 56 atomic_long_add(map->pages, &user->locked_vm); 57 58 if (atomic_long_read(&user->locked_vm) > memlock_limit) { 59 atomic_long_sub(map->pages, &user->locked_vm); 60 free_uid(user); 61 return -EPERM; 62 } 63 map->user = user; 64 return 0; 65 } 66 67 static void bpf_map_uncharge_memlock(struct bpf_map *map) 68 { 69 struct user_struct *user = map->user; 70 71 atomic_long_sub(map->pages, &user->locked_vm); 72 free_uid(user); 73 } 74 75 /* called from workqueue */ 76 static void bpf_map_free_deferred(struct work_struct *work) 77 { 78 struct bpf_map *map = container_of(work, struct bpf_map, work); 79 80 bpf_map_uncharge_memlock(map); 81 /* implementation dependent freeing */ 82 map->ops->map_free(map); 83 } 84 85 static void bpf_map_put_uref(struct bpf_map *map) 86 { 87 if (atomic_dec_and_test(&map->usercnt)) { 88 if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) 89 bpf_fd_array_map_clear(map); 90 } 91 } 92 93 /* decrement map refcnt and schedule it for freeing via workqueue 94 * (unrelying map implementation ops->map_free() might sleep) 95 */ 96 void bpf_map_put(struct bpf_map *map) 97 { 98 if (atomic_dec_and_test(&map->refcnt)) { 99 INIT_WORK(&map->work, bpf_map_free_deferred); 100 schedule_work(&map->work); 101 } 102 } 103 104 void bpf_map_put_with_uref(struct bpf_map *map) 105 { 106 bpf_map_put_uref(map); 107 bpf_map_put(map); 108 } 109 110 static int bpf_map_release(struct inode *inode, struct file *filp) 111 { 112 bpf_map_put_with_uref(filp->private_data); 113 return 0; 114 } 115 116 static const struct file_operations bpf_map_fops = { 117 .release = bpf_map_release, 118 }; 119 120 int bpf_map_new_fd(struct bpf_map *map) 121 { 122 return anon_inode_getfd("bpf-map", &bpf_map_fops, map, 123 O_RDWR | O_CLOEXEC); 124 } 125 126 /* helper macro to check that unused fields 'union bpf_attr' are zero */ 127 #define CHECK_ATTR(CMD) \ 128 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ 129 sizeof(attr->CMD##_LAST_FIELD), 0, \ 130 sizeof(*attr) - \ 131 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ 132 sizeof(attr->CMD##_LAST_FIELD)) != NULL 133 134 #define BPF_MAP_CREATE_LAST_FIELD max_entries 135 /* called via syscall */ 136 static int map_create(union bpf_attr *attr) 137 { 138 struct bpf_map *map; 139 int err; 140 141 err = CHECK_ATTR(BPF_MAP_CREATE); 142 if (err) 143 return -EINVAL; 144 145 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ 146 map = find_and_alloc_map(attr); 147 if (IS_ERR(map)) 148 return PTR_ERR(map); 149 150 atomic_set(&map->refcnt, 1); 151 atomic_set(&map->usercnt, 1); 152 153 err = bpf_map_charge_memlock(map); 154 if (err) 155 goto free_map; 156 157 err = bpf_map_new_fd(map); 158 if (err < 0) 159 /* failed to allocate fd */ 160 goto free_map; 161 162 return err; 163 164 free_map: 165 map->ops->map_free(map); 166 return err; 167 } 168 169 /* if error is returned, fd is released. 170 * On success caller should complete fd access with matching fdput() 171 */ 172 struct bpf_map *__bpf_map_get(struct fd f) 173 { 174 if (!f.file) 175 return ERR_PTR(-EBADF); 176 if (f.file->f_op != &bpf_map_fops) { 177 fdput(f); 178 return ERR_PTR(-EINVAL); 179 } 180 181 return f.file->private_data; 182 } 183 184 void bpf_map_inc(struct bpf_map *map, bool uref) 185 { 186 atomic_inc(&map->refcnt); 187 if (uref) 188 atomic_inc(&map->usercnt); 189 } 190 191 struct bpf_map *bpf_map_get_with_uref(u32 ufd) 192 { 193 struct fd f = fdget(ufd); 194 struct bpf_map *map; 195 196 map = __bpf_map_get(f); 197 if (IS_ERR(map)) 198 return map; 199 200 bpf_map_inc(map, true); 201 fdput(f); 202 203 return map; 204 } 205 206 /* helper to convert user pointers passed inside __aligned_u64 fields */ 207 static void __user *u64_to_ptr(__u64 val) 208 { 209 return (void __user *) (unsigned long) val; 210 } 211 212 /* last field in 'union bpf_attr' used by this command */ 213 #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value 214 215 static int map_lookup_elem(union bpf_attr *attr) 216 { 217 void __user *ukey = u64_to_ptr(attr->key); 218 void __user *uvalue = u64_to_ptr(attr->value); 219 int ufd = attr->map_fd; 220 struct bpf_map *map; 221 void *key, *value, *ptr; 222 struct fd f; 223 int err; 224 225 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 226 return -EINVAL; 227 228 f = fdget(ufd); 229 map = __bpf_map_get(f); 230 if (IS_ERR(map)) 231 return PTR_ERR(map); 232 233 err = -ENOMEM; 234 key = kmalloc(map->key_size, GFP_USER); 235 if (!key) 236 goto err_put; 237 238 err = -EFAULT; 239 if (copy_from_user(key, ukey, map->key_size) != 0) 240 goto free_key; 241 242 err = -ENOMEM; 243 value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); 244 if (!value) 245 goto free_key; 246 247 rcu_read_lock(); 248 ptr = map->ops->map_lookup_elem(map, key); 249 if (ptr) 250 memcpy(value, ptr, map->value_size); 251 rcu_read_unlock(); 252 253 err = -ENOENT; 254 if (!ptr) 255 goto free_value; 256 257 err = -EFAULT; 258 if (copy_to_user(uvalue, value, map->value_size) != 0) 259 goto free_value; 260 261 err = 0; 262 263 free_value: 264 kfree(value); 265 free_key: 266 kfree(key); 267 err_put: 268 fdput(f); 269 return err; 270 } 271 272 #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags 273 274 static int map_update_elem(union bpf_attr *attr) 275 { 276 void __user *ukey = u64_to_ptr(attr->key); 277 void __user *uvalue = u64_to_ptr(attr->value); 278 int ufd = attr->map_fd; 279 struct bpf_map *map; 280 void *key, *value; 281 struct fd f; 282 int err; 283 284 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) 285 return -EINVAL; 286 287 f = fdget(ufd); 288 map = __bpf_map_get(f); 289 if (IS_ERR(map)) 290 return PTR_ERR(map); 291 292 err = -ENOMEM; 293 key = kmalloc(map->key_size, GFP_USER); 294 if (!key) 295 goto err_put; 296 297 err = -EFAULT; 298 if (copy_from_user(key, ukey, map->key_size) != 0) 299 goto free_key; 300 301 err = -ENOMEM; 302 value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); 303 if (!value) 304 goto free_key; 305 306 err = -EFAULT; 307 if (copy_from_user(value, uvalue, map->value_size) != 0) 308 goto free_value; 309 310 /* eBPF program that use maps are running under rcu_read_lock(), 311 * therefore all map accessors rely on this fact, so do the same here 312 */ 313 rcu_read_lock(); 314 err = map->ops->map_update_elem(map, key, value, attr->flags); 315 rcu_read_unlock(); 316 317 free_value: 318 kfree(value); 319 free_key: 320 kfree(key); 321 err_put: 322 fdput(f); 323 return err; 324 } 325 326 #define BPF_MAP_DELETE_ELEM_LAST_FIELD key 327 328 static int map_delete_elem(union bpf_attr *attr) 329 { 330 void __user *ukey = u64_to_ptr(attr->key); 331 int ufd = attr->map_fd; 332 struct bpf_map *map; 333 struct fd f; 334 void *key; 335 int err; 336 337 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) 338 return -EINVAL; 339 340 f = fdget(ufd); 341 map = __bpf_map_get(f); 342 if (IS_ERR(map)) 343 return PTR_ERR(map); 344 345 err = -ENOMEM; 346 key = kmalloc(map->key_size, GFP_USER); 347 if (!key) 348 goto err_put; 349 350 err = -EFAULT; 351 if (copy_from_user(key, ukey, map->key_size) != 0) 352 goto free_key; 353 354 rcu_read_lock(); 355 err = map->ops->map_delete_elem(map, key); 356 rcu_read_unlock(); 357 358 free_key: 359 kfree(key); 360 err_put: 361 fdput(f); 362 return err; 363 } 364 365 /* last field in 'union bpf_attr' used by this command */ 366 #define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key 367 368 static int map_get_next_key(union bpf_attr *attr) 369 { 370 void __user *ukey = u64_to_ptr(attr->key); 371 void __user *unext_key = u64_to_ptr(attr->next_key); 372 int ufd = attr->map_fd; 373 struct bpf_map *map; 374 void *key, *next_key; 375 struct fd f; 376 int err; 377 378 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) 379 return -EINVAL; 380 381 f = fdget(ufd); 382 map = __bpf_map_get(f); 383 if (IS_ERR(map)) 384 return PTR_ERR(map); 385 386 err = -ENOMEM; 387 key = kmalloc(map->key_size, GFP_USER); 388 if (!key) 389 goto err_put; 390 391 err = -EFAULT; 392 if (copy_from_user(key, ukey, map->key_size) != 0) 393 goto free_key; 394 395 err = -ENOMEM; 396 next_key = kmalloc(map->key_size, GFP_USER); 397 if (!next_key) 398 goto free_key; 399 400 rcu_read_lock(); 401 err = map->ops->map_get_next_key(map, key, next_key); 402 rcu_read_unlock(); 403 if (err) 404 goto free_next_key; 405 406 err = -EFAULT; 407 if (copy_to_user(unext_key, next_key, map->key_size) != 0) 408 goto free_next_key; 409 410 err = 0; 411 412 free_next_key: 413 kfree(next_key); 414 free_key: 415 kfree(key); 416 err_put: 417 fdput(f); 418 return err; 419 } 420 421 static LIST_HEAD(bpf_prog_types); 422 423 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) 424 { 425 struct bpf_prog_type_list *tl; 426 427 list_for_each_entry(tl, &bpf_prog_types, list_node) { 428 if (tl->type == type) { 429 prog->aux->ops = tl->ops; 430 prog->type = type; 431 return 0; 432 } 433 } 434 435 return -EINVAL; 436 } 437 438 void bpf_register_prog_type(struct bpf_prog_type_list *tl) 439 { 440 list_add(&tl->list_node, &bpf_prog_types); 441 } 442 443 /* fixup insn->imm field of bpf_call instructions: 444 * if (insn->imm == BPF_FUNC_map_lookup_elem) 445 * insn->imm = bpf_map_lookup_elem - __bpf_call_base; 446 * else if (insn->imm == BPF_FUNC_map_update_elem) 447 * insn->imm = bpf_map_update_elem - __bpf_call_base; 448 * else ... 449 * 450 * this function is called after eBPF program passed verification 451 */ 452 static void fixup_bpf_calls(struct bpf_prog *prog) 453 { 454 const struct bpf_func_proto *fn; 455 int i; 456 457 for (i = 0; i < prog->len; i++) { 458 struct bpf_insn *insn = &prog->insnsi[i]; 459 460 if (insn->code == (BPF_JMP | BPF_CALL)) { 461 /* we reach here when program has bpf_call instructions 462 * and it passed bpf_check(), means that 463 * ops->get_func_proto must have been supplied, check it 464 */ 465 BUG_ON(!prog->aux->ops->get_func_proto); 466 467 if (insn->imm == BPF_FUNC_get_route_realm) 468 prog->dst_needed = 1; 469 if (insn->imm == BPF_FUNC_get_prandom_u32) 470 bpf_user_rnd_init_once(); 471 if (insn->imm == BPF_FUNC_tail_call) { 472 /* mark bpf_tail_call as different opcode 473 * to avoid conditional branch in 474 * interpeter for every normal call 475 * and to prevent accidental JITing by 476 * JIT compiler that doesn't support 477 * bpf_tail_call yet 478 */ 479 insn->imm = 0; 480 insn->code |= BPF_X; 481 continue; 482 } 483 484 fn = prog->aux->ops->get_func_proto(insn->imm); 485 /* all functions that have prototype and verifier allowed 486 * programs to call them, must be real in-kernel functions 487 */ 488 BUG_ON(!fn->func); 489 insn->imm = fn->func - __bpf_call_base; 490 } 491 } 492 } 493 494 /* drop refcnt on maps used by eBPF program and free auxilary data */ 495 static void free_used_maps(struct bpf_prog_aux *aux) 496 { 497 int i; 498 499 for (i = 0; i < aux->used_map_cnt; i++) 500 bpf_map_put(aux->used_maps[i]); 501 502 kfree(aux->used_maps); 503 } 504 505 static int bpf_prog_charge_memlock(struct bpf_prog *prog) 506 { 507 struct user_struct *user = get_current_user(); 508 unsigned long memlock_limit; 509 510 memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 511 512 atomic_long_add(prog->pages, &user->locked_vm); 513 if (atomic_long_read(&user->locked_vm) > memlock_limit) { 514 atomic_long_sub(prog->pages, &user->locked_vm); 515 free_uid(user); 516 return -EPERM; 517 } 518 prog->aux->user = user; 519 return 0; 520 } 521 522 static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) 523 { 524 struct user_struct *user = prog->aux->user; 525 526 atomic_long_sub(prog->pages, &user->locked_vm); 527 free_uid(user); 528 } 529 530 static void __prog_put_common(struct rcu_head *rcu) 531 { 532 struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); 533 534 free_used_maps(aux); 535 bpf_prog_uncharge_memlock(aux->prog); 536 bpf_prog_free(aux->prog); 537 } 538 539 /* version of bpf_prog_put() that is called after a grace period */ 540 void bpf_prog_put_rcu(struct bpf_prog *prog) 541 { 542 if (atomic_dec_and_test(&prog->aux->refcnt)) 543 call_rcu(&prog->aux->rcu, __prog_put_common); 544 } 545 546 void bpf_prog_put(struct bpf_prog *prog) 547 { 548 if (atomic_dec_and_test(&prog->aux->refcnt)) 549 __prog_put_common(&prog->aux->rcu); 550 } 551 EXPORT_SYMBOL_GPL(bpf_prog_put); 552 553 static int bpf_prog_release(struct inode *inode, struct file *filp) 554 { 555 struct bpf_prog *prog = filp->private_data; 556 557 bpf_prog_put_rcu(prog); 558 return 0; 559 } 560 561 static const struct file_operations bpf_prog_fops = { 562 .release = bpf_prog_release, 563 }; 564 565 int bpf_prog_new_fd(struct bpf_prog *prog) 566 { 567 return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, 568 O_RDWR | O_CLOEXEC); 569 } 570 571 static struct bpf_prog *__bpf_prog_get(struct fd f) 572 { 573 if (!f.file) 574 return ERR_PTR(-EBADF); 575 if (f.file->f_op != &bpf_prog_fops) { 576 fdput(f); 577 return ERR_PTR(-EINVAL); 578 } 579 580 return f.file->private_data; 581 } 582 583 /* called by sockets/tracing/seccomp before attaching program to an event 584 * pairs with bpf_prog_put() 585 */ 586 struct bpf_prog *bpf_prog_get(u32 ufd) 587 { 588 struct fd f = fdget(ufd); 589 struct bpf_prog *prog; 590 591 prog = __bpf_prog_get(f); 592 if (IS_ERR(prog)) 593 return prog; 594 595 atomic_inc(&prog->aux->refcnt); 596 fdput(f); 597 598 return prog; 599 } 600 EXPORT_SYMBOL_GPL(bpf_prog_get); 601 602 /* last field in 'union bpf_attr' used by this command */ 603 #define BPF_PROG_LOAD_LAST_FIELD kern_version 604 605 static int bpf_prog_load(union bpf_attr *attr) 606 { 607 enum bpf_prog_type type = attr->prog_type; 608 struct bpf_prog *prog; 609 int err; 610 char license[128]; 611 bool is_gpl; 612 613 if (CHECK_ATTR(BPF_PROG_LOAD)) 614 return -EINVAL; 615 616 /* copy eBPF program license from user space */ 617 if (strncpy_from_user(license, u64_to_ptr(attr->license), 618 sizeof(license) - 1) < 0) 619 return -EFAULT; 620 license[sizeof(license) - 1] = 0; 621 622 /* eBPF programs must be GPL compatible to use GPL-ed functions */ 623 is_gpl = license_is_gpl_compatible(license); 624 625 if (attr->insn_cnt >= BPF_MAXINSNS) 626 return -EINVAL; 627 628 if (type == BPF_PROG_TYPE_KPROBE && 629 attr->kern_version != LINUX_VERSION_CODE) 630 return -EINVAL; 631 632 if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) 633 return -EPERM; 634 635 /* plain bpf_prog allocation */ 636 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); 637 if (!prog) 638 return -ENOMEM; 639 640 err = bpf_prog_charge_memlock(prog); 641 if (err) 642 goto free_prog_nouncharge; 643 644 prog->len = attr->insn_cnt; 645 646 err = -EFAULT; 647 if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), 648 prog->len * sizeof(struct bpf_insn)) != 0) 649 goto free_prog; 650 651 prog->orig_prog = NULL; 652 prog->jited = 0; 653 654 atomic_set(&prog->aux->refcnt, 1); 655 prog->gpl_compatible = is_gpl ? 1 : 0; 656 657 /* find program type: socket_filter vs tracing_filter */ 658 err = find_prog_type(type, prog); 659 if (err < 0) 660 goto free_prog; 661 662 /* run eBPF verifier */ 663 err = bpf_check(&prog, attr); 664 if (err < 0) 665 goto free_used_maps; 666 667 /* fixup BPF_CALL->imm field */ 668 fixup_bpf_calls(prog); 669 670 /* eBPF program is ready to be JITed */ 671 err = bpf_prog_select_runtime(prog); 672 if (err < 0) 673 goto free_used_maps; 674 675 err = bpf_prog_new_fd(prog); 676 if (err < 0) 677 /* failed to allocate fd */ 678 goto free_used_maps; 679 680 return err; 681 682 free_used_maps: 683 free_used_maps(prog->aux); 684 free_prog: 685 bpf_prog_uncharge_memlock(prog); 686 free_prog_nouncharge: 687 bpf_prog_free(prog); 688 return err; 689 } 690 691 #define BPF_OBJ_LAST_FIELD bpf_fd 692 693 static int bpf_obj_pin(const union bpf_attr *attr) 694 { 695 if (CHECK_ATTR(BPF_OBJ)) 696 return -EINVAL; 697 698 return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); 699 } 700 701 static int bpf_obj_get(const union bpf_attr *attr) 702 { 703 if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) 704 return -EINVAL; 705 706 return bpf_obj_get_user(u64_to_ptr(attr->pathname)); 707 } 708 709 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) 710 { 711 union bpf_attr attr = {}; 712 int err; 713 714 if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) 715 return -EPERM; 716 717 if (!access_ok(VERIFY_READ, uattr, 1)) 718 return -EFAULT; 719 720 if (size > PAGE_SIZE) /* silly large */ 721 return -E2BIG; 722 723 /* If we're handed a bigger struct than we know of, 724 * ensure all the unknown bits are 0 - i.e. new 725 * user-space does not rely on any kernel feature 726 * extensions we dont know about yet. 727 */ 728 if (size > sizeof(attr)) { 729 unsigned char __user *addr; 730 unsigned char __user *end; 731 unsigned char val; 732 733 addr = (void __user *)uattr + sizeof(attr); 734 end = (void __user *)uattr + size; 735 736 for (; addr < end; addr++) { 737 err = get_user(val, addr); 738 if (err) 739 return err; 740 if (val) 741 return -E2BIG; 742 } 743 size = sizeof(attr); 744 } 745 746 /* copy attributes from user space, may be less than sizeof(bpf_attr) */ 747 if (copy_from_user(&attr, uattr, size) != 0) 748 return -EFAULT; 749 750 switch (cmd) { 751 case BPF_MAP_CREATE: 752 err = map_create(&attr); 753 break; 754 case BPF_MAP_LOOKUP_ELEM: 755 err = map_lookup_elem(&attr); 756 break; 757 case BPF_MAP_UPDATE_ELEM: 758 err = map_update_elem(&attr); 759 break; 760 case BPF_MAP_DELETE_ELEM: 761 err = map_delete_elem(&attr); 762 break; 763 case BPF_MAP_GET_NEXT_KEY: 764 err = map_get_next_key(&attr); 765 break; 766 case BPF_PROG_LOAD: 767 err = bpf_prog_load(&attr); 768 break; 769 case BPF_OBJ_PIN: 770 err = bpf_obj_pin(&attr); 771 break; 772 case BPF_OBJ_GET: 773 err = bpf_obj_get(&attr); 774 break; 775 default: 776 err = -EINVAL; 777 break; 778 } 779 780 return err; 781 } 782