1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Minimal file system backend for holding eBPF maps and programs, 4 * used by bpf(2) object pinning. 5 * 6 * Authors: 7 * 8 * Daniel Borkmann <daniel@iogearbox.net> 9 */ 10 11 #include <linux/init.h> 12 #include <linux/magic.h> 13 #include <linux/major.h> 14 #include <linux/mount.h> 15 #include <linux/namei.h> 16 #include <linux/fs.h> 17 #include <linux/fs_context.h> 18 #include <linux/fs_parser.h> 19 #include <linux/kdev_t.h> 20 #include <linux/filter.h> 21 #include <linux/bpf.h> 22 #include <linux/bpf_trace.h> 23 24 enum bpf_type { 25 BPF_TYPE_UNSPEC = 0, 26 BPF_TYPE_PROG, 27 BPF_TYPE_MAP, 28 BPF_TYPE_LINK, 29 }; 30 31 static void *bpf_any_get(void *raw, enum bpf_type type) 32 { 33 switch (type) { 34 case BPF_TYPE_PROG: 35 bpf_prog_inc(raw); 36 break; 37 case BPF_TYPE_MAP: 38 bpf_map_inc_with_uref(raw); 39 break; 40 case BPF_TYPE_LINK: 41 bpf_link_inc(raw); 42 break; 43 default: 44 WARN_ON_ONCE(1); 45 break; 46 } 47 48 return raw; 49 } 50 51 static void bpf_any_put(void *raw, enum bpf_type type) 52 { 53 switch (type) { 54 case BPF_TYPE_PROG: 55 bpf_prog_put(raw); 56 break; 57 case BPF_TYPE_MAP: 58 bpf_map_put_with_uref(raw); 59 break; 60 case BPF_TYPE_LINK: 61 bpf_link_put(raw); 62 break; 63 default: 64 WARN_ON_ONCE(1); 65 break; 66 } 67 } 68 69 static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) 70 { 71 void *raw; 72 73 raw = bpf_map_get_with_uref(ufd); 74 if (!IS_ERR(raw)) { 75 *type = BPF_TYPE_MAP; 76 return raw; 77 } 78 79 raw = bpf_prog_get(ufd); 80 if (!IS_ERR(raw)) { 81 *type = BPF_TYPE_PROG; 82 return raw; 83 } 84 85 raw = bpf_link_get_from_fd(ufd); 86 if (!IS_ERR(raw)) { 87 *type = BPF_TYPE_LINK; 88 return raw; 89 } 90 91 return ERR_PTR(-EINVAL); 92 } 93 94 static const struct inode_operations bpf_dir_iops; 95 96 static const struct inode_operations bpf_prog_iops = { }; 97 static const struct inode_operations bpf_map_iops = { }; 98 static const struct inode_operations bpf_link_iops = { }; 99 100 static struct inode *bpf_get_inode(struct super_block *sb, 101 const struct inode *dir, 102 umode_t mode) 103 { 104 struct inode *inode; 105 106 switch (mode & S_IFMT) { 107 case S_IFDIR: 108 case S_IFREG: 109 case S_IFLNK: 110 break; 111 default: 112 return ERR_PTR(-EINVAL); 113 } 114 115 inode = new_inode(sb); 116 if (!inode) 117 return ERR_PTR(-ENOSPC); 118 119 inode->i_ino = get_next_ino(); 120 inode->i_atime = current_time(inode); 121 inode->i_mtime = inode->i_atime; 122 inode->i_ctime = inode->i_atime; 123 124 inode_init_owner(inode, dir, mode); 125 126 return inode; 127 } 128 129 static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) 130 { 131 *type = BPF_TYPE_UNSPEC; 132 if (inode->i_op == &bpf_prog_iops) 133 *type = BPF_TYPE_PROG; 134 else if (inode->i_op == &bpf_map_iops) 135 *type = BPF_TYPE_MAP; 136 else if (inode->i_op == &bpf_link_iops) 137 *type = BPF_TYPE_LINK; 138 else 139 return -EACCES; 140 141 return 0; 142 } 143 144 static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, 145 struct inode *dir) 146 { 147 d_instantiate(dentry, inode); 148 dget(dentry); 149 150 dir->i_mtime = current_time(dir); 151 dir->i_ctime = dir->i_mtime; 152 } 153 154 static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 155 { 156 struct inode *inode; 157 158 inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); 159 if (IS_ERR(inode)) 160 return PTR_ERR(inode); 161 162 inode->i_op = &bpf_dir_iops; 163 inode->i_fop = &simple_dir_operations; 164 165 inc_nlink(inode); 166 inc_nlink(dir); 167 168 bpf_dentry_finalize(dentry, inode, dir); 169 return 0; 170 } 171 172 struct map_iter { 173 void *key; 174 bool done; 175 }; 176 177 static struct map_iter *map_iter(struct seq_file *m) 178 { 179 return m->private; 180 } 181 182 static struct bpf_map *seq_file_to_map(struct seq_file *m) 183 { 184 return file_inode(m->file)->i_private; 185 } 186 187 static void map_iter_free(struct map_iter *iter) 188 { 189 if (iter) { 190 kfree(iter->key); 191 kfree(iter); 192 } 193 } 194 195 static struct map_iter *map_iter_alloc(struct bpf_map *map) 196 { 197 struct map_iter *iter; 198 199 iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); 200 if (!iter) 201 goto error; 202 203 iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); 204 if (!iter->key) 205 goto error; 206 207 return iter; 208 209 error: 210 map_iter_free(iter); 211 return NULL; 212 } 213 214 static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) 215 { 216 struct bpf_map *map = seq_file_to_map(m); 217 void *key = map_iter(m)->key; 218 void *prev_key; 219 220 (*pos)++; 221 if (map_iter(m)->done) 222 return NULL; 223 224 if (unlikely(v == SEQ_START_TOKEN)) 225 prev_key = NULL; 226 else 227 prev_key = key; 228 229 rcu_read_lock(); 230 if (map->ops->map_get_next_key(map, prev_key, key)) { 231 map_iter(m)->done = true; 232 key = NULL; 233 } 234 rcu_read_unlock(); 235 return key; 236 } 237 238 static void *map_seq_start(struct seq_file *m, loff_t *pos) 239 { 240 if (map_iter(m)->done) 241 return NULL; 242 243 return *pos ? map_iter(m)->key : SEQ_START_TOKEN; 244 } 245 246 static void map_seq_stop(struct seq_file *m, void *v) 247 { 248 } 249 250 static int map_seq_show(struct seq_file *m, void *v) 251 { 252 struct bpf_map *map = seq_file_to_map(m); 253 void *key = map_iter(m)->key; 254 255 if (unlikely(v == SEQ_START_TOKEN)) { 256 seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); 257 seq_puts(m, "# WARNING!! The output format will change\n"); 258 } else { 259 map->ops->map_seq_show_elem(map, key, m); 260 } 261 262 return 0; 263 } 264 265 static const struct seq_operations bpffs_map_seq_ops = { 266 .start = map_seq_start, 267 .next = map_seq_next, 268 .show = map_seq_show, 269 .stop = map_seq_stop, 270 }; 271 272 static int bpffs_map_open(struct inode *inode, struct file *file) 273 { 274 struct bpf_map *map = inode->i_private; 275 struct map_iter *iter; 276 struct seq_file *m; 277 int err; 278 279 iter = map_iter_alloc(map); 280 if (!iter) 281 return -ENOMEM; 282 283 err = seq_open(file, &bpffs_map_seq_ops); 284 if (err) { 285 map_iter_free(iter); 286 return err; 287 } 288 289 m = file->private_data; 290 m->private = iter; 291 292 return 0; 293 } 294 295 static int bpffs_map_release(struct inode *inode, struct file *file) 296 { 297 struct seq_file *m = file->private_data; 298 299 map_iter_free(map_iter(m)); 300 301 return seq_release(inode, file); 302 } 303 304 /* bpffs_map_fops should only implement the basic 305 * read operation for a BPF map. The purpose is to 306 * provide a simple user intuitive way to do 307 * "cat bpffs/pathto/a-pinned-map". 308 * 309 * Other operations (e.g. write, lookup...) should be realized by 310 * the userspace tools (e.g. bpftool) through the 311 * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update 312 * interface. 313 */ 314 static const struct file_operations bpffs_map_fops = { 315 .open = bpffs_map_open, 316 .read = seq_read, 317 .release = bpffs_map_release, 318 }; 319 320 static int bpffs_obj_open(struct inode *inode, struct file *file) 321 { 322 return -EIO; 323 } 324 325 static const struct file_operations bpffs_obj_fops = { 326 .open = bpffs_obj_open, 327 }; 328 329 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, 330 const struct inode_operations *iops, 331 const struct file_operations *fops) 332 { 333 struct inode *dir = dentry->d_parent->d_inode; 334 struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); 335 if (IS_ERR(inode)) 336 return PTR_ERR(inode); 337 338 inode->i_op = iops; 339 inode->i_fop = fops; 340 inode->i_private = raw; 341 342 bpf_dentry_finalize(dentry, inode, dir); 343 return 0; 344 } 345 346 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) 347 { 348 return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, 349 &bpffs_obj_fops); 350 } 351 352 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) 353 { 354 struct bpf_map *map = arg; 355 356 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, 357 bpf_map_support_seq_show(map) ? 358 &bpffs_map_fops : &bpffs_obj_fops); 359 } 360 361 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) 362 { 363 struct bpf_link *link = arg; 364 365 return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, 366 bpf_link_is_iter(link) ? 367 &bpf_iter_fops : &bpffs_obj_fops); 368 } 369 370 static struct dentry * 371 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) 372 { 373 /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future 374 * extensions. 375 */ 376 if (strchr(dentry->d_name.name, '.')) 377 return ERR_PTR(-EPERM); 378 379 return simple_lookup(dir, dentry, flags); 380 } 381 382 static int bpf_symlink(struct inode *dir, struct dentry *dentry, 383 const char *target) 384 { 385 char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); 386 struct inode *inode; 387 388 if (!link) 389 return -ENOMEM; 390 391 inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); 392 if (IS_ERR(inode)) { 393 kfree(link); 394 return PTR_ERR(inode); 395 } 396 397 inode->i_op = &simple_symlink_inode_operations; 398 inode->i_link = link; 399 400 bpf_dentry_finalize(dentry, inode, dir); 401 return 0; 402 } 403 404 static const struct inode_operations bpf_dir_iops = { 405 .lookup = bpf_lookup, 406 .mkdir = bpf_mkdir, 407 .symlink = bpf_symlink, 408 .rmdir = simple_rmdir, 409 .rename = simple_rename, 410 .link = simple_link, 411 .unlink = simple_unlink, 412 }; 413 414 static int bpf_obj_do_pin(const char __user *pathname, void *raw, 415 enum bpf_type type) 416 { 417 struct dentry *dentry; 418 struct inode *dir; 419 struct path path; 420 umode_t mode; 421 int ret; 422 423 dentry = user_path_create(AT_FDCWD, pathname, &path, 0); 424 if (IS_ERR(dentry)) 425 return PTR_ERR(dentry); 426 427 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); 428 429 ret = security_path_mknod(&path, dentry, mode, 0); 430 if (ret) 431 goto out; 432 433 dir = d_inode(path.dentry); 434 if (dir->i_op != &bpf_dir_iops) { 435 ret = -EPERM; 436 goto out; 437 } 438 439 switch (type) { 440 case BPF_TYPE_PROG: 441 ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); 442 break; 443 case BPF_TYPE_MAP: 444 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); 445 break; 446 case BPF_TYPE_LINK: 447 ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); 448 break; 449 default: 450 ret = -EPERM; 451 } 452 out: 453 done_path_create(&path, dentry); 454 return ret; 455 } 456 457 int bpf_obj_pin_user(u32 ufd, const char __user *pathname) 458 { 459 enum bpf_type type; 460 void *raw; 461 int ret; 462 463 raw = bpf_fd_probe_obj(ufd, &type); 464 if (IS_ERR(raw)) 465 return PTR_ERR(raw); 466 467 ret = bpf_obj_do_pin(pathname, raw, type); 468 if (ret != 0) 469 bpf_any_put(raw, type); 470 471 return ret; 472 } 473 474 static void *bpf_obj_do_get(const char __user *pathname, 475 enum bpf_type *type, int flags) 476 { 477 struct inode *inode; 478 struct path path; 479 void *raw; 480 int ret; 481 482 ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); 483 if (ret) 484 return ERR_PTR(ret); 485 486 inode = d_backing_inode(path.dentry); 487 ret = inode_permission(inode, ACC_MODE(flags)); 488 if (ret) 489 goto out; 490 491 ret = bpf_inode_type(inode, type); 492 if (ret) 493 goto out; 494 495 raw = bpf_any_get(inode->i_private, *type); 496 if (!IS_ERR(raw)) 497 touch_atime(&path); 498 499 path_put(&path); 500 return raw; 501 out: 502 path_put(&path); 503 return ERR_PTR(ret); 504 } 505 506 int bpf_obj_get_user(const char __user *pathname, int flags) 507 { 508 enum bpf_type type = BPF_TYPE_UNSPEC; 509 int f_flags; 510 void *raw; 511 int ret; 512 513 f_flags = bpf_get_file_flag(flags); 514 if (f_flags < 0) 515 return f_flags; 516 517 raw = bpf_obj_do_get(pathname, &type, f_flags); 518 if (IS_ERR(raw)) 519 return PTR_ERR(raw); 520 521 if (type == BPF_TYPE_PROG) 522 ret = bpf_prog_new_fd(raw); 523 else if (type == BPF_TYPE_MAP) 524 ret = bpf_map_new_fd(raw, f_flags); 525 else if (type == BPF_TYPE_LINK) 526 ret = bpf_link_new_fd(raw); 527 else 528 return -ENOENT; 529 530 if (ret < 0) 531 bpf_any_put(raw, type); 532 return ret; 533 } 534 535 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) 536 { 537 struct bpf_prog *prog; 538 int ret = inode_permission(inode, MAY_READ); 539 if (ret) 540 return ERR_PTR(ret); 541 542 if (inode->i_op == &bpf_map_iops) 543 return ERR_PTR(-EINVAL); 544 if (inode->i_op == &bpf_link_iops) 545 return ERR_PTR(-EINVAL); 546 if (inode->i_op != &bpf_prog_iops) 547 return ERR_PTR(-EACCES); 548 549 prog = inode->i_private; 550 551 ret = security_bpf_prog(prog); 552 if (ret < 0) 553 return ERR_PTR(ret); 554 555 if (!bpf_prog_get_ok(prog, &type, false)) 556 return ERR_PTR(-EINVAL); 557 558 bpf_prog_inc(prog); 559 return prog; 560 } 561 562 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) 563 { 564 struct bpf_prog *prog; 565 struct path path; 566 int ret = kern_path(name, LOOKUP_FOLLOW, &path); 567 if (ret) 568 return ERR_PTR(ret); 569 prog = __get_prog_inode(d_backing_inode(path.dentry), type); 570 if (!IS_ERR(prog)) 571 touch_atime(&path); 572 path_put(&path); 573 return prog; 574 } 575 EXPORT_SYMBOL(bpf_prog_get_type_path); 576 577 /* 578 * Display the mount options in /proc/mounts. 579 */ 580 static int bpf_show_options(struct seq_file *m, struct dentry *root) 581 { 582 umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; 583 584 if (mode != S_IRWXUGO) 585 seq_printf(m, ",mode=%o", mode); 586 return 0; 587 } 588 589 static void bpf_free_inode(struct inode *inode) 590 { 591 enum bpf_type type; 592 593 if (S_ISLNK(inode->i_mode)) 594 kfree(inode->i_link); 595 if (!bpf_inode_type(inode, &type)) 596 bpf_any_put(inode->i_private, type); 597 free_inode_nonrcu(inode); 598 } 599 600 static const struct super_operations bpf_super_ops = { 601 .statfs = simple_statfs, 602 .drop_inode = generic_delete_inode, 603 .show_options = bpf_show_options, 604 .free_inode = bpf_free_inode, 605 }; 606 607 enum { 608 OPT_MODE, 609 }; 610 611 static const struct fs_parameter_spec bpf_fs_parameters[] = { 612 fsparam_u32oct ("mode", OPT_MODE), 613 {} 614 }; 615 616 struct bpf_mount_opts { 617 umode_t mode; 618 }; 619 620 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) 621 { 622 struct bpf_mount_opts *opts = fc->fs_private; 623 struct fs_parse_result result; 624 int opt; 625 626 opt = fs_parse(fc, bpf_fs_parameters, param, &result); 627 if (opt < 0) 628 /* We might like to report bad mount options here, but 629 * traditionally we've ignored all mount options, so we'd 630 * better continue to ignore non-existing options for bpf. 631 */ 632 return opt == -ENOPARAM ? 0 : opt; 633 634 switch (opt) { 635 case OPT_MODE: 636 opts->mode = result.uint_32 & S_IALLUGO; 637 break; 638 } 639 640 return 0; 641 } 642 643 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) 644 { 645 static const struct tree_descr bpf_rfiles[] = { { "" } }; 646 struct bpf_mount_opts *opts = fc->fs_private; 647 struct inode *inode; 648 int ret; 649 650 ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); 651 if (ret) 652 return ret; 653 654 sb->s_op = &bpf_super_ops; 655 656 inode = sb->s_root->d_inode; 657 inode->i_op = &bpf_dir_iops; 658 inode->i_mode &= ~S_IALLUGO; 659 inode->i_mode |= S_ISVTX | opts->mode; 660 661 return 0; 662 } 663 664 static int bpf_get_tree(struct fs_context *fc) 665 { 666 return get_tree_nodev(fc, bpf_fill_super); 667 } 668 669 static void bpf_free_fc(struct fs_context *fc) 670 { 671 kfree(fc->fs_private); 672 } 673 674 static const struct fs_context_operations bpf_context_ops = { 675 .free = bpf_free_fc, 676 .parse_param = bpf_parse_param, 677 .get_tree = bpf_get_tree, 678 }; 679 680 /* 681 * Set up the filesystem mount context. 682 */ 683 static int bpf_init_fs_context(struct fs_context *fc) 684 { 685 struct bpf_mount_opts *opts; 686 687 opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); 688 if (!opts) 689 return -ENOMEM; 690 691 opts->mode = S_IRWXUGO; 692 693 fc->fs_private = opts; 694 fc->ops = &bpf_context_ops; 695 return 0; 696 } 697 698 static struct file_system_type bpf_fs_type = { 699 .owner = THIS_MODULE, 700 .name = "bpf", 701 .init_fs_context = bpf_init_fs_context, 702 .parameters = bpf_fs_parameters, 703 .kill_sb = kill_litter_super, 704 }; 705 706 static int __init bpf_init(void) 707 { 708 int ret; 709 710 ret = sysfs_create_mount_point(fs_kobj, "bpf"); 711 if (ret) 712 return ret; 713 714 ret = register_filesystem(&bpf_fs_type); 715 if (ret) 716 sysfs_remove_mount_point(fs_kobj, "bpf"); 717 718 return ret; 719 } 720 fs_initcall(bpf_init); 721