1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Minimal file system backend for holding eBPF maps and programs, 4 * used by bpf(2) object pinning. 5 * 6 * Authors: 7 * 8 * Daniel Borkmann <daniel@iogearbox.net> 9 */ 10 11 #include <linux/init.h> 12 #include <linux/magic.h> 13 #include <linux/major.h> 14 #include <linux/mount.h> 15 #include <linux/namei.h> 16 #include <linux/fs.h> 17 #include <linux/fs_context.h> 18 #include <linux/fs_parser.h> 19 #include <linux/kdev_t.h> 20 #include <linux/filter.h> 21 #include <linux/bpf.h> 22 #include <linux/bpf_trace.h> 23 24 enum bpf_type { 25 BPF_TYPE_UNSPEC = 0, 26 BPF_TYPE_PROG, 27 BPF_TYPE_MAP, 28 BPF_TYPE_LINK, 29 }; 30 31 static void *bpf_any_get(void *raw, enum bpf_type type) 32 { 33 switch (type) { 34 case BPF_TYPE_PROG: 35 bpf_prog_inc(raw); 36 break; 37 case BPF_TYPE_MAP: 38 bpf_map_inc_with_uref(raw); 39 break; 40 case BPF_TYPE_LINK: 41 bpf_link_inc(raw); 42 break; 43 default: 44 WARN_ON_ONCE(1); 45 break; 46 } 47 48 return raw; 49 } 50 51 static void bpf_any_put(void *raw, enum bpf_type type) 52 { 53 switch (type) { 54 case BPF_TYPE_PROG: 55 bpf_prog_put(raw); 56 break; 57 case BPF_TYPE_MAP: 58 bpf_map_put_with_uref(raw); 59 break; 60 case BPF_TYPE_LINK: 61 bpf_link_put(raw); 62 break; 63 default: 64 WARN_ON_ONCE(1); 65 break; 66 } 67 } 68 69 static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) 70 { 71 void *raw; 72 73 raw = bpf_map_get_with_uref(ufd); 74 if (!IS_ERR(raw)) { 75 *type = BPF_TYPE_MAP; 76 return raw; 77 } 78 79 raw = bpf_prog_get(ufd); 80 if (!IS_ERR(raw)) { 81 *type = BPF_TYPE_PROG; 82 return raw; 83 } 84 85 raw = bpf_link_get_from_fd(ufd); 86 if (!IS_ERR(raw)) { 87 *type = BPF_TYPE_LINK; 88 return raw; 89 } 90 91 return ERR_PTR(-EINVAL); 92 } 93 94 static const struct inode_operations bpf_dir_iops; 95 96 static const struct inode_operations bpf_prog_iops = { }; 97 static const struct inode_operations bpf_map_iops = { }; 98 static const struct inode_operations bpf_link_iops = { }; 99 100 static struct inode *bpf_get_inode(struct super_block *sb, 101 const struct inode *dir, 102 umode_t mode) 103 { 104 struct inode *inode; 105 106 switch (mode & S_IFMT) { 107 case S_IFDIR: 108 case S_IFREG: 109 case S_IFLNK: 110 break; 111 default: 112 return ERR_PTR(-EINVAL); 113 } 114 115 inode = new_inode(sb); 116 if (!inode) 117 return ERR_PTR(-ENOSPC); 118 119 inode->i_ino = get_next_ino(); 120 inode->i_atime = current_time(inode); 121 inode->i_mtime = inode->i_atime; 122 inode->i_ctime = inode->i_atime; 123 124 inode_init_owner(inode, dir, mode); 125 126 return inode; 127 } 128 129 static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) 130 { 131 *type = BPF_TYPE_UNSPEC; 132 if (inode->i_op == &bpf_prog_iops) 133 *type = BPF_TYPE_PROG; 134 else if (inode->i_op == &bpf_map_iops) 135 *type = BPF_TYPE_MAP; 136 else if (inode->i_op == &bpf_link_iops) 137 *type = BPF_TYPE_LINK; 138 else 139 return -EACCES; 140 141 return 0; 142 } 143 144 static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, 145 struct inode *dir) 146 { 147 d_instantiate(dentry, inode); 148 dget(dentry); 149 150 dir->i_mtime = current_time(dir); 151 dir->i_ctime = dir->i_mtime; 152 } 153 154 static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 155 { 156 struct inode *inode; 157 158 inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); 159 if (IS_ERR(inode)) 160 return PTR_ERR(inode); 161 162 inode->i_op = &bpf_dir_iops; 163 inode->i_fop = &simple_dir_operations; 164 165 inc_nlink(inode); 166 inc_nlink(dir); 167 168 bpf_dentry_finalize(dentry, inode, dir); 169 return 0; 170 } 171 172 struct map_iter { 173 void *key; 174 bool done; 175 }; 176 177 static struct map_iter *map_iter(struct seq_file *m) 178 { 179 return m->private; 180 } 181 182 static struct bpf_map *seq_file_to_map(struct seq_file *m) 183 { 184 return file_inode(m->file)->i_private; 185 } 186 187 static void map_iter_free(struct map_iter *iter) 188 { 189 if (iter) { 190 kfree(iter->key); 191 kfree(iter); 192 } 193 } 194 195 static struct map_iter *map_iter_alloc(struct bpf_map *map) 196 { 197 struct map_iter *iter; 198 199 iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); 200 if (!iter) 201 goto error; 202 203 iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); 204 if (!iter->key) 205 goto error; 206 207 return iter; 208 209 error: 210 map_iter_free(iter); 211 return NULL; 212 } 213 214 static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) 215 { 216 struct bpf_map *map = seq_file_to_map(m); 217 void *key = map_iter(m)->key; 218 void *prev_key; 219 220 (*pos)++; 221 if (map_iter(m)->done) 222 return NULL; 223 224 if (unlikely(v == SEQ_START_TOKEN)) 225 prev_key = NULL; 226 else 227 prev_key = key; 228 229 if (map->ops->map_get_next_key(map, prev_key, key)) { 230 map_iter(m)->done = true; 231 return NULL; 232 } 233 return key; 234 } 235 236 static void *map_seq_start(struct seq_file *m, loff_t *pos) 237 { 238 if (map_iter(m)->done) 239 return NULL; 240 241 return *pos ? map_iter(m)->key : SEQ_START_TOKEN; 242 } 243 244 static void map_seq_stop(struct seq_file *m, void *v) 245 { 246 } 247 248 static int map_seq_show(struct seq_file *m, void *v) 249 { 250 struct bpf_map *map = seq_file_to_map(m); 251 void *key = map_iter(m)->key; 252 253 if (unlikely(v == SEQ_START_TOKEN)) { 254 seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); 255 seq_puts(m, "# WARNING!! The output format will change\n"); 256 } else { 257 map->ops->map_seq_show_elem(map, key, m); 258 } 259 260 return 0; 261 } 262 263 static const struct seq_operations bpffs_map_seq_ops = { 264 .start = map_seq_start, 265 .next = map_seq_next, 266 .show = map_seq_show, 267 .stop = map_seq_stop, 268 }; 269 270 static int bpffs_map_open(struct inode *inode, struct file *file) 271 { 272 struct bpf_map *map = inode->i_private; 273 struct map_iter *iter; 274 struct seq_file *m; 275 int err; 276 277 iter = map_iter_alloc(map); 278 if (!iter) 279 return -ENOMEM; 280 281 err = seq_open(file, &bpffs_map_seq_ops); 282 if (err) { 283 map_iter_free(iter); 284 return err; 285 } 286 287 m = file->private_data; 288 m->private = iter; 289 290 return 0; 291 } 292 293 static int bpffs_map_release(struct inode *inode, struct file *file) 294 { 295 struct seq_file *m = file->private_data; 296 297 map_iter_free(map_iter(m)); 298 299 return seq_release(inode, file); 300 } 301 302 /* bpffs_map_fops should only implement the basic 303 * read operation for a BPF map. The purpose is to 304 * provide a simple user intuitive way to do 305 * "cat bpffs/pathto/a-pinned-map". 306 * 307 * Other operations (e.g. write, lookup...) should be realized by 308 * the userspace tools (e.g. bpftool) through the 309 * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update 310 * interface. 311 */ 312 static const struct file_operations bpffs_map_fops = { 313 .open = bpffs_map_open, 314 .read = seq_read, 315 .release = bpffs_map_release, 316 }; 317 318 static int bpffs_obj_open(struct inode *inode, struct file *file) 319 { 320 return -EIO; 321 } 322 323 static const struct file_operations bpffs_obj_fops = { 324 .open = bpffs_obj_open, 325 }; 326 327 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, 328 const struct inode_operations *iops, 329 const struct file_operations *fops) 330 { 331 struct inode *dir = dentry->d_parent->d_inode; 332 struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); 333 if (IS_ERR(inode)) 334 return PTR_ERR(inode); 335 336 inode->i_op = iops; 337 inode->i_fop = fops; 338 inode->i_private = raw; 339 340 bpf_dentry_finalize(dentry, inode, dir); 341 return 0; 342 } 343 344 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) 345 { 346 return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, 347 &bpffs_obj_fops); 348 } 349 350 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) 351 { 352 struct bpf_map *map = arg; 353 354 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, 355 bpf_map_support_seq_show(map) ? 356 &bpffs_map_fops : &bpffs_obj_fops); 357 } 358 359 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) 360 { 361 struct bpf_link *link = arg; 362 363 return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, 364 bpf_link_is_iter(link) ? 365 &bpf_iter_fops : &bpffs_obj_fops); 366 } 367 368 static struct dentry * 369 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) 370 { 371 /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future 372 * extensions. 373 */ 374 if (strchr(dentry->d_name.name, '.')) 375 return ERR_PTR(-EPERM); 376 377 return simple_lookup(dir, dentry, flags); 378 } 379 380 static int bpf_symlink(struct inode *dir, struct dentry *dentry, 381 const char *target) 382 { 383 char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); 384 struct inode *inode; 385 386 if (!link) 387 return -ENOMEM; 388 389 inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); 390 if (IS_ERR(inode)) { 391 kfree(link); 392 return PTR_ERR(inode); 393 } 394 395 inode->i_op = &simple_symlink_inode_operations; 396 inode->i_link = link; 397 398 bpf_dentry_finalize(dentry, inode, dir); 399 return 0; 400 } 401 402 static const struct inode_operations bpf_dir_iops = { 403 .lookup = bpf_lookup, 404 .mkdir = bpf_mkdir, 405 .symlink = bpf_symlink, 406 .rmdir = simple_rmdir, 407 .rename = simple_rename, 408 .link = simple_link, 409 .unlink = simple_unlink, 410 }; 411 412 static int bpf_obj_do_pin(const char __user *pathname, void *raw, 413 enum bpf_type type) 414 { 415 struct dentry *dentry; 416 struct inode *dir; 417 struct path path; 418 umode_t mode; 419 int ret; 420 421 dentry = user_path_create(AT_FDCWD, pathname, &path, 0); 422 if (IS_ERR(dentry)) 423 return PTR_ERR(dentry); 424 425 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); 426 427 ret = security_path_mknod(&path, dentry, mode, 0); 428 if (ret) 429 goto out; 430 431 dir = d_inode(path.dentry); 432 if (dir->i_op != &bpf_dir_iops) { 433 ret = -EPERM; 434 goto out; 435 } 436 437 switch (type) { 438 case BPF_TYPE_PROG: 439 ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); 440 break; 441 case BPF_TYPE_MAP: 442 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); 443 break; 444 case BPF_TYPE_LINK: 445 ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); 446 break; 447 default: 448 ret = -EPERM; 449 } 450 out: 451 done_path_create(&path, dentry); 452 return ret; 453 } 454 455 int bpf_obj_pin_user(u32 ufd, const char __user *pathname) 456 { 457 enum bpf_type type; 458 void *raw; 459 int ret; 460 461 raw = bpf_fd_probe_obj(ufd, &type); 462 if (IS_ERR(raw)) 463 return PTR_ERR(raw); 464 465 ret = bpf_obj_do_pin(pathname, raw, type); 466 if (ret != 0) 467 bpf_any_put(raw, type); 468 469 return ret; 470 } 471 472 static void *bpf_obj_do_get(const char __user *pathname, 473 enum bpf_type *type, int flags) 474 { 475 struct inode *inode; 476 struct path path; 477 void *raw; 478 int ret; 479 480 ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); 481 if (ret) 482 return ERR_PTR(ret); 483 484 inode = d_backing_inode(path.dentry); 485 ret = inode_permission(inode, ACC_MODE(flags)); 486 if (ret) 487 goto out; 488 489 ret = bpf_inode_type(inode, type); 490 if (ret) 491 goto out; 492 493 raw = bpf_any_get(inode->i_private, *type); 494 if (!IS_ERR(raw)) 495 touch_atime(&path); 496 497 path_put(&path); 498 return raw; 499 out: 500 path_put(&path); 501 return ERR_PTR(ret); 502 } 503 504 int bpf_obj_get_user(const char __user *pathname, int flags) 505 { 506 enum bpf_type type = BPF_TYPE_UNSPEC; 507 int f_flags; 508 void *raw; 509 int ret; 510 511 f_flags = bpf_get_file_flag(flags); 512 if (f_flags < 0) 513 return f_flags; 514 515 raw = bpf_obj_do_get(pathname, &type, f_flags); 516 if (IS_ERR(raw)) 517 return PTR_ERR(raw); 518 519 if (type == BPF_TYPE_PROG) 520 ret = bpf_prog_new_fd(raw); 521 else if (type == BPF_TYPE_MAP) 522 ret = bpf_map_new_fd(raw, f_flags); 523 else if (type == BPF_TYPE_LINK) 524 ret = bpf_link_new_fd(raw); 525 else 526 return -ENOENT; 527 528 if (ret < 0) 529 bpf_any_put(raw, type); 530 return ret; 531 } 532 533 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) 534 { 535 struct bpf_prog *prog; 536 int ret = inode_permission(inode, MAY_READ); 537 if (ret) 538 return ERR_PTR(ret); 539 540 if (inode->i_op == &bpf_map_iops) 541 return ERR_PTR(-EINVAL); 542 if (inode->i_op == &bpf_link_iops) 543 return ERR_PTR(-EINVAL); 544 if (inode->i_op != &bpf_prog_iops) 545 return ERR_PTR(-EACCES); 546 547 prog = inode->i_private; 548 549 ret = security_bpf_prog(prog); 550 if (ret < 0) 551 return ERR_PTR(ret); 552 553 if (!bpf_prog_get_ok(prog, &type, false)) 554 return ERR_PTR(-EINVAL); 555 556 bpf_prog_inc(prog); 557 return prog; 558 } 559 560 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) 561 { 562 struct bpf_prog *prog; 563 struct path path; 564 int ret = kern_path(name, LOOKUP_FOLLOW, &path); 565 if (ret) 566 return ERR_PTR(ret); 567 prog = __get_prog_inode(d_backing_inode(path.dentry), type); 568 if (!IS_ERR(prog)) 569 touch_atime(&path); 570 path_put(&path); 571 return prog; 572 } 573 EXPORT_SYMBOL(bpf_prog_get_type_path); 574 575 /* 576 * Display the mount options in /proc/mounts. 577 */ 578 static int bpf_show_options(struct seq_file *m, struct dentry *root) 579 { 580 umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; 581 582 if (mode != S_IRWXUGO) 583 seq_printf(m, ",mode=%o", mode); 584 return 0; 585 } 586 587 static void bpf_free_inode(struct inode *inode) 588 { 589 enum bpf_type type; 590 591 if (S_ISLNK(inode->i_mode)) 592 kfree(inode->i_link); 593 if (!bpf_inode_type(inode, &type)) 594 bpf_any_put(inode->i_private, type); 595 free_inode_nonrcu(inode); 596 } 597 598 static const struct super_operations bpf_super_ops = { 599 .statfs = simple_statfs, 600 .drop_inode = generic_delete_inode, 601 .show_options = bpf_show_options, 602 .free_inode = bpf_free_inode, 603 }; 604 605 enum { 606 OPT_MODE, 607 }; 608 609 static const struct fs_parameter_spec bpf_fs_parameters[] = { 610 fsparam_u32oct ("mode", OPT_MODE), 611 {} 612 }; 613 614 struct bpf_mount_opts { 615 umode_t mode; 616 }; 617 618 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) 619 { 620 struct bpf_mount_opts *opts = fc->fs_private; 621 struct fs_parse_result result; 622 int opt; 623 624 opt = fs_parse(fc, bpf_fs_parameters, param, &result); 625 if (opt < 0) 626 /* We might like to report bad mount options here, but 627 * traditionally we've ignored all mount options, so we'd 628 * better continue to ignore non-existing options for bpf. 629 */ 630 return opt == -ENOPARAM ? 0 : opt; 631 632 switch (opt) { 633 case OPT_MODE: 634 opts->mode = result.uint_32 & S_IALLUGO; 635 break; 636 } 637 638 return 0; 639 } 640 641 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) 642 { 643 static const struct tree_descr bpf_rfiles[] = { { "" } }; 644 struct bpf_mount_opts *opts = fc->fs_private; 645 struct inode *inode; 646 int ret; 647 648 ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); 649 if (ret) 650 return ret; 651 652 sb->s_op = &bpf_super_ops; 653 654 inode = sb->s_root->d_inode; 655 inode->i_op = &bpf_dir_iops; 656 inode->i_mode &= ~S_IALLUGO; 657 inode->i_mode |= S_ISVTX | opts->mode; 658 659 return 0; 660 } 661 662 static int bpf_get_tree(struct fs_context *fc) 663 { 664 return get_tree_nodev(fc, bpf_fill_super); 665 } 666 667 static void bpf_free_fc(struct fs_context *fc) 668 { 669 kfree(fc->fs_private); 670 } 671 672 static const struct fs_context_operations bpf_context_ops = { 673 .free = bpf_free_fc, 674 .parse_param = bpf_parse_param, 675 .get_tree = bpf_get_tree, 676 }; 677 678 /* 679 * Set up the filesystem mount context. 680 */ 681 static int bpf_init_fs_context(struct fs_context *fc) 682 { 683 struct bpf_mount_opts *opts; 684 685 opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); 686 if (!opts) 687 return -ENOMEM; 688 689 opts->mode = S_IRWXUGO; 690 691 fc->fs_private = opts; 692 fc->ops = &bpf_context_ops; 693 return 0; 694 } 695 696 static struct file_system_type bpf_fs_type = { 697 .owner = THIS_MODULE, 698 .name = "bpf", 699 .init_fs_context = bpf_init_fs_context, 700 .parameters = bpf_fs_parameters, 701 .kill_sb = kill_litter_super, 702 }; 703 704 static int __init bpf_init(void) 705 { 706 int ret; 707 708 ret = sysfs_create_mount_point(fs_kobj, "bpf"); 709 if (ret) 710 return ret; 711 712 ret = register_filesystem(&bpf_fs_type); 713 if (ret) 714 sysfs_remove_mount_point(fs_kobj, "bpf"); 715 716 return ret; 717 } 718 fs_initcall(bpf_init); 719