1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Minimal file system backend for holding eBPF maps and programs, 4 * used by bpf(2) object pinning. 5 * 6 * Authors: 7 * 8 * Daniel Borkmann <daniel@iogearbox.net> 9 */ 10 11 #include <linux/init.h> 12 #include <linux/magic.h> 13 #include <linux/major.h> 14 #include <linux/mount.h> 15 #include <linux/namei.h> 16 #include <linux/fs.h> 17 #include <linux/fs_context.h> 18 #include <linux/fs_parser.h> 19 #include <linux/kdev_t.h> 20 #include <linux/filter.h> 21 #include <linux/bpf.h> 22 #include <linux/bpf_trace.h> 23 24 enum bpf_type { 25 BPF_TYPE_UNSPEC = 0, 26 BPF_TYPE_PROG, 27 BPF_TYPE_MAP, 28 BPF_TYPE_LINK, 29 }; 30 31 static void *bpf_any_get(void *raw, enum bpf_type type) 32 { 33 switch (type) { 34 case BPF_TYPE_PROG: 35 bpf_prog_inc(raw); 36 break; 37 case BPF_TYPE_MAP: 38 bpf_map_inc_with_uref(raw); 39 break; 40 case BPF_TYPE_LINK: 41 bpf_link_inc(raw); 42 break; 43 default: 44 WARN_ON_ONCE(1); 45 break; 46 } 47 48 return raw; 49 } 50 51 static void bpf_any_put(void *raw, enum bpf_type type) 52 { 53 switch (type) { 54 case BPF_TYPE_PROG: 55 bpf_prog_put(raw); 56 break; 57 case BPF_TYPE_MAP: 58 bpf_map_put_with_uref(raw); 59 break; 60 case BPF_TYPE_LINK: 61 bpf_link_put(raw); 62 break; 63 default: 64 WARN_ON_ONCE(1); 65 break; 66 } 67 } 68 69 static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) 70 { 71 void *raw; 72 73 raw = bpf_map_get_with_uref(ufd); 74 if (!IS_ERR(raw)) { 75 *type = BPF_TYPE_MAP; 76 return raw; 77 } 78 79 raw = bpf_prog_get(ufd); 80 if (!IS_ERR(raw)) { 81 *type = BPF_TYPE_PROG; 82 return raw; 83 } 84 85 raw = bpf_link_get_from_fd(ufd); 86 if (!IS_ERR(raw)) { 87 *type = BPF_TYPE_LINK; 88 return raw; 89 } 90 91 return ERR_PTR(-EINVAL); 92 } 93 94 static const struct inode_operations bpf_dir_iops; 95 96 static const struct inode_operations bpf_prog_iops = { }; 97 static const struct inode_operations bpf_map_iops = { }; 98 static const struct inode_operations bpf_link_iops = { }; 99 100 static struct inode *bpf_get_inode(struct super_block *sb, 101 const struct inode *dir, 102 umode_t mode) 103 { 104 struct inode *inode; 105 106 switch (mode & S_IFMT) { 107 case S_IFDIR: 108 case S_IFREG: 109 case S_IFLNK: 110 break; 111 default: 112 return ERR_PTR(-EINVAL); 113 } 114 115 inode = new_inode(sb); 116 if (!inode) 117 return ERR_PTR(-ENOSPC); 118 119 inode->i_ino = get_next_ino(); 120 inode->i_atime = current_time(inode); 121 inode->i_mtime = inode->i_atime; 122 inode->i_ctime = inode->i_atime; 123 124 inode_init_owner(inode, dir, mode); 125 126 return inode; 127 } 128 129 static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) 130 { 131 *type = BPF_TYPE_UNSPEC; 132 if (inode->i_op == &bpf_prog_iops) 133 *type = BPF_TYPE_PROG; 134 else if (inode->i_op == &bpf_map_iops) 135 *type = BPF_TYPE_MAP; 136 else if (inode->i_op == &bpf_link_iops) 137 *type = BPF_TYPE_LINK; 138 else 139 return -EACCES; 140 141 return 0; 142 } 143 144 static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, 145 struct inode *dir) 146 { 147 d_instantiate(dentry, inode); 148 dget(dentry); 149 150 dir->i_mtime = current_time(dir); 151 dir->i_ctime = dir->i_mtime; 152 } 153 154 static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 155 { 156 struct inode *inode; 157 158 inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); 159 if (IS_ERR(inode)) 160 return PTR_ERR(inode); 161 162 inode->i_op = &bpf_dir_iops; 163 inode->i_fop = &simple_dir_operations; 164 165 inc_nlink(inode); 166 inc_nlink(dir); 167 168 bpf_dentry_finalize(dentry, inode, dir); 169 return 0; 170 } 171 172 struct map_iter { 173 void *key; 174 bool done; 175 }; 176 177 static struct map_iter *map_iter(struct seq_file *m) 178 { 179 return m->private; 180 } 181 182 static struct bpf_map *seq_file_to_map(struct seq_file *m) 183 { 184 return file_inode(m->file)->i_private; 185 } 186 187 static void map_iter_free(struct map_iter *iter) 188 { 189 if (iter) { 190 kfree(iter->key); 191 kfree(iter); 192 } 193 } 194 195 static struct map_iter *map_iter_alloc(struct bpf_map *map) 196 { 197 struct map_iter *iter; 198 199 iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); 200 if (!iter) 201 goto error; 202 203 iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); 204 if (!iter->key) 205 goto error; 206 207 return iter; 208 209 error: 210 map_iter_free(iter); 211 return NULL; 212 } 213 214 static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) 215 { 216 struct bpf_map *map = seq_file_to_map(m); 217 void *key = map_iter(m)->key; 218 void *prev_key; 219 220 (*pos)++; 221 if (map_iter(m)->done) 222 return NULL; 223 224 if (unlikely(v == SEQ_START_TOKEN)) 225 prev_key = NULL; 226 else 227 prev_key = key; 228 229 if (map->ops->map_get_next_key(map, prev_key, key)) { 230 map_iter(m)->done = true; 231 return NULL; 232 } 233 return key; 234 } 235 236 static void *map_seq_start(struct seq_file *m, loff_t *pos) 237 { 238 if (map_iter(m)->done) 239 return NULL; 240 241 return *pos ? map_iter(m)->key : SEQ_START_TOKEN; 242 } 243 244 static void map_seq_stop(struct seq_file *m, void *v) 245 { 246 } 247 248 static int map_seq_show(struct seq_file *m, void *v) 249 { 250 struct bpf_map *map = seq_file_to_map(m); 251 void *key = map_iter(m)->key; 252 253 if (unlikely(v == SEQ_START_TOKEN)) { 254 seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); 255 seq_puts(m, "# WARNING!! The output format will change\n"); 256 } else { 257 map->ops->map_seq_show_elem(map, key, m); 258 } 259 260 return 0; 261 } 262 263 static const struct seq_operations bpffs_map_seq_ops = { 264 .start = map_seq_start, 265 .next = map_seq_next, 266 .show = map_seq_show, 267 .stop = map_seq_stop, 268 }; 269 270 static int bpffs_map_open(struct inode *inode, struct file *file) 271 { 272 struct bpf_map *map = inode->i_private; 273 struct map_iter *iter; 274 struct seq_file *m; 275 int err; 276 277 iter = map_iter_alloc(map); 278 if (!iter) 279 return -ENOMEM; 280 281 err = seq_open(file, &bpffs_map_seq_ops); 282 if (err) { 283 map_iter_free(iter); 284 return err; 285 } 286 287 m = file->private_data; 288 m->private = iter; 289 290 return 0; 291 } 292 293 static int bpffs_map_release(struct inode *inode, struct file *file) 294 { 295 struct seq_file *m = file->private_data; 296 297 map_iter_free(map_iter(m)); 298 299 return seq_release(inode, file); 300 } 301 302 /* bpffs_map_fops should only implement the basic 303 * read operation for a BPF map. The purpose is to 304 * provide a simple user intuitive way to do 305 * "cat bpffs/pathto/a-pinned-map". 306 * 307 * Other operations (e.g. write, lookup...) should be realized by 308 * the userspace tools (e.g. bpftool) through the 309 * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update 310 * interface. 311 */ 312 static const struct file_operations bpffs_map_fops = { 313 .open = bpffs_map_open, 314 .read = seq_read, 315 .release = bpffs_map_release, 316 }; 317 318 static int bpffs_obj_open(struct inode *inode, struct file *file) 319 { 320 return -EIO; 321 } 322 323 static const struct file_operations bpffs_obj_fops = { 324 .open = bpffs_obj_open, 325 }; 326 327 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, 328 const struct inode_operations *iops, 329 const struct file_operations *fops) 330 { 331 struct inode *dir = dentry->d_parent->d_inode; 332 struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); 333 if (IS_ERR(inode)) 334 return PTR_ERR(inode); 335 336 inode->i_op = iops; 337 inode->i_fop = fops; 338 inode->i_private = raw; 339 340 bpf_dentry_finalize(dentry, inode, dir); 341 return 0; 342 } 343 344 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) 345 { 346 return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, 347 &bpffs_obj_fops); 348 } 349 350 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) 351 { 352 struct bpf_map *map = arg; 353 354 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, 355 bpf_map_support_seq_show(map) ? 356 &bpffs_map_fops : &bpffs_obj_fops); 357 } 358 359 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) 360 { 361 return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, 362 &bpffs_obj_fops); 363 } 364 365 static struct dentry * 366 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) 367 { 368 /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future 369 * extensions. 370 */ 371 if (strchr(dentry->d_name.name, '.')) 372 return ERR_PTR(-EPERM); 373 374 return simple_lookup(dir, dentry, flags); 375 } 376 377 static int bpf_symlink(struct inode *dir, struct dentry *dentry, 378 const char *target) 379 { 380 char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); 381 struct inode *inode; 382 383 if (!link) 384 return -ENOMEM; 385 386 inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); 387 if (IS_ERR(inode)) { 388 kfree(link); 389 return PTR_ERR(inode); 390 } 391 392 inode->i_op = &simple_symlink_inode_operations; 393 inode->i_link = link; 394 395 bpf_dentry_finalize(dentry, inode, dir); 396 return 0; 397 } 398 399 static const struct inode_operations bpf_dir_iops = { 400 .lookup = bpf_lookup, 401 .mkdir = bpf_mkdir, 402 .symlink = bpf_symlink, 403 .rmdir = simple_rmdir, 404 .rename = simple_rename, 405 .link = simple_link, 406 .unlink = simple_unlink, 407 }; 408 409 static int bpf_obj_do_pin(const char __user *pathname, void *raw, 410 enum bpf_type type) 411 { 412 struct dentry *dentry; 413 struct inode *dir; 414 struct path path; 415 umode_t mode; 416 int ret; 417 418 dentry = user_path_create(AT_FDCWD, pathname, &path, 0); 419 if (IS_ERR(dentry)) 420 return PTR_ERR(dentry); 421 422 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); 423 424 ret = security_path_mknod(&path, dentry, mode, 0); 425 if (ret) 426 goto out; 427 428 dir = d_inode(path.dentry); 429 if (dir->i_op != &bpf_dir_iops) { 430 ret = -EPERM; 431 goto out; 432 } 433 434 switch (type) { 435 case BPF_TYPE_PROG: 436 ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); 437 break; 438 case BPF_TYPE_MAP: 439 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); 440 break; 441 case BPF_TYPE_LINK: 442 ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); 443 break; 444 default: 445 ret = -EPERM; 446 } 447 out: 448 done_path_create(&path, dentry); 449 return ret; 450 } 451 452 int bpf_obj_pin_user(u32 ufd, const char __user *pathname) 453 { 454 enum bpf_type type; 455 void *raw; 456 int ret; 457 458 raw = bpf_fd_probe_obj(ufd, &type); 459 if (IS_ERR(raw)) 460 return PTR_ERR(raw); 461 462 ret = bpf_obj_do_pin(pathname, raw, type); 463 if (ret != 0) 464 bpf_any_put(raw, type); 465 466 return ret; 467 } 468 469 static void *bpf_obj_do_get(const char __user *pathname, 470 enum bpf_type *type, int flags) 471 { 472 struct inode *inode; 473 struct path path; 474 void *raw; 475 int ret; 476 477 ret = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW, &path); 478 if (ret) 479 return ERR_PTR(ret); 480 481 inode = d_backing_inode(path.dentry); 482 ret = inode_permission(inode, ACC_MODE(flags)); 483 if (ret) 484 goto out; 485 486 ret = bpf_inode_type(inode, type); 487 if (ret) 488 goto out; 489 490 raw = bpf_any_get(inode->i_private, *type); 491 if (!IS_ERR(raw)) 492 touch_atime(&path); 493 494 path_put(&path); 495 return raw; 496 out: 497 path_put(&path); 498 return ERR_PTR(ret); 499 } 500 501 int bpf_obj_get_user(const char __user *pathname, int flags) 502 { 503 enum bpf_type type = BPF_TYPE_UNSPEC; 504 int f_flags; 505 void *raw; 506 int ret; 507 508 f_flags = bpf_get_file_flag(flags); 509 if (f_flags < 0) 510 return f_flags; 511 512 raw = bpf_obj_do_get(pathname, &type, f_flags); 513 if (IS_ERR(raw)) 514 return PTR_ERR(raw); 515 516 if (type == BPF_TYPE_PROG) 517 ret = bpf_prog_new_fd(raw); 518 else if (type == BPF_TYPE_MAP) 519 ret = bpf_map_new_fd(raw, f_flags); 520 else if (type == BPF_TYPE_LINK) 521 ret = bpf_link_new_fd(raw); 522 else 523 return -ENOENT; 524 525 if (ret < 0) 526 bpf_any_put(raw, type); 527 return ret; 528 } 529 530 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) 531 { 532 struct bpf_prog *prog; 533 int ret = inode_permission(inode, MAY_READ); 534 if (ret) 535 return ERR_PTR(ret); 536 537 if (inode->i_op == &bpf_map_iops) 538 return ERR_PTR(-EINVAL); 539 if (inode->i_op == &bpf_link_iops) 540 return ERR_PTR(-EINVAL); 541 if (inode->i_op != &bpf_prog_iops) 542 return ERR_PTR(-EACCES); 543 544 prog = inode->i_private; 545 546 ret = security_bpf_prog(prog); 547 if (ret < 0) 548 return ERR_PTR(ret); 549 550 if (!bpf_prog_get_ok(prog, &type, false)) 551 return ERR_PTR(-EINVAL); 552 553 bpf_prog_inc(prog); 554 return prog; 555 } 556 557 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) 558 { 559 struct bpf_prog *prog; 560 struct path path; 561 int ret = kern_path(name, LOOKUP_FOLLOW, &path); 562 if (ret) 563 return ERR_PTR(ret); 564 prog = __get_prog_inode(d_backing_inode(path.dentry), type); 565 if (!IS_ERR(prog)) 566 touch_atime(&path); 567 path_put(&path); 568 return prog; 569 } 570 EXPORT_SYMBOL(bpf_prog_get_type_path); 571 572 /* 573 * Display the mount options in /proc/mounts. 574 */ 575 static int bpf_show_options(struct seq_file *m, struct dentry *root) 576 { 577 umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; 578 579 if (mode != S_IRWXUGO) 580 seq_printf(m, ",mode=%o", mode); 581 return 0; 582 } 583 584 static void bpf_free_inode(struct inode *inode) 585 { 586 enum bpf_type type; 587 588 if (S_ISLNK(inode->i_mode)) 589 kfree(inode->i_link); 590 if (!bpf_inode_type(inode, &type)) 591 bpf_any_put(inode->i_private, type); 592 free_inode_nonrcu(inode); 593 } 594 595 static const struct super_operations bpf_super_ops = { 596 .statfs = simple_statfs, 597 .drop_inode = generic_delete_inode, 598 .show_options = bpf_show_options, 599 .free_inode = bpf_free_inode, 600 }; 601 602 enum { 603 OPT_MODE, 604 }; 605 606 static const struct fs_parameter_spec bpf_fs_parameters[] = { 607 fsparam_u32oct ("mode", OPT_MODE), 608 {} 609 }; 610 611 struct bpf_mount_opts { 612 umode_t mode; 613 }; 614 615 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) 616 { 617 struct bpf_mount_opts *opts = fc->fs_private; 618 struct fs_parse_result result; 619 int opt; 620 621 opt = fs_parse(fc, bpf_fs_parameters, param, &result); 622 if (opt < 0) 623 /* We might like to report bad mount options here, but 624 * traditionally we've ignored all mount options, so we'd 625 * better continue to ignore non-existing options for bpf. 626 */ 627 return opt == -ENOPARAM ? 0 : opt; 628 629 switch (opt) { 630 case OPT_MODE: 631 opts->mode = result.uint_32 & S_IALLUGO; 632 break; 633 } 634 635 return 0; 636 } 637 638 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) 639 { 640 static const struct tree_descr bpf_rfiles[] = { { "" } }; 641 struct bpf_mount_opts *opts = fc->fs_private; 642 struct inode *inode; 643 int ret; 644 645 ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); 646 if (ret) 647 return ret; 648 649 sb->s_op = &bpf_super_ops; 650 651 inode = sb->s_root->d_inode; 652 inode->i_op = &bpf_dir_iops; 653 inode->i_mode &= ~S_IALLUGO; 654 inode->i_mode |= S_ISVTX | opts->mode; 655 656 return 0; 657 } 658 659 static int bpf_get_tree(struct fs_context *fc) 660 { 661 return get_tree_nodev(fc, bpf_fill_super); 662 } 663 664 static void bpf_free_fc(struct fs_context *fc) 665 { 666 kfree(fc->fs_private); 667 } 668 669 static const struct fs_context_operations bpf_context_ops = { 670 .free = bpf_free_fc, 671 .parse_param = bpf_parse_param, 672 .get_tree = bpf_get_tree, 673 }; 674 675 /* 676 * Set up the filesystem mount context. 677 */ 678 static int bpf_init_fs_context(struct fs_context *fc) 679 { 680 struct bpf_mount_opts *opts; 681 682 opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); 683 if (!opts) 684 return -ENOMEM; 685 686 opts->mode = S_IRWXUGO; 687 688 fc->fs_private = opts; 689 fc->ops = &bpf_context_ops; 690 return 0; 691 } 692 693 static struct file_system_type bpf_fs_type = { 694 .owner = THIS_MODULE, 695 .name = "bpf", 696 .init_fs_context = bpf_init_fs_context, 697 .parameters = bpf_fs_parameters, 698 .kill_sb = kill_litter_super, 699 }; 700 701 static int __init bpf_init(void) 702 { 703 int ret; 704 705 ret = sysfs_create_mount_point(fs_kobj, "bpf"); 706 if (ret) 707 return ret; 708 709 ret = register_filesystem(&bpf_fs_type); 710 if (ret) 711 sysfs_remove_mount_point(fs_kobj, "bpf"); 712 713 return ret; 714 } 715 fs_initcall(bpf_init); 716