// SPDX-License-Identifier: GPL-2.0-only
/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>

#include "../cgroup/cgroup-internal.h"

DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

void cgroup_bpf_offline(struct cgroup *cgrp)
{
	cgroup_get(cgrp);
	percpu_ref_kill(&cgrp->bpf.refcnt);
}

static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_free(storages[stype]);
}

static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
				     struct bpf_prog *prog)
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype) {
		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
		if (IS_ERR(storages[stype])) {
			storages[stype] = NULL;
			bpf_cgroup_storages_free(storages);
			return -ENOMEM;
		}
	}

	return 0;
}

static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
				       struct bpf_cgroup_storage *src[])
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype)
		dst[stype] = src[stype];
}

static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
				     struct cgroup *cgrp,
				     enum bpf_attach_type attach_type)
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
}

static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[])
{
	enum bpf_cgroup_storage_type stype;

	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_unlink(storages[stype]);
}

/**
 * cgroup_bpf_release() - put references of all bpf programs and
 *                        release all cgroup bpf data
 * @work: work structure embedded into the cgroup to modify
 */
static void cgroup_bpf_release(struct work_struct *work)
{
	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
					       bpf.release_work);
	struct bpf_prog_array *old_array;
	unsigned int type;

	mutex_lock(&cgroup_mutex);

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
		struct list_head *progs = &cgrp->bpf.progs[type];
		struct bpf_prog_list *pl, *tmp;

		list_for_each_entry_safe(pl, tmp, progs, node) {
			list_del(&pl->node);
			bpf_prog_put(pl->prog);
			bpf_cgroup_storages_unlink(pl->storage);
			bpf_cgroup_storages_free(pl->storage);
			kfree(pl);
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
		old_array = rcu_dereference_protected(
				cgrp->bpf.effective[type],
				lockdep_is_held(&cgroup_mutex));
		bpf_prog_array_free(old_array);
	}

	mutex_unlock(&cgroup_mutex);

	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
		cgroup_bpf_put(p);

	percpu_ref_exit(&cgrp->bpf.refcnt);
	cgroup_put(cgrp);
}

/**
 * cgroup_bpf_release_fn() - callback used to schedule releasing
 *                           of bpf cgroup data
 * @ref: percpu ref counter structure
 */
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
{
	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);

	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
	queue_work(system_wq, &cgrp->bpf.release_work);
}

/* Count the number of elements in the list.
 * It's slow, but the list cannot be long.
 */
static u32 prog_list_length(struct list_head *head)
{
	struct bpf_prog_list *pl;
	u32 cnt = 0;

	list_for_each_entry(pl, head, node) {
		if (!pl->prog)
			continue;
		cnt++;
	}
	return cnt;
}

/* If a parent has a non-overridable prog attached, disallow attaching
 * new programs to the descendant cgroup. If a parent has an overridable
 * or multi-prog attachment, allow attaching.
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
				    enum bpf_attach_type type)
{
	struct cgroup *p;

	p = cgroup_parent(cgrp);
	if (!p)
		return true;
	do {
		u32 flags = p->bpf.flags[type];
		u32 cnt;

		if (flags & BPF_F_ALLOW_MULTI)
			return true;
		cnt = prog_list_length(&p->bpf.progs[type]);
		WARN_ON_ONCE(cnt > 1);
		if (cnt == 1)
			return !!(flags & BPF_F_ALLOW_OVERRIDE);
		p = cgroup_parent(p);
	} while (p);
	return true;
}

/* Compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that a parent's BPF_F_ALLOW_OVERRIDE program yields
 * to programs in this cgroup.
 */
static int compute_effective_progs(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   struct bpf_prog_array **array)
{
	struct bpf_prog_array_item *item;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct cgroup *p = cgrp;
	int cnt = 0;

	/* count number of effective programs by walking parents */
	do {
		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			cnt += prog_list_length(&p->bpf.progs[type]);
		p = cgroup_parent(p);
	} while (p);

	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
	if (!progs)
		return -ENOMEM;

	/* populate the array with effective progs */
	cnt = 0;
	p = cgrp;
	do {
		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			continue;

		list_for_each_entry(pl, &p->bpf.progs[type], node) {
			if (!pl->prog)
				continue;

			item = &progs->items[cnt];
			item->prog = pl->prog;
			bpf_cgroup_storages_assign(item->cgroup_storage,
						   pl->storage);
			cnt++;
		}
	} while ((p = cgroup_parent(p)));

	*array = progs;
	return 0;
}

static void activate_effective_progs(struct cgroup *cgrp,
				     enum bpf_attach_type type,
				     struct bpf_prog_array *old_array)
{
	old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
					lockdep_is_held(&cgroup_mutex));
	/* free prog array after grace period, since __cgroup_bpf_run_*()
	 * might be still walking the array
	 */
	bpf_prog_array_free(old_array);
}
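
/* Illustrative sketch of what compute_effective_progs() above builds,
 * assuming a hypothetical hierarchy A -> A/B where every attachment
 * used BPF_F_ALLOW_MULTI:
 *
 *	A:   progs[type] = { a1 }
 *	A/B: progs[type] = { b1, b2 }
 *
 * then compute_effective_progs(B, type, ...) produces:
 *
 *	B->bpf.effective[type] = { b1, b2, a1 }
 *
 * i.e. the cgroup's own programs are placed first, followed by each
 * ancestor's, and BPF_PROG_RUN_ARRAY() walks the array from index 0.
 * Had a1 been attached with BPF_F_ALLOW_OVERRIDE instead, it would be
 * left out of B's effective array as soon as B attaches its own prog.
 */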

/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* Has to use a macro instead of a const int, since the compiler thinks
 * that the array below is variable length.
 */
#define NR ARRAY_SIZE(cgrp->bpf.effective)
	struct bpf_prog_array *arrays[NR] = {};
	struct cgroup *p;
	int ret, i;

	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
			      GFP_KERNEL);
	if (ret)
		return ret;

	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
		cgroup_bpf_get(p);

	for (i = 0; i < NR; i++)
		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

	for (i = 0; i < NR; i++)
		if (compute_effective_progs(cgrp, i, &arrays[i]))
			goto cleanup;

	for (i = 0; i < NR; i++)
		activate_effective_progs(cgrp, i, arrays[i]);

	return 0;
cleanup:
	for (i = 0; i < NR; i++)
		bpf_prog_array_free(arrays[i]);

	percpu_ref_exit(&cgrp->bpf.refcnt);

	return -ENOMEM;
}

static int update_effective_progs(struct cgroup *cgrp,
				  enum bpf_attach_type type)
{
	struct cgroup_subsys_state *css;
	int err;

	/* allocate and recompute effective prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		if (percpu_ref_is_zero(&desc->bpf.refcnt))
			continue;

		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
		if (err)
			goto cleanup;
	}

	/* all allocations were successful. Activate all prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
			if (unlikely(desc->bpf.inactive)) {
				bpf_prog_array_free(desc->bpf.inactive);
				desc->bpf.inactive = NULL;
			}
			continue;
		}

		activate_effective_progs(desc, type, desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return 0;

cleanup:
	/* oom while computing effective. Free all computed effective arrays
	 * since they were not activated
	 */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		bpf_prog_array_free(desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return err;
}

#define BPF_CGROUP_MAX_PROGS 64
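
/* Illustrative sketch of driving the attach path below from user space
 * via libbpf (prog_fd and the cgroup path are hypothetical):
 *
 *	int cgroup_fd = open("/sys/fs/cgroup/unified/app", O_RDONLY);
 *	int err = bpf_prog_attach(prog_fd, cgroup_fd,
 *				  BPF_CGROUP_INET_EGRESS,
 *				  BPF_F_ALLOW_MULTI);
 *
 * The BPF_PROG_ATTACH command lands in cgroup_bpf_prog_attach() and,
 * with cgroup_mutex held, in __cgroup_bpf_attach() below.
 */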

/**
 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to attach
 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
 * @type: Type of attach operation
 * @flags: Option flags
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
			struct bpf_prog *replace_prog,
			enum bpf_attach_type type, u32 flags)
{
	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
	struct bpf_prog_list *pl, *replace_pl = NULL;
	int err;

	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
		/* invalid combination */
		return -EINVAL;

	if (!hierarchy_allows_attach(cgrp, type))
		return -EPERM;

	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
		/* Disallow attaching non-overridable on top
		 * of existing overridable in this cgroup.
		 * Disallow attaching multi-prog if overridable or none.
		 */
		return -EPERM;

	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
		return -E2BIG;

	if (flags & BPF_F_ALLOW_MULTI) {
		list_for_each_entry(pl, progs, node) {
			if (pl->prog == prog)
				/* disallow attaching the same prog twice */
				return -EINVAL;
			if (pl->prog == replace_prog)
				replace_pl = pl;
		}
		if ((flags & BPF_F_REPLACE) && !replace_pl)
			/* prog to replace not found for cgroup */
			return -ENOENT;
	} else if (!list_empty(progs)) {
		replace_pl = list_first_entry(progs, typeof(*pl), node);
	}

	if (bpf_cgroup_storages_alloc(storage, prog))
		return -ENOMEM;

	if (replace_pl) {
		pl = replace_pl;
		old_prog = pl->prog;
		bpf_cgroup_storages_unlink(pl->storage);
		bpf_cgroup_storages_assign(old_storage, pl->storage);
	} else {
		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl) {
			bpf_cgroup_storages_free(storage);
			return -ENOMEM;
		}
		list_add_tail(&pl->node, progs);
	}

	pl->prog = prog;
	bpf_cgroup_storages_assign(pl->storage, storage);
	cgrp->bpf.flags[type] = saved_flags;

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	static_branch_inc(&cgroup_bpf_enabled_key);
	bpf_cgroup_storages_free(old_storage);
	if (old_prog) {
		bpf_prog_put(old_prog);
		static_branch_dec(&cgroup_bpf_enabled_key);
	}
	bpf_cgroup_storages_link(storage, cgrp, type);
	return 0;

cleanup:
	/* and cleanup the prog list */
	pl->prog = old_prog;
	bpf_cgroup_storages_free(pl->storage);
	bpf_cgroup_storages_assign(pl->storage, old_storage);
	bpf_cgroup_storages_link(pl->storage, cgrp, type);
	if (!replace_pl) {
		list_del(&pl->node);
		kfree(pl);
	}
	return err;
}
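
/* Illustrative sketch of the BPF_F_REPLACE path above, using the raw
 * bpf(2) syscall attributes (fd values are hypothetical):
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd	    = cgroup_fd;
 *	attr.attach_bpf_fd  = new_prog_fd;
 *	attr.attach_type    = BPF_CGROUP_INET_EGRESS;
 *	attr.attach_flags   = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
 *	attr.replace_bpf_fd = old_prog_fd;
 *
 *	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 *
 * This atomically swaps old_prog_fd's program for new_prog_fd's in the
 * cgroup's list; without BPF_F_ALLOW_MULTI the combination is rejected
 * with -EINVAL above.
 */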

/**
 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to detach or NULL
 * @type: Type of detach operation
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_prog_list *pl;
	int err;

	if (flags & BPF_F_ALLOW_MULTI) {
		if (!prog)
			/* to detach a MULTI prog the user has to specify a
			 * valid FD of the program to be detached
			 */
			return -EINVAL;
	} else {
		if (list_empty(progs))
			/* report error when trying to detach and nothing is attached */
			return -ENOENT;
	}

	if (flags & BPF_F_ALLOW_MULTI) {
		/* find the prog and detach it */
		list_for_each_entry(pl, progs, node) {
			if (pl->prog != prog)
				continue;
			old_prog = prog;
			/* mark it deleted, so it's ignored while
			 * recomputing effective
			 */
			pl->prog = NULL;
			break;
		}
		if (!old_prog)
			return -ENOENT;
	} else {
		/* to maintain backward compatibility NONE and OVERRIDE cgroups
		 * allow detaching with invalid FD (prog == NULL)
		 */
		pl = list_first_entry(progs, typeof(*pl), node);
		old_prog = pl->prog;
		pl->prog = NULL;
	}

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	/* now can actually delete it from this cgroup list */
	list_del(&pl->node);
	bpf_cgroup_storages_unlink(pl->storage);
	bpf_cgroup_storages_free(pl->storage);
	kfree(pl);
	if (list_empty(progs))
		/* last program was detached, reset flags to zero */
		cgrp->bpf.flags[type] = 0;

	bpf_prog_put(old_prog);
	static_branch_dec(&cgroup_bpf_enabled_key);
	return 0;

cleanup:
	/* and restore back old_prog */
	pl->prog = old_prog;
	return err;
}
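
/* Illustrative sketch of the two detach flavors from user space via
 * libbpf (fd values hypothetical). For BPF_F_ALLOW_MULTI attachments
 * the program fd must identify the exact program to remove:
 *
 *	bpf_prog_detach2(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS);
 *
 * For legacy NONE/OVERRIDE attachments an fd-less detach suffices and
 * maps to prog == NULL above:
 *
 *	bpf_prog_detach(cgroup_fd, BPF_CGROUP_INET_EGRESS);
 */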

/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		       union bpf_attr __user *uattr)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	enum bpf_attach_type type = attr->query.attach_type;
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog_array *effective;
	int cnt, ret = 0, i;

	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
					      lockdep_is_held(&cgroup_mutex));

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
		cnt = bpf_prog_array_length(effective);
	else
		cnt = prog_list_length(progs);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
		return -EFAULT;
	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
		/* return early if user requested only program count + flags */
		return 0;
	if (attr->query.prog_cnt < cnt) {
		cnt = attr->query.prog_cnt;
		ret = -ENOSPC;
	}

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
	} else {
		struct bpf_prog_list *pl;
		u32 id;

		i = 0;
		list_for_each_entry(pl, progs, node) {
			id = pl->prog->aux->id;
			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
				return -EFAULT;
			if (++i == cnt)
				break;
		}
	}
	return ret;
}

int cgroup_bpf_prog_attach(const union bpf_attr *attr,
			   enum bpf_prog_type ptype, struct bpf_prog *prog)
{
	struct bpf_prog *replace_prog = NULL;
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
	    (attr->attach_flags & BPF_F_REPLACE)) {
		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
		if (IS_ERR(replace_prog)) {
			cgroup_put(cgrp);
			return PTR_ERR(replace_prog);
		}
	}

	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
				attr->attach_flags);

	if (replace_prog)
		bpf_prog_put(replace_prog);
	cgroup_put(cgrp);
	return ret;
}

int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
	struct bpf_prog *prog;
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
	if (IS_ERR(prog))
		prog = NULL;

	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
	if (prog)
		bpf_prog_put(prog);

	cgroup_put(cgrp);
	return ret;
}

int cgroup_bpf_prog_query(const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->query.target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_query(cgrp, attr, uattr);

	cgroup_put(cgrp);
	return ret;
}
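
/* Illustrative sketch of querying the attach state handled above, via
 * libbpf (buffer size hypothetical):
 *
 *	__u32 prog_ids[64], prog_cnt = 64, attach_flags;
 *
 *	bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS,
 *		       BPF_F_QUERY_EFFECTIVE, &attach_flags,
 *		       prog_ids, &prog_cnt);
 *
 * With BPF_F_QUERY_EFFECTIVE the ids of the effective array are
 * reported; without it, only the programs attached directly to
 * cgroup_fd. If the supplied array is too small, the first prog_cnt
 * ids are copied and -ENOSPC is returned.
 */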

/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * For egress packets, this function can return:
 *   NET_XMIT_SUCCESS (0)	- continue with packet output
 *   NET_XMIT_DROP    (1)	- drop packet and notify TCP to call cwr
 *   NET_XMIT_CN      (2)	- continue with packet output and notify TCP
 *				  to call cwr
 *   -EPERM			- drop packet
 *
 * For ingress packets, this function will return -EPERM if an
 * attached program was found and it returned != 1 during execution.
 * Otherwise 0 is returned.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
				struct sk_buff *skb,
				enum bpf_attach_type type)
{
	unsigned int offset = skb->data - skb_network_header(skb);
	struct sock *save_sk;
	void *saved_data_end;
	struct cgroup *cgrp;
	int ret;

	if (!sk || !sk_fullsock(sk))
		return 0;

	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	save_sk = skb->sk;
	skb->sk = sk;
	__skb_push(skb, offset);

	/* compute pointers for the bpf prog */
	bpf_compute_and_save_data_end(skb, &saved_data_end);

	if (type == BPF_CGROUP_INET_EGRESS) {
		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
	} else {
		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
					 __bpf_prog_run_save_cb);
		ret = (ret == 1 ? 0 : -EPERM);
	}
	bpf_restore_data_end(skb, saved_data_end);
	__skb_pull(skb, offset);
	skb->sk = save_sk;

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);

/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
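
/* Illustrative sketch of a BPF_PROG_TYPE_CGROUP_SKB program driven by
 * the skb hook above (libbpf section naming; the length cutoff is a
 * hypothetical policy):
 *
 *	SEC("cgroup_skb/egress")
 *	int egress_filter(struct __sk_buff *skb)
 *	{
 *		return skb->len > 1400 ? 0 : 1;	// 0 = drop (-EPERM), 1 = accept
 *	}
 */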

/**
 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
 *                                       the sockaddr provided by user space
 * @sk: sock struct that will use sockaddr
 * @uaddr: sockaddr struct provided by user
 * @type: The type of program to be executed
 * @t_ctx: Pointer to attach type specific context
 *
 * The socket is expected to be of type INET or INET6.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
				      struct sockaddr *uaddr,
				      enum bpf_attach_type type,
				      void *t_ctx)
{
	struct bpf_sock_addr_kern ctx = {
		.sk = sk,
		.uaddr = uaddr,
		.t_ctx = t_ctx,
	};
	struct sockaddr_storage unspec;
	struct cgroup *cgrp;
	int ret;

	/* Check socket family since not all sockets represent network
	 * endpoints (e.g. AF_UNIX).
	 */
	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	if (!ctx.uaddr) {
		memset(&unspec, 0, sizeof(unspec));
		ctx.uaddr = (struct sockaddr *)&unspec;
	}

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);

/**
 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
 * @sk: socket to get cgroup from
 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
 *            sk with connection information (IP addresses, etc.) May not
 *            contain cgroup info if it is a req sock.
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock_ops
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and it returned != 1 during execution. In all other cases, 0 is returned.
 */
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
				     struct bpf_sock_ops_kern *sock_ops,
				     enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	int ret;

	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
				 BPF_PROG_RUN);
	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);

int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
				      short access, enum bpf_attach_type type)
{
	struct cgroup *cgrp;
	struct bpf_cgroup_dev_ctx ctx = {
		.access_type = (access << 16) | dev_type,
		.major = major,
		.minor = minor,
	};
	int allow = 1;

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
				   BPF_PROG_RUN);
	rcu_read_unlock();

	return !allow;
}
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);

static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_map_push_elem:
		return &bpf_map_push_elem_proto;
	case BPF_FUNC_map_pop_elem:
		return &bpf_map_pop_elem_proto;
	case BPF_FUNC_map_peek_elem:
		return &bpf_map_peek_elem_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_local_storage:
		return &bpf_get_local_storage_proto;
	case BPF_FUNC_get_current_cgroup_id:
		return &bpf_get_current_cgroup_id_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
		/* fall through */
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	return cgroup_base_func_proto(func_id, prog);
}
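
/* Illustrative sketch of a BPF_PROG_TYPE_CGROUP_DEVICE program checked
 * via __cgroup_bpf_check_dev_permission() above (permitting only
 * /dev/null, major 1 / minor 3, is a hypothetical policy):
 *
 *	SEC("cgroup/dev")
 *	int dev_filter(struct bpf_cgroup_dev_ctx *ctx)
 *	{
 *		if (ctx->major == 1 && ctx->minor == 3)
 *			return 1;	// allow
 *		return 0;		// deny the device access
 *	}
 */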

static bool cgroup_dev_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (type == BPF_WRITE)
		return false;

	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
		bpf_ctx_record_field_size(info, size_default);
		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
			return false;
		break;
	default:
		if (size != size_default)
			return false;
	}

	return true;
}

const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
	.get_func_proto		= cgroup_dev_func_proto,
	.is_valid_access	= cgroup_dev_is_valid_access,
};
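
/* Illustrative sketch of a BPF_PROG_TYPE_CGROUP_SYSCTL program run by
 * __cgroup_bpf_run_filter_sysctl() below (denying every sysctl write
 * is a hypothetical policy):
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_filter(struct bpf_sysctl *ctx)
 *	{
 *		return ctx->write ? 0 : 1;	// 0 rejects the access (-EPERM)
 *	}
 */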

/**
 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 *
 * @head: sysctl table header
 * @table: sysctl table
 * @write: sysctl is being read (= 0) or written (= 1)
 * @buf: pointer to buffer passed by user space
 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 *	result is size of @new_buf if program set new value, initial value
 *	otherwise
 * @ppos: value-result argument: value is position at which read from or write
 *	to sysctl is happening, result is new position if program overrode it,
 *	initial value otherwise
 * @new_buf: pointer to pointer to new buffer that will be allocated if program
 *	overrides new value provided by user space on sysctl write
 *	NOTE: it's the caller's responsibility to free *new_buf if it was set
 * @type: type of program to be executed
 *
 * Program is run when sysctl is being accessed, either read or written, and
 * can allow or deny such access.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases 0 is returned.
 */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
				   struct ctl_table *table, int write,
				   void __user *buf, size_t *pcount,
				   loff_t *ppos, void **new_buf,
				   enum bpf_attach_type type)
{
	struct bpf_sysctl_kern ctx = {
		.head = head,
		.table = table,
		.write = write,
		.ppos = ppos,
		.cur_val = NULL,
		.cur_len = PAGE_SIZE,
		.new_val = NULL,
		.new_len = 0,
		.new_updated = 0,
	};
	struct cgroup *cgrp;
	int ret;

	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
	if (ctx.cur_val) {
		mm_segment_t old_fs;
		loff_t pos = 0;

		old_fs = get_fs();
		set_fs(KERNEL_DS);
		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
					&ctx.cur_len, &pos)) {
			/* Let BPF program decide how to proceed. */
			ctx.cur_len = 0;
		}
		set_fs(old_fs);
	} else {
		/* Let BPF program decide how to proceed. */
		ctx.cur_len = 0;
	}

	if (write && buf && *pcount) {
		/* The BPF program should be able to override the new value
		 * with a buffer bigger than what user space provided.
		 */
		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
		if (!ctx.new_val ||
		    copy_from_user(ctx.new_val, buf, ctx.new_len))
			/* Let BPF program decide how to proceed. */
			ctx.new_len = 0;
	}

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
	rcu_read_unlock();

	kfree(ctx.cur_val);

	if (ret == 1 && ctx.new_updated) {
		*new_buf = ctx.new_val;
		*pcount = ctx.new_len;
	} else {
		kfree(ctx.new_val);
	}

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);

#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
					     enum bpf_attach_type attach_type)
{
	struct bpf_prog_array *prog_array;
	bool empty;

	rcu_read_lock();
	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
	empty = bpf_prog_array_is_empty(prog_array);
	rcu_read_unlock();

	return empty;
}

static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
		return -EINVAL;

	ctx->optval = kzalloc(max_optlen, GFP_USER);
	if (!ctx->optval)
		return -ENOMEM;

	ctx->optval_end = ctx->optval + max_optlen;

	return 0;
}

static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
{
	kfree(ctx->optval);
}
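
/* Illustrative sketch of a BPF_PROG_TYPE_CGROUP_SOCKOPT program run by
 * __cgroup_bpf_run_filter_setsockopt() below (blocking IP_TOS changes
 * is a hypothetical policy):
 *
 *	SEC("cgroup/setsockopt")
 *	int setsockopt_filter(struct bpf_sockopt *ctx)
 *	{
 *		if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
 *			return 0;	// reject: caller gets -EPERM
 *		return 1;		// run the kernel handler as usual
 *	}
 *
 * A program may also rewrite ctx->optval/ctx->optlen before returning 1,
 * or set ctx->optlen to -1 to bypass the kernel handler entirely.
 */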

int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
				       int *optname, char __user *optval,
				       int *optlen, char **kernel_optval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = *level,
		.optname = *optname,
	};
	int ret, max_optlen;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
		return 0;

	/* Allocate a bit more than the initial user buffer for
	 * BPF program. The canonical use case is overriding
	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
	 */
	max_optlen = max_t(int, 16, *optlen);

	ret = sockopt_alloc_buf(&ctx, max_optlen);
	if (ret)
		return ret;

	ctx.optlen = *optlen;

	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
		ret = -EFAULT;
		goto out;
	}

	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen == -1) {
		/* optlen set to -1, bypass kernel */
		ret = 1;
	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
		/* optlen is out of bounds */
		ret = -EFAULT;
	} else {
		/* optlen within bounds, run kernel handler */
		ret = 0;

		/* export any potential modifications */
		*level = ctx.level;
		*optname = ctx.optname;
		*optlen = ctx.optlen;
		*kernel_optval = ctx.optval;
	}

out:
	if (ret)
		sockopt_free_buf(&ctx);
	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
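
/* Illustrative sketch of the getsockopt counterpart handled below
 * (merely observing the result is a hypothetical use):
 *
 *	SEC("cgroup/getsockopt")
 *	int getsockopt_filter(struct bpf_sockopt *ctx)
 *	{
 *		// ctx->retval holds the kernel handler's result; the
 *		// program may inspect or rewrite ctx->optval within
 *		// [optval, optval_end) and shrink ctx->optlen.
 *		return 1;	// 0 would fail the syscall with -EPERM
 *	}
 */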

int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
				       int optname, char __user *optval,
				       int __user *optlen, int max_optlen,
				       int retval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = level,
		.optname = optname,
		.retval = retval,
	};
	int ret;

	/* Opportunistic check to see whether we have any BPF program
	 * attached to the hook so we don't waste time allocating
	 * memory and locking the socket.
	 */
	if (!cgroup_bpf_enabled ||
	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
		return retval;

	ret = sockopt_alloc_buf(&ctx, max_optlen);
	if (ret)
		return ret;

	ctx.optlen = max_optlen;

	if (!retval) {
		/* If kernel getsockopt finished successfully,
		 * copy whatever was returned to the user back
		 * into our temporary buffer. Set optlen to the
		 * one that kernel returned as well to let
		 * BPF programs inspect the value.
		 */

		if (get_user(ctx.optlen, optlen)) {
			ret = -EFAULT;
			goto out;
		}

		if (ctx.optlen > max_optlen)
			ctx.optlen = max_optlen;

		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
			ret = -EFAULT;
			goto out;
		}
	}

	lock_sock(sk);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
				 &ctx, BPF_PROG_RUN);
	release_sock(sk);

	if (!ret) {
		ret = -EPERM;
		goto out;
	}

	if (ctx.optlen > max_optlen) {
		ret = -EFAULT;
		goto out;
	}

	/* BPF programs are only allowed to set retval to 0, not to some
	 * arbitrary value.
	 */
	if (ctx.retval != 0 && ctx.retval != retval) {
		ret = -EFAULT;
		goto out;
	}

	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
	    put_user(ctx.optlen, optlen)) {
		ret = -EFAULT;
		goto out;
	}

	ret = ctx.retval;

out:
	sockopt_free_buf(&ctx);
	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
#endif

static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
			      size_t *lenp)
{
	ssize_t tmp_ret = 0, ret;

	if (dir->header.parent) {
		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
	if (ret < 0)
		return ret;
	*bufp += ret;
	*lenp -= ret;
	ret += tmp_ret;

	/* Avoid leading slash. */
	if (!ret)
		return ret;

	tmp_ret = strscpy(*bufp, "/", *lenp);
	if (tmp_ret < 0)
		return tmp_ret;
	*bufp += tmp_ret;
	*lenp -= tmp_ret;

	return ret + tmp_ret;
}

BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len, u64, flags)
{
	ssize_t tmp_ret = 0, ret;

	if (!buf)
		return -EINVAL;

	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
		if (!ctx->head)
			return -EINVAL;
		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(buf, ctx->table->procname, buf_len);

	return ret < 0 ? ret : tmp_ret + ret;
}

static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
	.func		= bpf_sysctl_get_name,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	if (!dst)
		return -EINVAL;

	if (!dst_len)
		return -E2BIG;

	if (!src || !src_len) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	memcpy(dst, src, min(dst_len, src_len));

	if (dst_len > src_len) {
		memset(dst + src_len, '\0', dst_len - src_len);
		return src_len;
	}

	dst[dst_len - 1] = '\0';

	return -E2BIG;
}

BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
	   char *, buf, size_t, buf_len)
{
	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
}

static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
	.func		= bpf_sysctl_get_current_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};

BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len)
{
	if (!ctx->write) {
		if (buf && buf_len)
			memset(buf, '\0', buf_len);
		return -EINVAL;
	}
	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
}

static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
	.func		= bpf_sysctl_get_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};

BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
	   const char *, buf, size_t, buf_len)
{
	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
		return -EINVAL;

	if (buf_len > PAGE_SIZE - 1)
		return -E2BIG;

	memcpy(ctx->new_val, buf, buf_len);
	ctx->new_len = buf_len;
	ctx->new_updated = 1;

	return 0;
}

static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
	.func		= bpf_sysctl_set_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};

static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_strtol:
		return &bpf_strtol_proto;
	case BPF_FUNC_strtoul:
		return &bpf_strtoul_proto;
	case BPF_FUNC_sysctl_get_name:
		return &bpf_sysctl_get_name_proto;
	case BPF_FUNC_sysctl_get_current_value:
		return &bpf_sysctl_get_current_value_proto;
	case BPF_FUNC_sysctl_get_new_value:
		return &bpf_sysctl_get_new_value_proto;
	case BPF_FUNC_sysctl_set_new_value:
		return &bpf_sysctl_set_new_value_proto;
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}

static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
				   const struct bpf_prog *prog,
				   struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_sysctl, write):
		if (type != BPF_READ)
			return false;
		bpf_ctx_record_field_size(info, size_default);
		return bpf_ctx_narrow_access_ok(off, size, size_default);
	case bpf_ctx_range(struct bpf_sysctl, file_pos):
		if (type == BPF_READ) {
			bpf_ctx_record_field_size(info, size_default);
			return bpf_ctx_narrow_access_ok(off, size, size_default);
		} else {
			return size == size_default;
		}
	default:
		return false;
	}
}

static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	u32 read_size;

	switch (si->off) {
	case offsetof(struct bpf_sysctl, write):
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct bpf_sysctl_kern, write,
				       sizeof_field(struct bpf_sysctl_kern,
						    write),
				       target_size));
		break;
	case offsetof(struct bpf_sysctl, file_pos):
		/* ppos is a pointer so it should be accessed via indirect
		 * loads and stores. Also for stores an additional temporary
		 * register is used since neither src_reg nor dst_reg can be
		 * overridden.
		 */
		if (type == BPF_WRITE) {
			int treg = BPF_REG_9;

			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			*insn++ = BPF_STX_MEM(
				BPF_DW, si->dst_reg, treg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			*insn++ = BPF_STX_MEM(
				BPF_SIZEOF(u32), treg, si->src_reg,
				bpf_ctx_narrow_access_offset(
					0, sizeof(u32), sizeof(loff_t)));
			*insn++ = BPF_LDX_MEM(
				BPF_DW, treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
		} else {
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				si->dst_reg, si->src_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
			*insn++ = BPF_LDX_MEM(
				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
				bpf_ctx_narrow_access_offset(
					0, read_size, sizeof(loff_t)));
		}
		*target_size = sizeof(u32);
		break;
	}

	return insn - insn_buf;
}

const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
	.get_func_proto		= sysctl_func_proto,
	.is_valid_access	= sysctl_is_valid_access,
	.convert_ctx_access	= sysctl_convert_ctx_access,
};

const struct bpf_prog_ops cg_sysctl_prog_ops = {
};

static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
#ifdef CONFIG_NET
	case BPF_FUNC_sk_storage_get:
		return &bpf_sk_storage_get_proto;
	case BPF_FUNC_sk_storage_delete:
		return &bpf_sk_storage_delete_proto;
#endif
#ifdef CONFIG_INET
	case BPF_FUNC_tcp_sock:
		return &bpf_tcp_sock_proto;
#endif
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}
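
/* Because optval/optval_end are verified as packet pointers (see
 * cg_sockopt_is_valid_access() below), a sockopt program must bounds
 * check before dereferencing the option buffer. Illustrative sketch
 * (the byte-0 policy is hypothetical):
 *
 *	SEC("cgroup/setsockopt")
 *	int check_first_byte(struct bpf_sockopt *ctx)
 *	{
 *		__u8 *p = ctx->optval;
 *
 *		if (p + 1 > (__u8 *)ctx->optval_end)
 *			return 1;	// nothing to inspect
 *		return *p != 0;		// reject options whose first byte is 0
 *	}
 */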

static bool cg_sockopt_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off >= sizeof(struct bpf_sockopt))
		return false;

	if (off % size != 0)
		return false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct bpf_sockopt, retval):
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_GETSOCKOPT;
		case offsetof(struct bpf_sockopt, optname):
			/* fallthrough */
		case offsetof(struct bpf_sockopt, level):
			if (size != size_default)
				return false;
			return prog->expected_attach_type ==
				BPF_CGROUP_SETSOCKOPT;
		case offsetof(struct bpf_sockopt, optlen):
			return size == size_default;
		default:
			return false;
		}
	}

	switch (off) {
	case offsetof(struct bpf_sockopt, sk):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_SOCKET;
		break;
	case offsetof(struct bpf_sockopt, optval):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		if (size != sizeof(__u64))
			return false;
		info->reg_type = PTR_TO_PACKET_END;
		break;
	case offsetof(struct bpf_sockopt, retval):
		if (size != size_default)
			return false;
		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
	default:
		if (size != size_default)
			return false;
		break;
	}
	return true;
}

#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
	  si->dst_reg, si->src_reg,					\
	  offsetof(struct bpf_sockopt_kern, F))

static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
					 const struct bpf_insn *si,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog,
					 u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sockopt, sk):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
		break;
	case offsetof(struct bpf_sockopt, level):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
		break;
	case offsetof(struct bpf_sockopt, optname):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
		break;
	case offsetof(struct bpf_sockopt, optlen):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
		break;
	case offsetof(struct bpf_sockopt, retval):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
		break;
	case offsetof(struct bpf_sockopt, optval):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
		break;
	}

	return insn - insn_buf;
}

static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
				   bool direct_write,
				   const struct bpf_prog *prog)
{
	/* Nothing to do for sockopt argument. The data is kzalloc'd. */
	return 0;
}

const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
	.get_func_proto		= cg_sockopt_func_proto,
	.is_valid_access	= cg_sockopt_is_valid_access,
	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
	.gen_prologue		= cg_sockopt_get_prologue,
};

const struct bpf_prog_ops cg_sockopt_prog_ops = {
};