1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Functions to manage eBPF programs attached to cgroups 4 * 5 * Copyright (c) 2016 Daniel Mack 6 */ 7 8 #include <linux/kernel.h> 9 #include <linux/atomic.h> 10 #include <linux/cgroup.h> 11 #include <linux/filter.h> 12 #include <linux/slab.h> 13 #include <linux/sysctl.h> 14 #include <linux/string.h> 15 #include <linux/bpf.h> 16 #include <linux/bpf-cgroup.h> 17 #include <net/sock.h> 18 #include <net/bpf_sk_storage.h> 19 20 #include "../cgroup/cgroup-internal.h" 21 22 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 23 EXPORT_SYMBOL(cgroup_bpf_enabled_key); 24 25 void cgroup_bpf_offline(struct cgroup *cgrp) 26 { 27 cgroup_get(cgrp); 28 percpu_ref_kill(&cgrp->bpf.refcnt); 29 } 30 31 /** 32 * cgroup_bpf_release() - put references of all bpf programs and 33 * release all cgroup bpf data 34 * @work: work structure embedded into the cgroup to modify 35 */ 36 static void cgroup_bpf_release(struct work_struct *work) 37 { 38 struct cgroup *p, *cgrp = container_of(work, struct cgroup, 39 bpf.release_work); 40 enum bpf_cgroup_storage_type stype; 41 struct bpf_prog_array *old_array; 42 unsigned int type; 43 44 mutex_lock(&cgroup_mutex); 45 46 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { 47 struct list_head *progs = &cgrp->bpf.progs[type]; 48 struct bpf_prog_list *pl, *tmp; 49 50 list_for_each_entry_safe(pl, tmp, progs, node) { 51 list_del(&pl->node); 52 bpf_prog_put(pl->prog); 53 for_each_cgroup_storage_type(stype) { 54 bpf_cgroup_storage_unlink(pl->storage[stype]); 55 bpf_cgroup_storage_free(pl->storage[stype]); 56 } 57 kfree(pl); 58 static_branch_dec(&cgroup_bpf_enabled_key); 59 } 60 old_array = rcu_dereference_protected( 61 cgrp->bpf.effective[type], 62 lockdep_is_held(&cgroup_mutex)); 63 bpf_prog_array_free(old_array); 64 } 65 66 mutex_unlock(&cgroup_mutex); 67 68 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 69 cgroup_bpf_put(p); 70 71 percpu_ref_exit(&cgrp->bpf.refcnt); 72 cgroup_put(cgrp); 73 } 74 75 /** 76 * cgroup_bpf_release_fn() - callback used to schedule releasing 77 * of bpf cgroup data 78 * @ref: percpu ref counter structure 79 */ 80 static void cgroup_bpf_release_fn(struct percpu_ref *ref) 81 { 82 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); 83 84 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); 85 queue_work(system_wq, &cgrp->bpf.release_work); 86 } 87 88 /* count number of elements in the list. 89 * it's slow but the list cannot be long 90 */ 91 static u32 prog_list_length(struct list_head *head) 92 { 93 struct bpf_prog_list *pl; 94 u32 cnt = 0; 95 96 list_for_each_entry(pl, head, node) { 97 if (!pl->prog) 98 continue; 99 cnt++; 100 } 101 return cnt; 102 } 103 104 /* if parent has non-overridable prog attached, 105 * disallow attaching new programs to the descendent cgroup. 106 * if parent has overridable or multi-prog, allow attaching 107 */ 108 static bool hierarchy_allows_attach(struct cgroup *cgrp, 109 enum bpf_attach_type type, 110 u32 new_flags) 111 { 112 struct cgroup *p; 113 114 p = cgroup_parent(cgrp); 115 if (!p) 116 return true; 117 do { 118 u32 flags = p->bpf.flags[type]; 119 u32 cnt; 120 121 if (flags & BPF_F_ALLOW_MULTI) 122 return true; 123 cnt = prog_list_length(&p->bpf.progs[type]); 124 WARN_ON_ONCE(cnt > 1); 125 if (cnt == 1) 126 return !!(flags & BPF_F_ALLOW_OVERRIDE); 127 p = cgroup_parent(p); 128 } while (p); 129 return true; 130 } 131 132 /* compute a chain of effective programs for a given cgroup: 133 * start from the list of programs in this cgroup and add 134 * all parent programs. 135 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding 136 * to programs in this cgroup 137 */ 138 static int compute_effective_progs(struct cgroup *cgrp, 139 enum bpf_attach_type type, 140 struct bpf_prog_array **array) 141 { 142 enum bpf_cgroup_storage_type stype; 143 struct bpf_prog_array *progs; 144 struct bpf_prog_list *pl; 145 struct cgroup *p = cgrp; 146 int cnt = 0; 147 148 /* count number of effective programs by walking parents */ 149 do { 150 if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 151 cnt += prog_list_length(&p->bpf.progs[type]); 152 p = cgroup_parent(p); 153 } while (p); 154 155 progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); 156 if (!progs) 157 return -ENOMEM; 158 159 /* populate the array with effective progs */ 160 cnt = 0; 161 p = cgrp; 162 do { 163 if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 164 continue; 165 166 list_for_each_entry(pl, &p->bpf.progs[type], node) { 167 if (!pl->prog) 168 continue; 169 170 progs->items[cnt].prog = pl->prog; 171 for_each_cgroup_storage_type(stype) 172 progs->items[cnt].cgroup_storage[stype] = 173 pl->storage[stype]; 174 cnt++; 175 } 176 } while ((p = cgroup_parent(p))); 177 178 *array = progs; 179 return 0; 180 } 181 182 static void activate_effective_progs(struct cgroup *cgrp, 183 enum bpf_attach_type type, 184 struct bpf_prog_array *old_array) 185 { 186 old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array, 187 lockdep_is_held(&cgroup_mutex)); 188 /* free prog array after grace period, since __cgroup_bpf_run_*() 189 * might be still walking the array 190 */ 191 bpf_prog_array_free(old_array); 192 } 193 194 /** 195 * cgroup_bpf_inherit() - inherit effective programs from parent 196 * @cgrp: the cgroup to modify 197 */ 198 int cgroup_bpf_inherit(struct cgroup *cgrp) 199 { 200 /* has to use marco instead of const int, since compiler thinks 201 * that array below is variable length 202 */ 203 #define NR ARRAY_SIZE(cgrp->bpf.effective) 204 struct bpf_prog_array *arrays[NR] = {}; 205 struct cgroup *p; 206 int ret, i; 207 208 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, 209 GFP_KERNEL); 210 if (ret) 211 return ret; 212 213 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 214 cgroup_bpf_get(p); 215 216 for (i = 0; i < NR; i++) 217 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 218 219 for (i = 0; i < NR; i++) 220 if (compute_effective_progs(cgrp, i, &arrays[i])) 221 goto cleanup; 222 223 for (i = 0; i < NR; i++) 224 activate_effective_progs(cgrp, i, arrays[i]); 225 226 return 0; 227 cleanup: 228 for (i = 0; i < NR; i++) 229 bpf_prog_array_free(arrays[i]); 230 231 percpu_ref_exit(&cgrp->bpf.refcnt); 232 233 return -ENOMEM; 234 } 235 236 static int update_effective_progs(struct cgroup *cgrp, 237 enum bpf_attach_type type) 238 { 239 struct cgroup_subsys_state *css; 240 int err; 241 242 /* allocate and recompute effective prog arrays */ 243 css_for_each_descendant_pre(css, &cgrp->self) { 244 struct cgroup *desc = container_of(css, struct cgroup, self); 245 246 if (percpu_ref_is_zero(&desc->bpf.refcnt)) 247 continue; 248 249 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 250 if (err) 251 goto cleanup; 252 } 253 254 /* all allocations were successful. Activate all prog arrays */ 255 css_for_each_descendant_pre(css, &cgrp->self) { 256 struct cgroup *desc = container_of(css, struct cgroup, self); 257 258 if (percpu_ref_is_zero(&desc->bpf.refcnt)) { 259 if (unlikely(desc->bpf.inactive)) { 260 bpf_prog_array_free(desc->bpf.inactive); 261 desc->bpf.inactive = NULL; 262 } 263 continue; 264 } 265 266 activate_effective_progs(desc, type, desc->bpf.inactive); 267 desc->bpf.inactive = NULL; 268 } 269 270 return 0; 271 272 cleanup: 273 /* oom while computing effective. Free all computed effective arrays 274 * since they were not activated 275 */ 276 css_for_each_descendant_pre(css, &cgrp->self) { 277 struct cgroup *desc = container_of(css, struct cgroup, self); 278 279 bpf_prog_array_free(desc->bpf.inactive); 280 desc->bpf.inactive = NULL; 281 } 282 283 return err; 284 } 285 286 #define BPF_CGROUP_MAX_PROGS 64 287 288 /** 289 * __cgroup_bpf_attach() - Attach the program to a cgroup, and 290 * propagate the change to descendants 291 * @cgrp: The cgroup which descendants to traverse 292 * @prog: A program to attach 293 * @type: Type of attach operation 294 * @flags: Option flags 295 * 296 * Must be called with cgroup_mutex held. 297 */ 298 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, 299 enum bpf_attach_type type, u32 flags) 300 { 301 struct list_head *progs = &cgrp->bpf.progs[type]; 302 struct bpf_prog *old_prog = NULL; 303 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE], 304 *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL}; 305 enum bpf_cgroup_storage_type stype; 306 struct bpf_prog_list *pl; 307 bool pl_was_allocated; 308 int err; 309 310 if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) 311 /* invalid combination */ 312 return -EINVAL; 313 314 if (!hierarchy_allows_attach(cgrp, type, flags)) 315 return -EPERM; 316 317 if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) 318 /* Disallow attaching non-overridable on top 319 * of existing overridable in this cgroup. 320 * Disallow attaching multi-prog if overridable or none 321 */ 322 return -EPERM; 323 324 if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) 325 return -E2BIG; 326 327 for_each_cgroup_storage_type(stype) { 328 storage[stype] = bpf_cgroup_storage_alloc(prog, stype); 329 if (IS_ERR(storage[stype])) { 330 storage[stype] = NULL; 331 for_each_cgroup_storage_type(stype) 332 bpf_cgroup_storage_free(storage[stype]); 333 return -ENOMEM; 334 } 335 } 336 337 if (flags & BPF_F_ALLOW_MULTI) { 338 list_for_each_entry(pl, progs, node) { 339 if (pl->prog == prog) { 340 /* disallow attaching the same prog twice */ 341 for_each_cgroup_storage_type(stype) 342 bpf_cgroup_storage_free(storage[stype]); 343 return -EINVAL; 344 } 345 } 346 347 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 348 if (!pl) { 349 for_each_cgroup_storage_type(stype) 350 bpf_cgroup_storage_free(storage[stype]); 351 return -ENOMEM; 352 } 353 354 pl_was_allocated = true; 355 pl->prog = prog; 356 for_each_cgroup_storage_type(stype) 357 pl->storage[stype] = storage[stype]; 358 list_add_tail(&pl->node, progs); 359 } else { 360 if (list_empty(progs)) { 361 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 362 if (!pl) { 363 for_each_cgroup_storage_type(stype) 364 bpf_cgroup_storage_free(storage[stype]); 365 return -ENOMEM; 366 } 367 pl_was_allocated = true; 368 list_add_tail(&pl->node, progs); 369 } else { 370 pl = list_first_entry(progs, typeof(*pl), node); 371 old_prog = pl->prog; 372 for_each_cgroup_storage_type(stype) { 373 old_storage[stype] = pl->storage[stype]; 374 bpf_cgroup_storage_unlink(old_storage[stype]); 375 } 376 pl_was_allocated = false; 377 } 378 pl->prog = prog; 379 for_each_cgroup_storage_type(stype) 380 pl->storage[stype] = storage[stype]; 381 } 382 383 cgrp->bpf.flags[type] = flags; 384 385 err = update_effective_progs(cgrp, type); 386 if (err) 387 goto cleanup; 388 389 static_branch_inc(&cgroup_bpf_enabled_key); 390 for_each_cgroup_storage_type(stype) { 391 if (!old_storage[stype]) 392 continue; 393 bpf_cgroup_storage_free(old_storage[stype]); 394 } 395 if (old_prog) { 396 bpf_prog_put(old_prog); 397 static_branch_dec(&cgroup_bpf_enabled_key); 398 } 399 for_each_cgroup_storage_type(stype) 400 bpf_cgroup_storage_link(storage[stype], cgrp, type); 401 return 0; 402 403 cleanup: 404 /* and cleanup the prog list */ 405 pl->prog = old_prog; 406 for_each_cgroup_storage_type(stype) { 407 bpf_cgroup_storage_free(pl->storage[stype]); 408 pl->storage[stype] = old_storage[stype]; 409 bpf_cgroup_storage_link(old_storage[stype], cgrp, type); 410 } 411 if (pl_was_allocated) { 412 list_del(&pl->node); 413 kfree(pl); 414 } 415 return err; 416 } 417 418 /** 419 * __cgroup_bpf_detach() - Detach the program from a cgroup, and 420 * propagate the change to descendants 421 * @cgrp: The cgroup which descendants to traverse 422 * @prog: A program to detach or NULL 423 * @type: Type of detach operation 424 * 425 * Must be called with cgroup_mutex held. 426 */ 427 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 428 enum bpf_attach_type type) 429 { 430 struct list_head *progs = &cgrp->bpf.progs[type]; 431 enum bpf_cgroup_storage_type stype; 432 u32 flags = cgrp->bpf.flags[type]; 433 struct bpf_prog *old_prog = NULL; 434 struct bpf_prog_list *pl; 435 int err; 436 437 if (flags & BPF_F_ALLOW_MULTI) { 438 if (!prog) 439 /* to detach MULTI prog the user has to specify valid FD 440 * of the program to be detached 441 */ 442 return -EINVAL; 443 } else { 444 if (list_empty(progs)) 445 /* report error when trying to detach and nothing is attached */ 446 return -ENOENT; 447 } 448 449 if (flags & BPF_F_ALLOW_MULTI) { 450 /* find the prog and detach it */ 451 list_for_each_entry(pl, progs, node) { 452 if (pl->prog != prog) 453 continue; 454 old_prog = prog; 455 /* mark it deleted, so it's ignored while 456 * recomputing effective 457 */ 458 pl->prog = NULL; 459 break; 460 } 461 if (!old_prog) 462 return -ENOENT; 463 } else { 464 /* to maintain backward compatibility NONE and OVERRIDE cgroups 465 * allow detaching with invalid FD (prog==NULL) 466 */ 467 pl = list_first_entry(progs, typeof(*pl), node); 468 old_prog = pl->prog; 469 pl->prog = NULL; 470 } 471 472 err = update_effective_progs(cgrp, type); 473 if (err) 474 goto cleanup; 475 476 /* now can actually delete it from this cgroup list */ 477 list_del(&pl->node); 478 for_each_cgroup_storage_type(stype) { 479 bpf_cgroup_storage_unlink(pl->storage[stype]); 480 bpf_cgroup_storage_free(pl->storage[stype]); 481 } 482 kfree(pl); 483 if (list_empty(progs)) 484 /* last program was detached, reset flags to zero */ 485 cgrp->bpf.flags[type] = 0; 486 487 bpf_prog_put(old_prog); 488 static_branch_dec(&cgroup_bpf_enabled_key); 489 return 0; 490 491 cleanup: 492 /* and restore back old_prog */ 493 pl->prog = old_prog; 494 return err; 495 } 496 497 /* Must be called with cgroup_mutex held to avoid races. */ 498 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, 499 union bpf_attr __user *uattr) 500 { 501 __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 502 enum bpf_attach_type type = attr->query.attach_type; 503 struct list_head *progs = &cgrp->bpf.progs[type]; 504 u32 flags = cgrp->bpf.flags[type]; 505 struct bpf_prog_array *effective; 506 int cnt, ret = 0, i; 507 508 effective = rcu_dereference_protected(cgrp->bpf.effective[type], 509 lockdep_is_held(&cgroup_mutex)); 510 511 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) 512 cnt = bpf_prog_array_length(effective); 513 else 514 cnt = prog_list_length(progs); 515 516 if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) 517 return -EFAULT; 518 if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) 519 return -EFAULT; 520 if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) 521 /* return early if user requested only program count + flags */ 522 return 0; 523 if (attr->query.prog_cnt < cnt) { 524 cnt = attr->query.prog_cnt; 525 ret = -ENOSPC; 526 } 527 528 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { 529 return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); 530 } else { 531 struct bpf_prog_list *pl; 532 u32 id; 533 534 i = 0; 535 list_for_each_entry(pl, progs, node) { 536 id = pl->prog->aux->id; 537 if (copy_to_user(prog_ids + i, &id, sizeof(id))) 538 return -EFAULT; 539 if (++i == cnt) 540 break; 541 } 542 } 543 return ret; 544 } 545 546 int cgroup_bpf_prog_attach(const union bpf_attr *attr, 547 enum bpf_prog_type ptype, struct bpf_prog *prog) 548 { 549 struct cgroup *cgrp; 550 int ret; 551 552 cgrp = cgroup_get_from_fd(attr->target_fd); 553 if (IS_ERR(cgrp)) 554 return PTR_ERR(cgrp); 555 556 ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, 557 attr->attach_flags); 558 cgroup_put(cgrp); 559 return ret; 560 } 561 562 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) 563 { 564 struct bpf_prog *prog; 565 struct cgroup *cgrp; 566 int ret; 567 568 cgrp = cgroup_get_from_fd(attr->target_fd); 569 if (IS_ERR(cgrp)) 570 return PTR_ERR(cgrp); 571 572 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 573 if (IS_ERR(prog)) 574 prog = NULL; 575 576 ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); 577 if (prog) 578 bpf_prog_put(prog); 579 580 cgroup_put(cgrp); 581 return ret; 582 } 583 584 int cgroup_bpf_prog_query(const union bpf_attr *attr, 585 union bpf_attr __user *uattr) 586 { 587 struct cgroup *cgrp; 588 int ret; 589 590 cgrp = cgroup_get_from_fd(attr->query.target_fd); 591 if (IS_ERR(cgrp)) 592 return PTR_ERR(cgrp); 593 594 ret = cgroup_bpf_query(cgrp, attr, uattr); 595 596 cgroup_put(cgrp); 597 return ret; 598 } 599 600 /** 601 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 602 * @sk: The socket sending or receiving traffic 603 * @skb: The skb that is being sent or received 604 * @type: The type of program to be exectuted 605 * 606 * If no socket is passed, or the socket is not of type INET or INET6, 607 * this function does nothing and returns 0. 608 * 609 * The program type passed in via @type must be suitable for network 610 * filtering. No further check is performed to assert that. 611 * 612 * For egress packets, this function can return: 613 * NET_XMIT_SUCCESS (0) - continue with packet output 614 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr 615 * NET_XMIT_CN (2) - continue with packet output and notify TCP 616 * to call cwr 617 * -EPERM - drop packet 618 * 619 * For ingress packets, this function will return -EPERM if any 620 * attached program was found and if it returned != 1 during execution. 621 * Otherwise 0 is returned. 622 */ 623 int __cgroup_bpf_run_filter_skb(struct sock *sk, 624 struct sk_buff *skb, 625 enum bpf_attach_type type) 626 { 627 unsigned int offset = skb->data - skb_network_header(skb); 628 struct sock *save_sk; 629 void *saved_data_end; 630 struct cgroup *cgrp; 631 int ret; 632 633 if (!sk || !sk_fullsock(sk)) 634 return 0; 635 636 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 637 return 0; 638 639 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 640 save_sk = skb->sk; 641 skb->sk = sk; 642 __skb_push(skb, offset); 643 644 /* compute pointers for the bpf prog */ 645 bpf_compute_and_save_data_end(skb, &saved_data_end); 646 647 if (type == BPF_CGROUP_INET_EGRESS) { 648 ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( 649 cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); 650 } else { 651 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 652 __bpf_prog_run_save_cb); 653 ret = (ret == 1 ? 0 : -EPERM); 654 } 655 bpf_restore_data_end(skb, saved_data_end); 656 __skb_pull(skb, offset); 657 skb->sk = save_sk; 658 659 return ret; 660 } 661 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 662 663 /** 664 * __cgroup_bpf_run_filter_sk() - Run a program on a sock 665 * @sk: sock structure to manipulate 666 * @type: The type of program to be exectuted 667 * 668 * socket is passed is expected to be of type INET or INET6. 669 * 670 * The program type passed in via @type must be suitable for sock 671 * filtering. No further check is performed to assert that. 672 * 673 * This function will return %-EPERM if any if an attached program was found 674 * and if it returned != 1 during execution. In all other cases, 0 is returned. 675 */ 676 int __cgroup_bpf_run_filter_sk(struct sock *sk, 677 enum bpf_attach_type type) 678 { 679 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 680 int ret; 681 682 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); 683 return ret == 1 ? 0 : -EPERM; 684 } 685 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 686 687 /** 688 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and 689 * provided by user sockaddr 690 * @sk: sock struct that will use sockaddr 691 * @uaddr: sockaddr struct provided by user 692 * @type: The type of program to be exectuted 693 * @t_ctx: Pointer to attach type specific context 694 * 695 * socket is expected to be of type INET or INET6. 696 * 697 * This function will return %-EPERM if an attached program is found and 698 * returned value != 1 during execution. In all other cases, 0 is returned. 699 */ 700 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, 701 struct sockaddr *uaddr, 702 enum bpf_attach_type type, 703 void *t_ctx) 704 { 705 struct bpf_sock_addr_kern ctx = { 706 .sk = sk, 707 .uaddr = uaddr, 708 .t_ctx = t_ctx, 709 }; 710 struct sockaddr_storage unspec; 711 struct cgroup *cgrp; 712 int ret; 713 714 /* Check socket family since not all sockets represent network 715 * endpoint (e.g. AF_UNIX). 716 */ 717 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 718 return 0; 719 720 if (!ctx.uaddr) { 721 memset(&unspec, 0, sizeof(unspec)); 722 ctx.uaddr = (struct sockaddr *)&unspec; 723 } 724 725 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 726 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); 727 728 return ret == 1 ? 0 : -EPERM; 729 } 730 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); 731 732 /** 733 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 734 * @sk: socket to get cgroup from 735 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 736 * sk with connection information (IP addresses, etc.) May not contain 737 * cgroup info if it is a req sock. 738 * @type: The type of program to be exectuted 739 * 740 * socket passed is expected to be of type INET or INET6. 741 * 742 * The program type passed in via @type must be suitable for sock_ops 743 * filtering. No further check is performed to assert that. 744 * 745 * This function will return %-EPERM if any if an attached program was found 746 * and if it returned != 1 during execution. In all other cases, 0 is returned. 747 */ 748 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, 749 struct bpf_sock_ops_kern *sock_ops, 750 enum bpf_attach_type type) 751 { 752 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 753 int ret; 754 755 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, 756 BPF_PROG_RUN); 757 return ret == 1 ? 0 : -EPERM; 758 } 759 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); 760 761 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, 762 short access, enum bpf_attach_type type) 763 { 764 struct cgroup *cgrp; 765 struct bpf_cgroup_dev_ctx ctx = { 766 .access_type = (access << 16) | dev_type, 767 .major = major, 768 .minor = minor, 769 }; 770 int allow = 1; 771 772 rcu_read_lock(); 773 cgrp = task_dfl_cgroup(current); 774 allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, 775 BPF_PROG_RUN); 776 rcu_read_unlock(); 777 778 return !allow; 779 } 780 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); 781 782 static const struct bpf_func_proto * 783 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 784 { 785 switch (func_id) { 786 case BPF_FUNC_map_lookup_elem: 787 return &bpf_map_lookup_elem_proto; 788 case BPF_FUNC_map_update_elem: 789 return &bpf_map_update_elem_proto; 790 case BPF_FUNC_map_delete_elem: 791 return &bpf_map_delete_elem_proto; 792 case BPF_FUNC_map_push_elem: 793 return &bpf_map_push_elem_proto; 794 case BPF_FUNC_map_pop_elem: 795 return &bpf_map_pop_elem_proto; 796 case BPF_FUNC_map_peek_elem: 797 return &bpf_map_peek_elem_proto; 798 case BPF_FUNC_get_current_uid_gid: 799 return &bpf_get_current_uid_gid_proto; 800 case BPF_FUNC_get_local_storage: 801 return &bpf_get_local_storage_proto; 802 case BPF_FUNC_get_current_cgroup_id: 803 return &bpf_get_current_cgroup_id_proto; 804 case BPF_FUNC_trace_printk: 805 if (capable(CAP_SYS_ADMIN)) 806 return bpf_get_trace_printk_proto(); 807 /* fall through */ 808 default: 809 return NULL; 810 } 811 } 812 813 static const struct bpf_func_proto * 814 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 815 { 816 return cgroup_base_func_proto(func_id, prog); 817 } 818 819 static bool cgroup_dev_is_valid_access(int off, int size, 820 enum bpf_access_type type, 821 const struct bpf_prog *prog, 822 struct bpf_insn_access_aux *info) 823 { 824 const int size_default = sizeof(__u32); 825 826 if (type == BPF_WRITE) 827 return false; 828 829 if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) 830 return false; 831 /* The verifier guarantees that size > 0. */ 832 if (off % size != 0) 833 return false; 834 835 switch (off) { 836 case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): 837 bpf_ctx_record_field_size(info, size_default); 838 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 839 return false; 840 break; 841 default: 842 if (size != size_default) 843 return false; 844 } 845 846 return true; 847 } 848 849 const struct bpf_prog_ops cg_dev_prog_ops = { 850 }; 851 852 const struct bpf_verifier_ops cg_dev_verifier_ops = { 853 .get_func_proto = cgroup_dev_func_proto, 854 .is_valid_access = cgroup_dev_is_valid_access, 855 }; 856 857 /** 858 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl 859 * 860 * @head: sysctl table header 861 * @table: sysctl table 862 * @write: sysctl is being read (= 0) or written (= 1) 863 * @buf: pointer to buffer passed by user space 864 * @pcount: value-result argument: value is size of buffer pointed to by @buf, 865 * result is size of @new_buf if program set new value, initial value 866 * otherwise 867 * @ppos: value-result argument: value is position at which read from or write 868 * to sysctl is happening, result is new position if program overrode it, 869 * initial value otherwise 870 * @new_buf: pointer to pointer to new buffer that will be allocated if program 871 * overrides new value provided by user space on sysctl write 872 * NOTE: it's caller responsibility to free *new_buf if it was set 873 * @type: type of program to be executed 874 * 875 * Program is run when sysctl is being accessed, either read or written, and 876 * can allow or deny such access. 877 * 878 * This function will return %-EPERM if an attached program is found and 879 * returned value != 1 during execution. In all other cases 0 is returned. 880 */ 881 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, 882 struct ctl_table *table, int write, 883 void __user *buf, size_t *pcount, 884 loff_t *ppos, void **new_buf, 885 enum bpf_attach_type type) 886 { 887 struct bpf_sysctl_kern ctx = { 888 .head = head, 889 .table = table, 890 .write = write, 891 .ppos = ppos, 892 .cur_val = NULL, 893 .cur_len = PAGE_SIZE, 894 .new_val = NULL, 895 .new_len = 0, 896 .new_updated = 0, 897 }; 898 struct cgroup *cgrp; 899 int ret; 900 901 ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); 902 if (ctx.cur_val) { 903 mm_segment_t old_fs; 904 loff_t pos = 0; 905 906 old_fs = get_fs(); 907 set_fs(KERNEL_DS); 908 if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, 909 &ctx.cur_len, &pos)) { 910 /* Let BPF program decide how to proceed. */ 911 ctx.cur_len = 0; 912 } 913 set_fs(old_fs); 914 } else { 915 /* Let BPF program decide how to proceed. */ 916 ctx.cur_len = 0; 917 } 918 919 if (write && buf && *pcount) { 920 /* BPF program should be able to override new value with a 921 * buffer bigger than provided by user. 922 */ 923 ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); 924 ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); 925 if (!ctx.new_val || 926 copy_from_user(ctx.new_val, buf, ctx.new_len)) 927 /* Let BPF program decide how to proceed. */ 928 ctx.new_len = 0; 929 } 930 931 rcu_read_lock(); 932 cgrp = task_dfl_cgroup(current); 933 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); 934 rcu_read_unlock(); 935 936 kfree(ctx.cur_val); 937 938 if (ret == 1 && ctx.new_updated) { 939 *new_buf = ctx.new_val; 940 *pcount = ctx.new_len; 941 } else { 942 kfree(ctx.new_val); 943 } 944 945 return ret == 1 ? 0 : -EPERM; 946 } 947 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); 948 949 #ifdef CONFIG_NET 950 static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, 951 enum bpf_attach_type attach_type) 952 { 953 struct bpf_prog_array *prog_array; 954 bool empty; 955 956 rcu_read_lock(); 957 prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); 958 empty = bpf_prog_array_is_empty(prog_array); 959 rcu_read_unlock(); 960 961 return empty; 962 } 963 964 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) 965 { 966 if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) 967 return -EINVAL; 968 969 ctx->optval = kzalloc(max_optlen, GFP_USER); 970 if (!ctx->optval) 971 return -ENOMEM; 972 973 ctx->optval_end = ctx->optval + max_optlen; 974 975 return 0; 976 } 977 978 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) 979 { 980 kfree(ctx->optval); 981 } 982 983 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, 984 int *optname, char __user *optval, 985 int *optlen, char **kernel_optval) 986 { 987 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 988 struct bpf_sockopt_kern ctx = { 989 .sk = sk, 990 .level = *level, 991 .optname = *optname, 992 }; 993 int ret, max_optlen; 994 995 /* Opportunistic check to see whether we have any BPF program 996 * attached to the hook so we don't waste time allocating 997 * memory and locking the socket. 998 */ 999 if (!cgroup_bpf_enabled || 1000 __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) 1001 return 0; 1002 1003 /* Allocate a bit more than the initial user buffer for 1004 * BPF program. The canonical use case is overriding 1005 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). 1006 */ 1007 max_optlen = max_t(int, 16, *optlen); 1008 1009 ret = sockopt_alloc_buf(&ctx, max_optlen); 1010 if (ret) 1011 return ret; 1012 1013 ctx.optlen = *optlen; 1014 1015 if (copy_from_user(ctx.optval, optval, *optlen) != 0) { 1016 ret = -EFAULT; 1017 goto out; 1018 } 1019 1020 lock_sock(sk); 1021 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], 1022 &ctx, BPF_PROG_RUN); 1023 release_sock(sk); 1024 1025 if (!ret) { 1026 ret = -EPERM; 1027 goto out; 1028 } 1029 1030 if (ctx.optlen == -1) { 1031 /* optlen set to -1, bypass kernel */ 1032 ret = 1; 1033 } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { 1034 /* optlen is out of bounds */ 1035 ret = -EFAULT; 1036 } else { 1037 /* optlen within bounds, run kernel handler */ 1038 ret = 0; 1039 1040 /* export any potential modifications */ 1041 *level = ctx.level; 1042 *optname = ctx.optname; 1043 *optlen = ctx.optlen; 1044 *kernel_optval = ctx.optval; 1045 } 1046 1047 out: 1048 if (ret) 1049 sockopt_free_buf(&ctx); 1050 return ret; 1051 } 1052 EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); 1053 1054 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, 1055 int optname, char __user *optval, 1056 int __user *optlen, int max_optlen, 1057 int retval) 1058 { 1059 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1060 struct bpf_sockopt_kern ctx = { 1061 .sk = sk, 1062 .level = level, 1063 .optname = optname, 1064 .retval = retval, 1065 }; 1066 int ret; 1067 1068 /* Opportunistic check to see whether we have any BPF program 1069 * attached to the hook so we don't waste time allocating 1070 * memory and locking the socket. 1071 */ 1072 if (!cgroup_bpf_enabled || 1073 __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) 1074 return retval; 1075 1076 ret = sockopt_alloc_buf(&ctx, max_optlen); 1077 if (ret) 1078 return ret; 1079 1080 ctx.optlen = max_optlen; 1081 1082 if (!retval) { 1083 /* If kernel getsockopt finished successfully, 1084 * copy whatever was returned to the user back 1085 * into our temporary buffer. Set optlen to the 1086 * one that kernel returned as well to let 1087 * BPF programs inspect the value. 1088 */ 1089 1090 if (get_user(ctx.optlen, optlen)) { 1091 ret = -EFAULT; 1092 goto out; 1093 } 1094 1095 if (ctx.optlen > max_optlen) 1096 ctx.optlen = max_optlen; 1097 1098 if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { 1099 ret = -EFAULT; 1100 goto out; 1101 } 1102 } 1103 1104 lock_sock(sk); 1105 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], 1106 &ctx, BPF_PROG_RUN); 1107 release_sock(sk); 1108 1109 if (!ret) { 1110 ret = -EPERM; 1111 goto out; 1112 } 1113 1114 if (ctx.optlen > max_optlen) { 1115 ret = -EFAULT; 1116 goto out; 1117 } 1118 1119 /* BPF programs only allowed to set retval to 0, not some 1120 * arbitrary value. 1121 */ 1122 if (ctx.retval != 0 && ctx.retval != retval) { 1123 ret = -EFAULT; 1124 goto out; 1125 } 1126 1127 if (copy_to_user(optval, ctx.optval, ctx.optlen) || 1128 put_user(ctx.optlen, optlen)) { 1129 ret = -EFAULT; 1130 goto out; 1131 } 1132 1133 ret = ctx.retval; 1134 1135 out: 1136 sockopt_free_buf(&ctx); 1137 return ret; 1138 } 1139 EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); 1140 #endif 1141 1142 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, 1143 size_t *lenp) 1144 { 1145 ssize_t tmp_ret = 0, ret; 1146 1147 if (dir->header.parent) { 1148 tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); 1149 if (tmp_ret < 0) 1150 return tmp_ret; 1151 } 1152 1153 ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); 1154 if (ret < 0) 1155 return ret; 1156 *bufp += ret; 1157 *lenp -= ret; 1158 ret += tmp_ret; 1159 1160 /* Avoid leading slash. */ 1161 if (!ret) 1162 return ret; 1163 1164 tmp_ret = strscpy(*bufp, "/", *lenp); 1165 if (tmp_ret < 0) 1166 return tmp_ret; 1167 *bufp += tmp_ret; 1168 *lenp -= tmp_ret; 1169 1170 return ret + tmp_ret; 1171 } 1172 1173 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, 1174 size_t, buf_len, u64, flags) 1175 { 1176 ssize_t tmp_ret = 0, ret; 1177 1178 if (!buf) 1179 return -EINVAL; 1180 1181 if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { 1182 if (!ctx->head) 1183 return -EINVAL; 1184 tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); 1185 if (tmp_ret < 0) 1186 return tmp_ret; 1187 } 1188 1189 ret = strscpy(buf, ctx->table->procname, buf_len); 1190 1191 return ret < 0 ? ret : tmp_ret + ret; 1192 } 1193 1194 static const struct bpf_func_proto bpf_sysctl_get_name_proto = { 1195 .func = bpf_sysctl_get_name, 1196 .gpl_only = false, 1197 .ret_type = RET_INTEGER, 1198 .arg1_type = ARG_PTR_TO_CTX, 1199 .arg2_type = ARG_PTR_TO_MEM, 1200 .arg3_type = ARG_CONST_SIZE, 1201 .arg4_type = ARG_ANYTHING, 1202 }; 1203 1204 static int copy_sysctl_value(char *dst, size_t dst_len, char *src, 1205 size_t src_len) 1206 { 1207 if (!dst) 1208 return -EINVAL; 1209 1210 if (!dst_len) 1211 return -E2BIG; 1212 1213 if (!src || !src_len) { 1214 memset(dst, 0, dst_len); 1215 return -EINVAL; 1216 } 1217 1218 memcpy(dst, src, min(dst_len, src_len)); 1219 1220 if (dst_len > src_len) { 1221 memset(dst + src_len, '\0', dst_len - src_len); 1222 return src_len; 1223 } 1224 1225 dst[dst_len - 1] = '\0'; 1226 1227 return -E2BIG; 1228 } 1229 1230 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, 1231 char *, buf, size_t, buf_len) 1232 { 1233 return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); 1234 } 1235 1236 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { 1237 .func = bpf_sysctl_get_current_value, 1238 .gpl_only = false, 1239 .ret_type = RET_INTEGER, 1240 .arg1_type = ARG_PTR_TO_CTX, 1241 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 1242 .arg3_type = ARG_CONST_SIZE, 1243 }; 1244 1245 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, 1246 size_t, buf_len) 1247 { 1248 if (!ctx->write) { 1249 if (buf && buf_len) 1250 memset(buf, '\0', buf_len); 1251 return -EINVAL; 1252 } 1253 return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); 1254 } 1255 1256 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { 1257 .func = bpf_sysctl_get_new_value, 1258 .gpl_only = false, 1259 .ret_type = RET_INTEGER, 1260 .arg1_type = ARG_PTR_TO_CTX, 1261 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 1262 .arg3_type = ARG_CONST_SIZE, 1263 }; 1264 1265 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, 1266 const char *, buf, size_t, buf_len) 1267 { 1268 if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) 1269 return -EINVAL; 1270 1271 if (buf_len > PAGE_SIZE - 1) 1272 return -E2BIG; 1273 1274 memcpy(ctx->new_val, buf, buf_len); 1275 ctx->new_len = buf_len; 1276 ctx->new_updated = 1; 1277 1278 return 0; 1279 } 1280 1281 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { 1282 .func = bpf_sysctl_set_new_value, 1283 .gpl_only = false, 1284 .ret_type = RET_INTEGER, 1285 .arg1_type = ARG_PTR_TO_CTX, 1286 .arg2_type = ARG_PTR_TO_MEM, 1287 .arg3_type = ARG_CONST_SIZE, 1288 }; 1289 1290 static const struct bpf_func_proto * 1291 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 1292 { 1293 switch (func_id) { 1294 case BPF_FUNC_strtol: 1295 return &bpf_strtol_proto; 1296 case BPF_FUNC_strtoul: 1297 return &bpf_strtoul_proto; 1298 case BPF_FUNC_sysctl_get_name: 1299 return &bpf_sysctl_get_name_proto; 1300 case BPF_FUNC_sysctl_get_current_value: 1301 return &bpf_sysctl_get_current_value_proto; 1302 case BPF_FUNC_sysctl_get_new_value: 1303 return &bpf_sysctl_get_new_value_proto; 1304 case BPF_FUNC_sysctl_set_new_value: 1305 return &bpf_sysctl_set_new_value_proto; 1306 default: 1307 return cgroup_base_func_proto(func_id, prog); 1308 } 1309 } 1310 1311 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, 1312 const struct bpf_prog *prog, 1313 struct bpf_insn_access_aux *info) 1314 { 1315 const int size_default = sizeof(__u32); 1316 1317 if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) 1318 return false; 1319 1320 switch (off) { 1321 case bpf_ctx_range(struct bpf_sysctl, write): 1322 if (type != BPF_READ) 1323 return false; 1324 bpf_ctx_record_field_size(info, size_default); 1325 return bpf_ctx_narrow_access_ok(off, size, size_default); 1326 case bpf_ctx_range(struct bpf_sysctl, file_pos): 1327 if (type == BPF_READ) { 1328 bpf_ctx_record_field_size(info, size_default); 1329 return bpf_ctx_narrow_access_ok(off, size, size_default); 1330 } else { 1331 return size == size_default; 1332 } 1333 default: 1334 return false; 1335 } 1336 } 1337 1338 static u32 sysctl_convert_ctx_access(enum bpf_access_type type, 1339 const struct bpf_insn *si, 1340 struct bpf_insn *insn_buf, 1341 struct bpf_prog *prog, u32 *target_size) 1342 { 1343 struct bpf_insn *insn = insn_buf; 1344 u32 read_size; 1345 1346 switch (si->off) { 1347 case offsetof(struct bpf_sysctl, write): 1348 *insn++ = BPF_LDX_MEM( 1349 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 1350 bpf_target_off(struct bpf_sysctl_kern, write, 1351 sizeof_field(struct bpf_sysctl_kern, 1352 write), 1353 target_size)); 1354 break; 1355 case offsetof(struct bpf_sysctl, file_pos): 1356 /* ppos is a pointer so it should be accessed via indirect 1357 * loads and stores. Also for stores additional temporary 1358 * register is used since neither src_reg nor dst_reg can be 1359 * overridden. 1360 */ 1361 if (type == BPF_WRITE) { 1362 int treg = BPF_REG_9; 1363 1364 if (si->src_reg == treg || si->dst_reg == treg) 1365 --treg; 1366 if (si->src_reg == treg || si->dst_reg == treg) 1367 --treg; 1368 *insn++ = BPF_STX_MEM( 1369 BPF_DW, si->dst_reg, treg, 1370 offsetof(struct bpf_sysctl_kern, tmp_reg)); 1371 *insn++ = BPF_LDX_MEM( 1372 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 1373 treg, si->dst_reg, 1374 offsetof(struct bpf_sysctl_kern, ppos)); 1375 *insn++ = BPF_STX_MEM( 1376 BPF_SIZEOF(u32), treg, si->src_reg, 1377 bpf_ctx_narrow_access_offset( 1378 0, sizeof(u32), sizeof(loff_t))); 1379 *insn++ = BPF_LDX_MEM( 1380 BPF_DW, treg, si->dst_reg, 1381 offsetof(struct bpf_sysctl_kern, tmp_reg)); 1382 } else { 1383 *insn++ = BPF_LDX_MEM( 1384 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 1385 si->dst_reg, si->src_reg, 1386 offsetof(struct bpf_sysctl_kern, ppos)); 1387 read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); 1388 *insn++ = BPF_LDX_MEM( 1389 BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 1390 bpf_ctx_narrow_access_offset( 1391 0, read_size, sizeof(loff_t))); 1392 } 1393 *target_size = sizeof(u32); 1394 break; 1395 } 1396 1397 return insn - insn_buf; 1398 } 1399 1400 const struct bpf_verifier_ops cg_sysctl_verifier_ops = { 1401 .get_func_proto = sysctl_func_proto, 1402 .is_valid_access = sysctl_is_valid_access, 1403 .convert_ctx_access = sysctl_convert_ctx_access, 1404 }; 1405 1406 const struct bpf_prog_ops cg_sysctl_prog_ops = { 1407 }; 1408 1409 static const struct bpf_func_proto * 1410 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 1411 { 1412 switch (func_id) { 1413 #ifdef CONFIG_NET 1414 case BPF_FUNC_sk_storage_get: 1415 return &bpf_sk_storage_get_proto; 1416 case BPF_FUNC_sk_storage_delete: 1417 return &bpf_sk_storage_delete_proto; 1418 #endif 1419 #ifdef CONFIG_INET 1420 case BPF_FUNC_tcp_sock: 1421 return &bpf_tcp_sock_proto; 1422 #endif 1423 default: 1424 return cgroup_base_func_proto(func_id, prog); 1425 } 1426 } 1427 1428 static bool cg_sockopt_is_valid_access(int off, int size, 1429 enum bpf_access_type type, 1430 const struct bpf_prog *prog, 1431 struct bpf_insn_access_aux *info) 1432 { 1433 const int size_default = sizeof(__u32); 1434 1435 if (off < 0 || off >= sizeof(struct bpf_sockopt)) 1436 return false; 1437 1438 if (off % size != 0) 1439 return false; 1440 1441 if (type == BPF_WRITE) { 1442 switch (off) { 1443 case offsetof(struct bpf_sockopt, retval): 1444 if (size != size_default) 1445 return false; 1446 return prog->expected_attach_type == 1447 BPF_CGROUP_GETSOCKOPT; 1448 case offsetof(struct bpf_sockopt, optname): 1449 /* fallthrough */ 1450 case offsetof(struct bpf_sockopt, level): 1451 if (size != size_default) 1452 return false; 1453 return prog->expected_attach_type == 1454 BPF_CGROUP_SETSOCKOPT; 1455 case offsetof(struct bpf_sockopt, optlen): 1456 return size == size_default; 1457 default: 1458 return false; 1459 } 1460 } 1461 1462 switch (off) { 1463 case offsetof(struct bpf_sockopt, sk): 1464 if (size != sizeof(__u64)) 1465 return false; 1466 info->reg_type = PTR_TO_SOCKET; 1467 break; 1468 case offsetof(struct bpf_sockopt, optval): 1469 if (size != sizeof(__u64)) 1470 return false; 1471 info->reg_type = PTR_TO_PACKET; 1472 break; 1473 case offsetof(struct bpf_sockopt, optval_end): 1474 if (size != sizeof(__u64)) 1475 return false; 1476 info->reg_type = PTR_TO_PACKET_END; 1477 break; 1478 case offsetof(struct bpf_sockopt, retval): 1479 if (size != size_default) 1480 return false; 1481 return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; 1482 default: 1483 if (size != size_default) 1484 return false; 1485 break; 1486 } 1487 return true; 1488 } 1489 1490 #define CG_SOCKOPT_ACCESS_FIELD(T, F) \ 1491 T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ 1492 si->dst_reg, si->src_reg, \ 1493 offsetof(struct bpf_sockopt_kern, F)) 1494 1495 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, 1496 const struct bpf_insn *si, 1497 struct bpf_insn *insn_buf, 1498 struct bpf_prog *prog, 1499 u32 *target_size) 1500 { 1501 struct bpf_insn *insn = insn_buf; 1502 1503 switch (si->off) { 1504 case offsetof(struct bpf_sockopt, sk): 1505 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); 1506 break; 1507 case offsetof(struct bpf_sockopt, level): 1508 if (type == BPF_WRITE) 1509 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); 1510 else 1511 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); 1512 break; 1513 case offsetof(struct bpf_sockopt, optname): 1514 if (type == BPF_WRITE) 1515 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); 1516 else 1517 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); 1518 break; 1519 case offsetof(struct bpf_sockopt, optlen): 1520 if (type == BPF_WRITE) 1521 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); 1522 else 1523 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); 1524 break; 1525 case offsetof(struct bpf_sockopt, retval): 1526 if (type == BPF_WRITE) 1527 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); 1528 else 1529 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); 1530 break; 1531 case offsetof(struct bpf_sockopt, optval): 1532 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); 1533 break; 1534 case offsetof(struct bpf_sockopt, optval_end): 1535 *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); 1536 break; 1537 } 1538 1539 return insn - insn_buf; 1540 } 1541 1542 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, 1543 bool direct_write, 1544 const struct bpf_prog *prog) 1545 { 1546 /* Nothing to do for sockopt argument. The data is kzalloc'ated. 1547 */ 1548 return 0; 1549 } 1550 1551 const struct bpf_verifier_ops cg_sockopt_verifier_ops = { 1552 .get_func_proto = cg_sockopt_func_proto, 1553 .is_valid_access = cg_sockopt_is_valid_access, 1554 .convert_ctx_access = cg_sockopt_convert_ctx_access, 1555 .gen_prologue = cg_sockopt_get_prologue, 1556 }; 1557 1558 const struct bpf_prog_ops cg_sockopt_prog_ops = { 1559 }; 1560