1 /* 2 * Functions to manage eBPF programs attached to cgroups 3 * 4 * Copyright (c) 2016 Daniel Mack 5 * 6 * This file is subject to the terms and conditions of version 2 of the GNU 7 * General Public License. See the file COPYING in the main directory of the 8 * Linux distribution for more details. 9 */ 10 11 #include <linux/kernel.h> 12 #include <linux/atomic.h> 13 #include <linux/cgroup.h> 14 #include <linux/slab.h> 15 #include <linux/bpf.h> 16 #include <linux/bpf-cgroup.h> 17 #include <net/sock.h> 18 19 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 20 EXPORT_SYMBOL(cgroup_bpf_enabled_key); 21 22 /** 23 * cgroup_bpf_put() - put references of all bpf programs 24 * @cgrp: the cgroup to modify 25 */ 26 void cgroup_bpf_put(struct cgroup *cgrp) 27 { 28 unsigned int type; 29 30 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { 31 struct list_head *progs = &cgrp->bpf.progs[type]; 32 struct bpf_prog_list *pl, *tmp; 33 34 list_for_each_entry_safe(pl, tmp, progs, node) { 35 list_del(&pl->node); 36 bpf_prog_put(pl->prog); 37 kfree(pl); 38 static_branch_dec(&cgroup_bpf_enabled_key); 39 } 40 bpf_prog_array_free(cgrp->bpf.effective[type]); 41 } 42 } 43 44 /* count number of elements in the list. 45 * it's slow but the list cannot be long 46 */ 47 static u32 prog_list_length(struct list_head *head) 48 { 49 struct bpf_prog_list *pl; 50 u32 cnt = 0; 51 52 list_for_each_entry(pl, head, node) { 53 if (!pl->prog) 54 continue; 55 cnt++; 56 } 57 return cnt; 58 } 59 60 /* if parent has non-overridable prog attached, 61 * disallow attaching new programs to the descendent cgroup. 62 * if parent has overridable or multi-prog, allow attaching 63 */ 64 static bool hierarchy_allows_attach(struct cgroup *cgrp, 65 enum bpf_attach_type type, 66 u32 new_flags) 67 { 68 struct cgroup *p; 69 70 p = cgroup_parent(cgrp); 71 if (!p) 72 return true; 73 do { 74 u32 flags = p->bpf.flags[type]; 75 u32 cnt; 76 77 if (flags & BPF_F_ALLOW_MULTI) 78 return true; 79 cnt = prog_list_length(&p->bpf.progs[type]); 80 WARN_ON_ONCE(cnt > 1); 81 if (cnt == 1) 82 return !!(flags & BPF_F_ALLOW_OVERRIDE); 83 p = cgroup_parent(p); 84 } while (p); 85 return true; 86 } 87 88 /* compute a chain of effective programs for a given cgroup: 89 * start from the list of programs in this cgroup and add 90 * all parent programs. 91 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding 92 * to programs in this cgroup 93 */ 94 static int compute_effective_progs(struct cgroup *cgrp, 95 enum bpf_attach_type type, 96 struct bpf_prog_array __rcu **array) 97 { 98 struct bpf_prog_array __rcu *progs; 99 struct bpf_prog_list *pl; 100 struct cgroup *p = cgrp; 101 int cnt = 0; 102 103 /* count number of effective programs by walking parents */ 104 do { 105 if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 106 cnt += prog_list_length(&p->bpf.progs[type]); 107 p = cgroup_parent(p); 108 } while (p); 109 110 progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); 111 if (!progs) 112 return -ENOMEM; 113 114 /* populate the array with effective progs */ 115 cnt = 0; 116 p = cgrp; 117 do { 118 if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 119 list_for_each_entry(pl, 120 &p->bpf.progs[type], node) { 121 if (!pl->prog) 122 continue; 123 rcu_dereference_protected(progs, 1)-> 124 progs[cnt++] = pl->prog; 125 } 126 p = cgroup_parent(p); 127 } while (p); 128 129 *array = progs; 130 return 0; 131 } 132 133 static void activate_effective_progs(struct cgroup *cgrp, 134 enum bpf_attach_type type, 135 struct bpf_prog_array __rcu *array) 136 { 137 struct bpf_prog_array __rcu *old_array; 138 139 old_array = xchg(&cgrp->bpf.effective[type], array); 140 /* free prog array after grace period, since __cgroup_bpf_run_*() 141 * might be still walking the array 142 */ 143 bpf_prog_array_free(old_array); 144 } 145 146 /** 147 * cgroup_bpf_inherit() - inherit effective programs from parent 148 * @cgrp: the cgroup to modify 149 */ 150 int cgroup_bpf_inherit(struct cgroup *cgrp) 151 { 152 /* has to use marco instead of const int, since compiler thinks 153 * that array below is variable length 154 */ 155 #define NR ARRAY_SIZE(cgrp->bpf.effective) 156 struct bpf_prog_array __rcu *arrays[NR] = {}; 157 int i; 158 159 for (i = 0; i < NR; i++) 160 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 161 162 for (i = 0; i < NR; i++) 163 if (compute_effective_progs(cgrp, i, &arrays[i])) 164 goto cleanup; 165 166 for (i = 0; i < NR; i++) 167 activate_effective_progs(cgrp, i, arrays[i]); 168 169 return 0; 170 cleanup: 171 for (i = 0; i < NR; i++) 172 bpf_prog_array_free(arrays[i]); 173 return -ENOMEM; 174 } 175 176 #define BPF_CGROUP_MAX_PROGS 64 177 178 /** 179 * __cgroup_bpf_attach() - Attach the program to a cgroup, and 180 * propagate the change to descendants 181 * @cgrp: The cgroup which descendants to traverse 182 * @prog: A program to attach 183 * @type: Type of attach operation 184 * 185 * Must be called with cgroup_mutex held. 186 */ 187 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, 188 enum bpf_attach_type type, u32 flags) 189 { 190 struct list_head *progs = &cgrp->bpf.progs[type]; 191 struct bpf_prog *old_prog = NULL; 192 struct cgroup_subsys_state *css; 193 struct bpf_prog_list *pl; 194 bool pl_was_allocated; 195 int err; 196 197 if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) 198 /* invalid combination */ 199 return -EINVAL; 200 201 if (!hierarchy_allows_attach(cgrp, type, flags)) 202 return -EPERM; 203 204 if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) 205 /* Disallow attaching non-overridable on top 206 * of existing overridable in this cgroup. 207 * Disallow attaching multi-prog if overridable or none 208 */ 209 return -EPERM; 210 211 if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) 212 return -E2BIG; 213 214 if (flags & BPF_F_ALLOW_MULTI) { 215 list_for_each_entry(pl, progs, node) 216 if (pl->prog == prog) 217 /* disallow attaching the same prog twice */ 218 return -EINVAL; 219 220 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 221 if (!pl) 222 return -ENOMEM; 223 pl_was_allocated = true; 224 pl->prog = prog; 225 list_add_tail(&pl->node, progs); 226 } else { 227 if (list_empty(progs)) { 228 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 229 if (!pl) 230 return -ENOMEM; 231 pl_was_allocated = true; 232 list_add_tail(&pl->node, progs); 233 } else { 234 pl = list_first_entry(progs, typeof(*pl), node); 235 old_prog = pl->prog; 236 pl_was_allocated = false; 237 } 238 pl->prog = prog; 239 } 240 241 cgrp->bpf.flags[type] = flags; 242 243 /* allocate and recompute effective prog arrays */ 244 css_for_each_descendant_pre(css, &cgrp->self) { 245 struct cgroup *desc = container_of(css, struct cgroup, self); 246 247 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 248 if (err) 249 goto cleanup; 250 } 251 252 /* all allocations were successful. Activate all prog arrays */ 253 css_for_each_descendant_pre(css, &cgrp->self) { 254 struct cgroup *desc = container_of(css, struct cgroup, self); 255 256 activate_effective_progs(desc, type, desc->bpf.inactive); 257 desc->bpf.inactive = NULL; 258 } 259 260 static_branch_inc(&cgroup_bpf_enabled_key); 261 if (old_prog) { 262 bpf_prog_put(old_prog); 263 static_branch_dec(&cgroup_bpf_enabled_key); 264 } 265 return 0; 266 267 cleanup: 268 /* oom while computing effective. Free all computed effective arrays 269 * since they were not activated 270 */ 271 css_for_each_descendant_pre(css, &cgrp->self) { 272 struct cgroup *desc = container_of(css, struct cgroup, self); 273 274 bpf_prog_array_free(desc->bpf.inactive); 275 desc->bpf.inactive = NULL; 276 } 277 278 /* and cleanup the prog list */ 279 pl->prog = old_prog; 280 if (pl_was_allocated) { 281 list_del(&pl->node); 282 kfree(pl); 283 } 284 return err; 285 } 286 287 /** 288 * __cgroup_bpf_detach() - Detach the program from a cgroup, and 289 * propagate the change to descendants 290 * @cgrp: The cgroup which descendants to traverse 291 * @prog: A program to detach or NULL 292 * @type: Type of detach operation 293 * 294 * Must be called with cgroup_mutex held. 295 */ 296 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 297 enum bpf_attach_type type, u32 unused_flags) 298 { 299 struct list_head *progs = &cgrp->bpf.progs[type]; 300 u32 flags = cgrp->bpf.flags[type]; 301 struct bpf_prog *old_prog = NULL; 302 struct cgroup_subsys_state *css; 303 struct bpf_prog_list *pl; 304 int err; 305 306 if (flags & BPF_F_ALLOW_MULTI) { 307 if (!prog) 308 /* to detach MULTI prog the user has to specify valid FD 309 * of the program to be detached 310 */ 311 return -EINVAL; 312 } else { 313 if (list_empty(progs)) 314 /* report error when trying to detach and nothing is attached */ 315 return -ENOENT; 316 } 317 318 if (flags & BPF_F_ALLOW_MULTI) { 319 /* find the prog and detach it */ 320 list_for_each_entry(pl, progs, node) { 321 if (pl->prog != prog) 322 continue; 323 old_prog = prog; 324 /* mark it deleted, so it's ignored while 325 * recomputing effective 326 */ 327 pl->prog = NULL; 328 break; 329 } 330 if (!old_prog) 331 return -ENOENT; 332 } else { 333 /* to maintain backward compatibility NONE and OVERRIDE cgroups 334 * allow detaching with invalid FD (prog==NULL) 335 */ 336 pl = list_first_entry(progs, typeof(*pl), node); 337 old_prog = pl->prog; 338 pl->prog = NULL; 339 } 340 341 /* allocate and recompute effective prog arrays */ 342 css_for_each_descendant_pre(css, &cgrp->self) { 343 struct cgroup *desc = container_of(css, struct cgroup, self); 344 345 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 346 if (err) 347 goto cleanup; 348 } 349 350 /* all allocations were successful. Activate all prog arrays */ 351 css_for_each_descendant_pre(css, &cgrp->self) { 352 struct cgroup *desc = container_of(css, struct cgroup, self); 353 354 activate_effective_progs(desc, type, desc->bpf.inactive); 355 desc->bpf.inactive = NULL; 356 } 357 358 /* now can actually delete it from this cgroup list */ 359 list_del(&pl->node); 360 kfree(pl); 361 if (list_empty(progs)) 362 /* last program was detached, reset flags to zero */ 363 cgrp->bpf.flags[type] = 0; 364 365 bpf_prog_put(old_prog); 366 static_branch_dec(&cgroup_bpf_enabled_key); 367 return 0; 368 369 cleanup: 370 /* oom while computing effective. Free all computed effective arrays 371 * since they were not activated 372 */ 373 css_for_each_descendant_pre(css, &cgrp->self) { 374 struct cgroup *desc = container_of(css, struct cgroup, self); 375 376 bpf_prog_array_free(desc->bpf.inactive); 377 desc->bpf.inactive = NULL; 378 } 379 380 /* and restore back old_prog */ 381 pl->prog = old_prog; 382 return err; 383 } 384 385 /* Must be called with cgroup_mutex held to avoid races. */ 386 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, 387 union bpf_attr __user *uattr) 388 { 389 __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 390 enum bpf_attach_type type = attr->query.attach_type; 391 struct list_head *progs = &cgrp->bpf.progs[type]; 392 u32 flags = cgrp->bpf.flags[type]; 393 int cnt, ret = 0, i; 394 395 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) 396 cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); 397 else 398 cnt = prog_list_length(progs); 399 400 if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) 401 return -EFAULT; 402 if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) 403 return -EFAULT; 404 if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) 405 /* return early if user requested only program count + flags */ 406 return 0; 407 if (attr->query.prog_cnt < cnt) { 408 cnt = attr->query.prog_cnt; 409 ret = -ENOSPC; 410 } 411 412 if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { 413 return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], 414 prog_ids, cnt); 415 } else { 416 struct bpf_prog_list *pl; 417 u32 id; 418 419 i = 0; 420 list_for_each_entry(pl, progs, node) { 421 id = pl->prog->aux->id; 422 if (copy_to_user(prog_ids + i, &id, sizeof(id))) 423 return -EFAULT; 424 if (++i == cnt) 425 break; 426 } 427 } 428 return ret; 429 } 430 431 /** 432 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 433 * @sk: The socket sending or receiving traffic 434 * @skb: The skb that is being sent or received 435 * @type: The type of program to be exectuted 436 * 437 * If no socket is passed, or the socket is not of type INET or INET6, 438 * this function does nothing and returns 0. 439 * 440 * The program type passed in via @type must be suitable for network 441 * filtering. No further check is performed to assert that. 442 * 443 * This function will return %-EPERM if any if an attached program was found 444 * and if it returned != 1 during execution. In all other cases, 0 is returned. 445 */ 446 int __cgroup_bpf_run_filter_skb(struct sock *sk, 447 struct sk_buff *skb, 448 enum bpf_attach_type type) 449 { 450 unsigned int offset = skb->data - skb_network_header(skb); 451 struct sock *save_sk; 452 struct cgroup *cgrp; 453 int ret; 454 455 if (!sk || !sk_fullsock(sk)) 456 return 0; 457 458 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 459 return 0; 460 461 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 462 save_sk = skb->sk; 463 skb->sk = sk; 464 __skb_push(skb, offset); 465 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 466 bpf_prog_run_save_cb); 467 __skb_pull(skb, offset); 468 skb->sk = save_sk; 469 return ret == 1 ? 0 : -EPERM; 470 } 471 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 472 473 /** 474 * __cgroup_bpf_run_filter_sk() - Run a program on a sock 475 * @sk: sock structure to manipulate 476 * @type: The type of program to be exectuted 477 * 478 * socket is passed is expected to be of type INET or INET6. 479 * 480 * The program type passed in via @type must be suitable for sock 481 * filtering. No further check is performed to assert that. 482 * 483 * This function will return %-EPERM if any if an attached program was found 484 * and if it returned != 1 during execution. In all other cases, 0 is returned. 485 */ 486 int __cgroup_bpf_run_filter_sk(struct sock *sk, 487 enum bpf_attach_type type) 488 { 489 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 490 int ret; 491 492 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); 493 return ret == 1 ? 0 : -EPERM; 494 } 495 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 496 497 /** 498 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 499 * @sk: socket to get cgroup from 500 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 501 * sk with connection information (IP addresses, etc.) May not contain 502 * cgroup info if it is a req sock. 503 * @type: The type of program to be exectuted 504 * 505 * socket passed is expected to be of type INET or INET6. 506 * 507 * The program type passed in via @type must be suitable for sock_ops 508 * filtering. No further check is performed to assert that. 509 * 510 * This function will return %-EPERM if any if an attached program was found 511 * and if it returned != 1 during execution. In all other cases, 0 is returned. 512 */ 513 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, 514 struct bpf_sock_ops_kern *sock_ops, 515 enum bpf_attach_type type) 516 { 517 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 518 int ret; 519 520 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, 521 BPF_PROG_RUN); 522 return ret == 1 ? 0 : -EPERM; 523 } 524 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); 525 526 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, 527 short access, enum bpf_attach_type type) 528 { 529 struct cgroup *cgrp; 530 struct bpf_cgroup_dev_ctx ctx = { 531 .access_type = (access << 16) | dev_type, 532 .major = major, 533 .minor = minor, 534 }; 535 int allow = 1; 536 537 rcu_read_lock(); 538 cgrp = task_dfl_cgroup(current); 539 allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, 540 BPF_PROG_RUN); 541 rcu_read_unlock(); 542 543 return !allow; 544 } 545 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); 546 547 static const struct bpf_func_proto * 548 cgroup_dev_func_proto(enum bpf_func_id func_id) 549 { 550 switch (func_id) { 551 case BPF_FUNC_map_lookup_elem: 552 return &bpf_map_lookup_elem_proto; 553 case BPF_FUNC_map_update_elem: 554 return &bpf_map_update_elem_proto; 555 case BPF_FUNC_map_delete_elem: 556 return &bpf_map_delete_elem_proto; 557 case BPF_FUNC_get_current_uid_gid: 558 return &bpf_get_current_uid_gid_proto; 559 case BPF_FUNC_trace_printk: 560 if (capable(CAP_SYS_ADMIN)) 561 return bpf_get_trace_printk_proto(); 562 default: 563 return NULL; 564 } 565 } 566 567 static bool cgroup_dev_is_valid_access(int off, int size, 568 enum bpf_access_type type, 569 struct bpf_insn_access_aux *info) 570 { 571 const int size_default = sizeof(__u32); 572 573 if (type == BPF_WRITE) 574 return false; 575 576 if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) 577 return false; 578 /* The verifier guarantees that size > 0. */ 579 if (off % size != 0) 580 return false; 581 582 switch (off) { 583 case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): 584 bpf_ctx_record_field_size(info, size_default); 585 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 586 return false; 587 break; 588 default: 589 if (size != size_default) 590 return false; 591 } 592 593 return true; 594 } 595 596 const struct bpf_prog_ops cg_dev_prog_ops = { 597 }; 598 599 const struct bpf_verifier_ops cg_dev_verifier_ops = { 600 .get_func_proto = cgroup_dev_func_proto, 601 .is_valid_access = cgroup_dev_is_valid_access, 602 }; 603