1 /* 2 * Functions to manage eBPF programs attached to cgroups 3 * 4 * Copyright (c) 2016 Daniel Mack 5 * 6 * This file is subject to the terms and conditions of version 2 of the GNU 7 * General Public License. See the file COPYING in the main directory of the 8 * Linux distribution for more details. 9 */ 10 11 #include <linux/kernel.h> 12 #include <linux/atomic.h> 13 #include <linux/cgroup.h> 14 #include <linux/slab.h> 15 #include <linux/bpf.h> 16 #include <linux/bpf-cgroup.h> 17 #include <net/sock.h> 18 19 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); 20 EXPORT_SYMBOL(cgroup_bpf_enabled_key); 21 22 /** 23 * cgroup_bpf_put() - put references of all bpf programs 24 * @cgrp: the cgroup to modify 25 */ 26 void cgroup_bpf_put(struct cgroup *cgrp) 27 { 28 unsigned int type; 29 30 for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { 31 struct list_head *progs = &cgrp->bpf.progs[type]; 32 struct bpf_prog_list *pl, *tmp; 33 34 list_for_each_entry_safe(pl, tmp, progs, node) { 35 list_del(&pl->node); 36 bpf_prog_put(pl->prog); 37 kfree(pl); 38 static_branch_dec(&cgroup_bpf_enabled_key); 39 } 40 bpf_prog_array_free(cgrp->bpf.effective[type]); 41 } 42 } 43 44 /* count number of elements in the list. 45 * it's slow but the list cannot be long 46 */ 47 static u32 prog_list_length(struct list_head *head) 48 { 49 struct bpf_prog_list *pl; 50 u32 cnt = 0; 51 52 list_for_each_entry(pl, head, node) { 53 if (!pl->prog) 54 continue; 55 cnt++; 56 } 57 return cnt; 58 } 59 60 /* if parent has non-overridable prog attached, 61 * disallow attaching new programs to the descendent cgroup. 62 * if parent has overridable or multi-prog, allow attaching 63 */ 64 static bool hierarchy_allows_attach(struct cgroup *cgrp, 65 enum bpf_attach_type type, 66 u32 new_flags) 67 { 68 struct cgroup *p; 69 70 p = cgroup_parent(cgrp); 71 if (!p) 72 return true; 73 do { 74 u32 flags = p->bpf.flags[type]; 75 u32 cnt; 76 77 if (flags & BPF_F_ALLOW_MULTI) 78 return true; 79 cnt = prog_list_length(&p->bpf.progs[type]); 80 WARN_ON_ONCE(cnt > 1); 81 if (cnt == 1) 82 return !!(flags & BPF_F_ALLOW_OVERRIDE); 83 p = cgroup_parent(p); 84 } while (p); 85 return true; 86 } 87 88 /* compute a chain of effective programs for a given cgroup: 89 * start from the list of programs in this cgroup and add 90 * all parent programs. 91 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding 92 * to programs in this cgroup 93 */ 94 static int compute_effective_progs(struct cgroup *cgrp, 95 enum bpf_attach_type type, 96 struct bpf_prog_array __rcu **array) 97 { 98 struct bpf_prog_array __rcu *progs; 99 struct bpf_prog_list *pl; 100 struct cgroup *p = cgrp; 101 int cnt = 0; 102 103 /* count number of effective programs by walking parents */ 104 do { 105 if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 106 cnt += prog_list_length(&p->bpf.progs[type]); 107 p = cgroup_parent(p); 108 } while (p); 109 110 progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); 111 if (!progs) 112 return -ENOMEM; 113 114 /* populate the array with effective progs */ 115 cnt = 0; 116 p = cgrp; 117 do { 118 if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) 119 list_for_each_entry(pl, 120 &p->bpf.progs[type], node) { 121 if (!pl->prog) 122 continue; 123 rcu_dereference_protected(progs, 1)-> 124 progs[cnt++] = pl->prog; 125 } 126 p = cgroup_parent(p); 127 } while (p); 128 129 *array = progs; 130 return 0; 131 } 132 133 static void activate_effective_progs(struct cgroup *cgrp, 134 enum bpf_attach_type type, 135 struct bpf_prog_array __rcu *array) 136 { 137 struct bpf_prog_array __rcu *old_array; 138 139 old_array = xchg(&cgrp->bpf.effective[type], array); 140 /* free prog array after grace period, since __cgroup_bpf_run_*() 141 * might be still walking the array 142 */ 143 bpf_prog_array_free(old_array); 144 } 145 146 /** 147 * cgroup_bpf_inherit() - inherit effective programs from parent 148 * @cgrp: the cgroup to modify 149 */ 150 int cgroup_bpf_inherit(struct cgroup *cgrp) 151 { 152 /* has to use marco instead of const int, since compiler thinks 153 * that array below is variable length 154 */ 155 #define NR ARRAY_SIZE(cgrp->bpf.effective) 156 struct bpf_prog_array __rcu *arrays[NR] = {}; 157 int i; 158 159 for (i = 0; i < NR; i++) 160 INIT_LIST_HEAD(&cgrp->bpf.progs[i]); 161 162 for (i = 0; i < NR; i++) 163 if (compute_effective_progs(cgrp, i, &arrays[i])) 164 goto cleanup; 165 166 for (i = 0; i < NR; i++) 167 activate_effective_progs(cgrp, i, arrays[i]); 168 169 return 0; 170 cleanup: 171 for (i = 0; i < NR; i++) 172 bpf_prog_array_free(arrays[i]); 173 return -ENOMEM; 174 } 175 176 #define BPF_CGROUP_MAX_PROGS 64 177 178 /** 179 * __cgroup_bpf_attach() - Attach the program to a cgroup, and 180 * propagate the change to descendants 181 * @cgrp: The cgroup which descendants to traverse 182 * @prog: A program to attach 183 * @type: Type of attach operation 184 * 185 * Must be called with cgroup_mutex held. 186 */ 187 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, 188 enum bpf_attach_type type, u32 flags) 189 { 190 struct list_head *progs = &cgrp->bpf.progs[type]; 191 struct bpf_prog *old_prog = NULL; 192 struct cgroup_subsys_state *css; 193 struct bpf_prog_list *pl; 194 bool pl_was_allocated; 195 u32 old_flags; 196 int err; 197 198 if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) 199 /* invalid combination */ 200 return -EINVAL; 201 202 if (!hierarchy_allows_attach(cgrp, type, flags)) 203 return -EPERM; 204 205 if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) 206 /* Disallow attaching non-overridable on top 207 * of existing overridable in this cgroup. 208 * Disallow attaching multi-prog if overridable or none 209 */ 210 return -EPERM; 211 212 if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) 213 return -E2BIG; 214 215 if (flags & BPF_F_ALLOW_MULTI) { 216 list_for_each_entry(pl, progs, node) 217 if (pl->prog == prog) 218 /* disallow attaching the same prog twice */ 219 return -EINVAL; 220 221 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 222 if (!pl) 223 return -ENOMEM; 224 pl_was_allocated = true; 225 pl->prog = prog; 226 list_add_tail(&pl->node, progs); 227 } else { 228 if (list_empty(progs)) { 229 pl = kmalloc(sizeof(*pl), GFP_KERNEL); 230 if (!pl) 231 return -ENOMEM; 232 pl_was_allocated = true; 233 list_add_tail(&pl->node, progs); 234 } else { 235 pl = list_first_entry(progs, typeof(*pl), node); 236 old_prog = pl->prog; 237 pl_was_allocated = false; 238 } 239 pl->prog = prog; 240 } 241 242 old_flags = cgrp->bpf.flags[type]; 243 cgrp->bpf.flags[type] = flags; 244 245 /* allocate and recompute effective prog arrays */ 246 css_for_each_descendant_pre(css, &cgrp->self) { 247 struct cgroup *desc = container_of(css, struct cgroup, self); 248 249 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 250 if (err) 251 goto cleanup; 252 } 253 254 /* all allocations were successful. Activate all prog arrays */ 255 css_for_each_descendant_pre(css, &cgrp->self) { 256 struct cgroup *desc = container_of(css, struct cgroup, self); 257 258 activate_effective_progs(desc, type, desc->bpf.inactive); 259 desc->bpf.inactive = NULL; 260 } 261 262 static_branch_inc(&cgroup_bpf_enabled_key); 263 if (old_prog) { 264 bpf_prog_put(old_prog); 265 static_branch_dec(&cgroup_bpf_enabled_key); 266 } 267 return 0; 268 269 cleanup: 270 /* oom while computing effective. Free all computed effective arrays 271 * since they were not activated 272 */ 273 css_for_each_descendant_pre(css, &cgrp->self) { 274 struct cgroup *desc = container_of(css, struct cgroup, self); 275 276 bpf_prog_array_free(desc->bpf.inactive); 277 desc->bpf.inactive = NULL; 278 } 279 280 /* and cleanup the prog list */ 281 pl->prog = old_prog; 282 if (pl_was_allocated) { 283 list_del(&pl->node); 284 kfree(pl); 285 } 286 return err; 287 } 288 289 /** 290 * __cgroup_bpf_detach() - Detach the program from a cgroup, and 291 * propagate the change to descendants 292 * @cgrp: The cgroup which descendants to traverse 293 * @prog: A program to detach or NULL 294 * @type: Type of detach operation 295 * 296 * Must be called with cgroup_mutex held. 297 */ 298 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 299 enum bpf_attach_type type, u32 unused_flags) 300 { 301 struct list_head *progs = &cgrp->bpf.progs[type]; 302 u32 flags = cgrp->bpf.flags[type]; 303 struct bpf_prog *old_prog = NULL; 304 struct cgroup_subsys_state *css; 305 struct bpf_prog_list *pl; 306 int err; 307 308 if (flags & BPF_F_ALLOW_MULTI) { 309 if (!prog) 310 /* to detach MULTI prog the user has to specify valid FD 311 * of the program to be detached 312 */ 313 return -EINVAL; 314 } else { 315 if (list_empty(progs)) 316 /* report error when trying to detach and nothing is attached */ 317 return -ENOENT; 318 } 319 320 if (flags & BPF_F_ALLOW_MULTI) { 321 /* find the prog and detach it */ 322 list_for_each_entry(pl, progs, node) { 323 if (pl->prog != prog) 324 continue; 325 old_prog = prog; 326 /* mark it deleted, so it's ignored while 327 * recomputing effective 328 */ 329 pl->prog = NULL; 330 break; 331 } 332 if (!old_prog) 333 return -ENOENT; 334 } else { 335 /* to maintain backward compatibility NONE and OVERRIDE cgroups 336 * allow detaching with invalid FD (prog==NULL) 337 */ 338 pl = list_first_entry(progs, typeof(*pl), node); 339 old_prog = pl->prog; 340 pl->prog = NULL; 341 } 342 343 /* allocate and recompute effective prog arrays */ 344 css_for_each_descendant_pre(css, &cgrp->self) { 345 struct cgroup *desc = container_of(css, struct cgroup, self); 346 347 err = compute_effective_progs(desc, type, &desc->bpf.inactive); 348 if (err) 349 goto cleanup; 350 } 351 352 /* all allocations were successful. Activate all prog arrays */ 353 css_for_each_descendant_pre(css, &cgrp->self) { 354 struct cgroup *desc = container_of(css, struct cgroup, self); 355 356 activate_effective_progs(desc, type, desc->bpf.inactive); 357 desc->bpf.inactive = NULL; 358 } 359 360 /* now can actually delete it from this cgroup list */ 361 list_del(&pl->node); 362 kfree(pl); 363 if (list_empty(progs)) 364 /* last program was detached, reset flags to zero */ 365 cgrp->bpf.flags[type] = 0; 366 367 bpf_prog_put(old_prog); 368 static_branch_dec(&cgroup_bpf_enabled_key); 369 return 0; 370 371 cleanup: 372 /* oom while computing effective. Free all computed effective arrays 373 * since they were not activated 374 */ 375 css_for_each_descendant_pre(css, &cgrp->self) { 376 struct cgroup *desc = container_of(css, struct cgroup, self); 377 378 bpf_prog_array_free(desc->bpf.inactive); 379 desc->bpf.inactive = NULL; 380 } 381 382 /* and restore back old_prog */ 383 pl->prog = old_prog; 384 return err; 385 } 386 387 /** 388 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 389 * @sk: The socket sending or receiving traffic 390 * @skb: The skb that is being sent or received 391 * @type: The type of program to be exectuted 392 * 393 * If no socket is passed, or the socket is not of type INET or INET6, 394 * this function does nothing and returns 0. 395 * 396 * The program type passed in via @type must be suitable for network 397 * filtering. No further check is performed to assert that. 398 * 399 * This function will return %-EPERM if any if an attached program was found 400 * and if it returned != 1 during execution. In all other cases, 0 is returned. 401 */ 402 int __cgroup_bpf_run_filter_skb(struct sock *sk, 403 struct sk_buff *skb, 404 enum bpf_attach_type type) 405 { 406 unsigned int offset = skb->data - skb_network_header(skb); 407 struct sock *save_sk; 408 struct cgroup *cgrp; 409 int ret; 410 411 if (!sk || !sk_fullsock(sk)) 412 return 0; 413 414 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 415 return 0; 416 417 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 418 save_sk = skb->sk; 419 skb->sk = sk; 420 __skb_push(skb, offset); 421 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 422 bpf_prog_run_save_cb); 423 __skb_pull(skb, offset); 424 skb->sk = save_sk; 425 return ret == 1 ? 0 : -EPERM; 426 } 427 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 428 429 /** 430 * __cgroup_bpf_run_filter_sk() - Run a program on a sock 431 * @sk: sock structure to manipulate 432 * @type: The type of program to be exectuted 433 * 434 * socket is passed is expected to be of type INET or INET6. 435 * 436 * The program type passed in via @type must be suitable for sock 437 * filtering. No further check is performed to assert that. 438 * 439 * This function will return %-EPERM if any if an attached program was found 440 * and if it returned != 1 during execution. In all other cases, 0 is returned. 441 */ 442 int __cgroup_bpf_run_filter_sk(struct sock *sk, 443 enum bpf_attach_type type) 444 { 445 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 446 int ret; 447 448 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); 449 return ret == 1 ? 0 : -EPERM; 450 } 451 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 452 453 /** 454 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 455 * @sk: socket to get cgroup from 456 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 457 * sk with connection information (IP addresses, etc.) May not contain 458 * cgroup info if it is a req sock. 459 * @type: The type of program to be exectuted 460 * 461 * socket passed is expected to be of type INET or INET6. 462 * 463 * The program type passed in via @type must be suitable for sock_ops 464 * filtering. No further check is performed to assert that. 465 * 466 * This function will return %-EPERM if any if an attached program was found 467 * and if it returned != 1 during execution. In all other cases, 0 is returned. 468 */ 469 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, 470 struct bpf_sock_ops_kern *sock_ops, 471 enum bpf_attach_type type) 472 { 473 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 474 int ret; 475 476 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, 477 BPF_PROG_RUN); 478 return ret == 1 ? 0 : -EPERM; 479 } 480 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); 481