xref: /openbmc/linux/kernel/bpf/cgroup.c (revision 69fd337a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Functions to manage eBPF programs attached to cgroups
4  *
5  * Copyright (c) 2016 Daniel Mack
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <linux/bpf_lsm.h>
18 #include <linux/bpf_verifier.h>
19 #include <net/sock.h>
20 #include <net/bpf_sk_storage.h>
21 
22 #include "../cgroup/cgroup-internal.h"
23 
24 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
25 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
26 
27 /* __always_inline is necessary to prevent indirect call through run_prog
28  * function pointer.
29  */
30 static __always_inline int
31 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
32 		      enum cgroup_bpf_attach_type atype,
33 		      const void *ctx, bpf_prog_run_fn run_prog,
34 		      int retval, u32 *ret_flags)
35 {
36 	const struct bpf_prog_array_item *item;
37 	const struct bpf_prog *prog;
38 	const struct bpf_prog_array *array;
39 	struct bpf_run_ctx *old_run_ctx;
40 	struct bpf_cg_run_ctx run_ctx;
41 	u32 func_ret;
42 
43 	run_ctx.retval = retval;
44 	migrate_disable();
45 	rcu_read_lock();
46 	array = rcu_dereference(cgrp->effective[atype]);
47 	item = &array->items[0];
48 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
49 	while ((prog = READ_ONCE(item->prog))) {
50 		run_ctx.prog_item = item;
51 		func_ret = run_prog(prog, ctx);
52 		if (ret_flags) {
53 			*(ret_flags) |= (func_ret >> 1);
54 			func_ret &= 1;
55 		}
56 		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
57 			run_ctx.retval = -EPERM;
58 		item++;
59 	}
60 	bpf_reset_run_ctx(old_run_ctx);
61 	rcu_read_unlock();
62 	migrate_enable();
63 	return run_ctx.retval;
64 }
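/* Informal summary of the convention above: each program in the effective
 * array returns a u32.  When @ret_flags is non-NULL (the egress skb and
 * sock_addr paths below use this), bit 0 is the allow/deny verdict and the
 * remaining bits, shifted down by one, are OR'ed into *ret_flags (e.g.
 * BPF_RET_SET_CN).  A verdict of 0 forces run_ctx.retval to -EPERM unless
 * retval already holds an error, e.g. one stored via bpf_set_retval().
 */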
65 
66 unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
67 				       const struct bpf_insn *insn)
68 {
69 	const struct bpf_prog *shim_prog;
70 	struct sock *sk;
71 	struct cgroup *cgrp;
72 	int ret = 0;
73 	u64 *args;
74 
75 	args = (u64 *)ctx;
76 	sk = (void *)(unsigned long)args[0];
77 	/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
78 	shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
79 
80 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
81 	if (likely(cgrp))
82 		ret = bpf_prog_run_array_cg(&cgrp->bpf,
83 					    shim_prog->aux->cgroup_atype,
84 					    ctx, bpf_prog_run, 0, NULL);
85 	return ret;
86 }
87 
88 unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
89 					 const struct bpf_insn *insn)
90 {
91 	const struct bpf_prog *shim_prog;
92 	struct socket *sock;
93 	struct cgroup *cgrp;
94 	int ret = 0;
95 	u64 *args;
96 
97 	args = (u64 *)ctx;
98 	sock = (void *)(unsigned long)args[0];
99 	/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
100 	shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
101 
102 	cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
103 	if (likely(cgrp))
104 		ret = bpf_prog_run_array_cg(&cgrp->bpf,
105 					    shim_prog->aux->cgroup_atype,
106 					    ctx, bpf_prog_run, 0, NULL);
107 	return ret;
108 }
109 
110 unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
111 					  const struct bpf_insn *insn)
112 {
113 	const struct bpf_prog *shim_prog;
114 	struct cgroup *cgrp;
115 	int ret = 0;
116 
117 	/*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
118 	shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
119 
120 	/* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
121 	cgrp = task_dfl_cgroup(current);
122 	if (likely(cgrp))
123 		ret = bpf_prog_run_array_cg(&cgrp->bpf,
124 					    shim_prog->aux->cgroup_atype,
125 					    ctx, bpf_prog_run, 0, NULL);
126 	return ret;
127 }
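/* Note on the three entry points above: they back the LSM_CGROUP shim
 * programs (linked via bpf_trampoline_link_cgroup_shim() below) and are
 * called with a pointer into the shim program's insnsi array, so the owning
 * bpf_prog is recovered by subtracting offsetof(struct bpf_prog, insnsi),
 * i.e. an open-coded container_of().  The shim's aux->cgroup_atype then
 * selects which per-cgroup effective array to run.
 */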
128 
129 #ifdef CONFIG_BPF_LSM
130 static enum cgroup_bpf_attach_type
131 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
132 {
133 	if (attach_type != BPF_LSM_CGROUP)
134 		return to_cgroup_bpf_attach_type(attach_type);
135 	return CGROUP_LSM_START + bpf_lsm_hook_idx(attach_btf_id);
136 }
137 #else
138 static enum cgroup_bpf_attach_type
139 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
140 {
141 	if (attach_type != BPF_LSM_CGROUP)
142 		return to_cgroup_bpf_attach_type(attach_type);
143 	return -EOPNOTSUPP;
144 }
145 #endif /* CONFIG_BPF_LSM */
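/* Rough mapping performed by bpf_cgroup_atype_find(): classic attach types
 * (e.g. BPF_CGROUP_INET_EGRESS) translate 1:1 via
 * to_cgroup_bpf_attach_type(), while BPF_LSM_CGROUP programs share a small
 * window of slots starting at CGROUP_LSM_START, indexed by the LSM hook's
 * attach BTF id (bpf_lsm_hook_idx()).  Without CONFIG_BPF_LSM the latter is
 * simply rejected with -EOPNOTSUPP.
 */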
146 
147 void cgroup_bpf_offline(struct cgroup *cgrp)
148 {
149 	cgroup_get(cgrp);
150 	percpu_ref_kill(&cgrp->bpf.refcnt);
151 }
152 
153 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
154 {
155 	enum bpf_cgroup_storage_type stype;
156 
157 	for_each_cgroup_storage_type(stype)
158 		bpf_cgroup_storage_free(storages[stype]);
159 }
160 
161 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
162 				     struct bpf_cgroup_storage *new_storages[],
163 				     enum bpf_attach_type type,
164 				     struct bpf_prog *prog,
165 				     struct cgroup *cgrp)
166 {
167 	enum bpf_cgroup_storage_type stype;
168 	struct bpf_cgroup_storage_key key;
169 	struct bpf_map *map;
170 
171 	key.cgroup_inode_id = cgroup_id(cgrp);
172 	key.attach_type = type;
173 
174 	for_each_cgroup_storage_type(stype) {
175 		map = prog->aux->cgroup_storage[stype];
176 		if (!map)
177 			continue;
178 
179 		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
180 		if (storages[stype])
181 			continue;
182 
183 		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
184 		if (IS_ERR(storages[stype])) {
185 			bpf_cgroup_storages_free(new_storages);
186 			return -ENOMEM;
187 		}
188 
189 		new_storages[stype] = storages[stype];
190 	}
191 
192 	return 0;
193 }
194 
195 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
196 				       struct bpf_cgroup_storage *src[])
197 {
198 	enum bpf_cgroup_storage_type stype;
199 
200 	for_each_cgroup_storage_type(stype)
201 		dst[stype] = src[stype];
202 }
203 
204 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
205 				     struct cgroup *cgrp,
206 				     enum bpf_attach_type attach_type)
207 {
208 	enum bpf_cgroup_storage_type stype;
209 
210 	for_each_cgroup_storage_type(stype)
211 		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
212 }
213 
214 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
215  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
216  * doesn't free link memory, which will eventually be done by bpf_link's
217  * release() callback, when its last FD is closed.
218  */
219 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
220 {
221 	cgroup_put(link->cgroup);
222 	link->cgroup = NULL;
223 }
224 
225 /**
226  * cgroup_bpf_release() - put references of all bpf programs and
227  *                        release all cgroup bpf data
228  * @work: work structure embedded into the cgroup to modify
229  */
230 static void cgroup_bpf_release(struct work_struct *work)
231 {
232 	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
233 					       bpf.release_work);
234 	struct bpf_prog_array *old_array;
235 	struct list_head *storages = &cgrp->bpf.storages;
236 	struct bpf_cgroup_storage *storage, *stmp;
237 
238 	unsigned int atype;
239 
240 	mutex_lock(&cgroup_mutex);
241 
242 	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
243 		struct hlist_head *progs = &cgrp->bpf.progs[atype];
244 		struct bpf_prog_list *pl;
245 		struct hlist_node *pltmp;
246 
247 		hlist_for_each_entry_safe(pl, pltmp, progs, node) {
248 			hlist_del(&pl->node);
249 			if (pl->prog) {
250 				if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
251 					bpf_trampoline_unlink_cgroup_shim(pl->prog);
252 				bpf_prog_put(pl->prog);
253 			}
254 			if (pl->link) {
255 				if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
256 					bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
257 				bpf_cgroup_link_auto_detach(pl->link);
258 			}
259 			kfree(pl);
260 			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
261 		}
262 		old_array = rcu_dereference_protected(
263 				cgrp->bpf.effective[atype],
264 				lockdep_is_held(&cgroup_mutex));
265 		bpf_prog_array_free(old_array);
266 	}
267 
268 	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
269 		bpf_cgroup_storage_unlink(storage);
270 		bpf_cgroup_storage_free(storage);
271 	}
272 
273 	mutex_unlock(&cgroup_mutex);
274 
275 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
276 		cgroup_bpf_put(p);
277 
278 	percpu_ref_exit(&cgrp->bpf.refcnt);
279 	cgroup_put(cgrp);
280 }
281 
282 /**
283  * cgroup_bpf_release_fn() - callback used to schedule releasing
284  *                           of bpf cgroup data
285  * @ref: percpu ref counter structure
286  */
287 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
288 {
289 	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
290 
291 	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
292 	queue_work(system_wq, &cgrp->bpf.release_work);
293 }
294 
295 /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
296  * link or direct prog.
297  */
298 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
299 {
300 	if (pl->prog)
301 		return pl->prog;
302 	if (pl->link)
303 		return pl->link->link.prog;
304 	return NULL;
305 }
306 
307 /* Count the number of elements in the list.
308  * This is slow, but the list cannot be long.
309  */
310 static u32 prog_list_length(struct hlist_head *head)
311 {
312 	struct bpf_prog_list *pl;
313 	u32 cnt = 0;
314 
315 	hlist_for_each_entry(pl, head, node) {
316 		if (!prog_list_prog(pl))
317 			continue;
318 		cnt++;
319 	}
320 	return cnt;
321 }
322 
323 /* if parent has non-overridable prog attached,
324  * disallow attaching new programs to the descendant cgroup.
325  * if parent has overridable or multi-prog, allow attaching
326  */
327 static bool hierarchy_allows_attach(struct cgroup *cgrp,
328 				    enum cgroup_bpf_attach_type atype)
329 {
330 	struct cgroup *p;
331 
332 	p = cgroup_parent(cgrp);
333 	if (!p)
334 		return true;
335 	do {
336 		u32 flags = p->bpf.flags[atype];
337 		u32 cnt;
338 
339 		if (flags & BPF_F_ALLOW_MULTI)
340 			return true;
341 		cnt = prog_list_length(&p->bpf.progs[atype]);
342 		WARN_ON_ONCE(cnt > 1);
343 		if (cnt == 1)
344 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
345 		p = cgroup_parent(p);
346 	} while (p);
347 	return true;
348 }
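/* Example (hypothetical hierarchy): if cgroup "a" has a program attached
 * with no flags (non-overridable, single-attach), attaching to its child
 * "a/b" is refused with -EPERM by the caller.  If a's program was attached
 * with BPF_F_ALLOW_OVERRIDE, b may attach a single overriding program; if a
 * used BPF_F_ALLOW_MULTI, b may attach as well and all programs run.
 */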
349 
350 /* compute a chain of effective programs for a given cgroup:
351  * start from the list of programs in this cgroup and add
352  * all parent programs.
353  * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
354  * to programs in this cgroup
355  */
356 static int compute_effective_progs(struct cgroup *cgrp,
357 				   enum cgroup_bpf_attach_type atype,
358 				   struct bpf_prog_array **array)
359 {
360 	struct bpf_prog_array_item *item;
361 	struct bpf_prog_array *progs;
362 	struct bpf_prog_list *pl;
363 	struct cgroup *p = cgrp;
364 	int cnt = 0;
365 
366 	/* count number of effective programs by walking parents */
367 	do {
368 		if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
369 			cnt += prog_list_length(&p->bpf.progs[atype]);
370 		p = cgroup_parent(p);
371 	} while (p);
372 
373 	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
374 	if (!progs)
375 		return -ENOMEM;
376 
377 	/* populate the array with effective progs */
378 	cnt = 0;
379 	p = cgrp;
380 	do {
381 		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
382 			continue;
383 
384 		hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
385 			if (!prog_list_prog(pl))
386 				continue;
387 
388 			item = &progs->items[cnt];
389 			item->prog = prog_list_prog(pl);
390 			bpf_cgroup_storages_assign(item->cgroup_storage,
391 						   pl->storage);
392 			cnt++;
393 		}
394 	} while ((p = cgroup_parent(p)));
395 
396 	*array = progs;
397 	return 0;
398 }
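/* Shape of the resulting array, as built above: items[] starts with this
 * cgroup's own programs in attach order, followed by ancestors' programs
 * walking toward the root.  An ancestor contributes its programs only if
 * nothing has been collected yet (pure inheritance) or if that ancestor was
 * attached with BPF_F_ALLOW_MULTI; otherwise its overridable program yields
 * to the descendants.  bpf_prog_run_array_cg() executes the array in index
 * order.
 */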
399 
400 static void activate_effective_progs(struct cgroup *cgrp,
401 				     enum cgroup_bpf_attach_type atype,
402 				     struct bpf_prog_array *old_array)
403 {
404 	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
405 					lockdep_is_held(&cgroup_mutex));
406 	/* free prog array after grace period, since __cgroup_bpf_run_*()
407 	 * might be still walking the array
408 	 */
409 	bpf_prog_array_free(old_array);
410 }
411 
412 /**
413  * cgroup_bpf_inherit() - inherit effective programs from parent
414  * @cgrp: the cgroup to modify
415  */
416 int cgroup_bpf_inherit(struct cgroup *cgrp)
417 {
418 /* has to use a macro instead of a const int, since the compiler thinks
419  * that the array below is variable length
420  */
421 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
422 	struct bpf_prog_array *arrays[NR] = {};
423 	struct cgroup *p;
424 	int ret, i;
425 
426 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
427 			      GFP_KERNEL);
428 	if (ret)
429 		return ret;
430 
431 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
432 		cgroup_bpf_get(p);
433 
434 	for (i = 0; i < NR; i++)
435 		INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
436 
437 	INIT_LIST_HEAD(&cgrp->bpf.storages);
438 
439 	for (i = 0; i < NR; i++)
440 		if (compute_effective_progs(cgrp, i, &arrays[i]))
441 			goto cleanup;
442 
443 	for (i = 0; i < NR; i++)
444 		activate_effective_progs(cgrp, i, arrays[i]);
445 
446 	return 0;
447 cleanup:
448 	for (i = 0; i < NR; i++)
449 		bpf_prog_array_free(arrays[i]);
450 
451 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
452 		cgroup_bpf_put(p);
453 
454 	percpu_ref_exit(&cgrp->bpf.refcnt);
455 
456 	return -ENOMEM;
457 }
458 
459 static int update_effective_progs(struct cgroup *cgrp,
460 				  enum cgroup_bpf_attach_type atype)
461 {
462 	struct cgroup_subsys_state *css;
463 	int err;
464 
465 	/* allocate and recompute effective prog arrays */
466 	css_for_each_descendant_pre(css, &cgrp->self) {
467 		struct cgroup *desc = container_of(css, struct cgroup, self);
468 
469 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
470 			continue;
471 
472 		err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
473 		if (err)
474 			goto cleanup;
475 	}
476 
477 	/* all allocations were successful. Activate all prog arrays */
478 	css_for_each_descendant_pre(css, &cgrp->self) {
479 		struct cgroup *desc = container_of(css, struct cgroup, self);
480 
481 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
482 			if (unlikely(desc->bpf.inactive)) {
483 				bpf_prog_array_free(desc->bpf.inactive);
484 				desc->bpf.inactive = NULL;
485 			}
486 			continue;
487 		}
488 
489 		activate_effective_progs(desc, atype, desc->bpf.inactive);
490 		desc->bpf.inactive = NULL;
491 	}
492 
493 	return 0;
494 
495 cleanup:
496 	/* oom while computing effective. Free all computed effective arrays
497 	 * since they were not activated
498 	 */
499 	css_for_each_descendant_pre(css, &cgrp->self) {
500 		struct cgroup *desc = container_of(css, struct cgroup, self);
501 
502 		bpf_prog_array_free(desc->bpf.inactive);
503 		desc->bpf.inactive = NULL;
504 	}
505 
506 	return err;
507 }
508 
509 #define BPF_CGROUP_MAX_PROGS 64
510 
511 static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
512 					       struct bpf_prog *prog,
513 					       struct bpf_cgroup_link *link,
514 					       struct bpf_prog *replace_prog,
515 					       bool allow_multi)
516 {
517 	struct bpf_prog_list *pl;
518 
519 	/* single-attach case */
520 	if (!allow_multi) {
521 		if (hlist_empty(progs))
522 			return NULL;
523 		return hlist_entry(progs->first, typeof(*pl), node);
524 	}
525 
526 	hlist_for_each_entry(pl, progs, node) {
527 		if (prog && pl->prog == prog && prog != replace_prog)
528 			/* disallow attaching the same prog twice */
529 			return ERR_PTR(-EINVAL);
530 		if (link && pl->link == link)
531 			/* disallow attaching the same link twice */
532 			return ERR_PTR(-EINVAL);
533 	}
534 
535 	/* direct prog multi-attach w/ replacement case */
536 	if (replace_prog) {
537 		hlist_for_each_entry(pl, progs, node) {
538 			if (pl->prog == replace_prog)
539 				/* a match found */
540 				return pl;
541 		}
542 		/* prog to replace not found for cgroup */
543 		return ERR_PTR(-ENOENT);
544 	}
545 
546 	return NULL;
547 }
548 
549 /**
550  * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
551  *                         propagate the change to descendants
552  * @cgrp: The cgroup which descendants to traverse
553  * @prog: A program to attach
554  * @link: A link to attach
555  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
556  * @type: Type of attach operation
557  * @flags: Option flags
558  *
559  * Exactly one of @prog or @link must be non-NULL.
560  * Must be called with cgroup_mutex held.
561  */
562 static int __cgroup_bpf_attach(struct cgroup *cgrp,
563 			       struct bpf_prog *prog, struct bpf_prog *replace_prog,
564 			       struct bpf_cgroup_link *link,
565 			       enum bpf_attach_type type, u32 flags)
566 {
567 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
568 	struct bpf_prog *old_prog = NULL;
569 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
570 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
571 	struct bpf_prog *new_prog = prog ? : link->link.prog;
572 	enum cgroup_bpf_attach_type atype;
573 	struct bpf_prog_list *pl;
574 	struct hlist_head *progs;
575 	int err;
576 
577 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
578 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
579 		/* invalid combination */
580 		return -EINVAL;
581 	if (link && (prog || replace_prog))
582 		/* only either link or prog/replace_prog can be specified */
583 		return -EINVAL;
584 	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
585 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
586 		return -EINVAL;
587 
588 	atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
589 	if (atype < 0)
590 		return -EINVAL;
591 
592 	progs = &cgrp->bpf.progs[atype];
593 
594 	if (!hierarchy_allows_attach(cgrp, atype))
595 		return -EPERM;
596 
597 	if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
598 		/* Disallow attaching non-overridable on top
599 		 * of existing overridable in this cgroup.
600 		 * Disallow attaching multi-prog if overridable or none
601 		 */
602 		return -EPERM;
603 
604 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
605 		return -E2BIG;
606 
607 	pl = find_attach_entry(progs, prog, link, replace_prog,
608 			       flags & BPF_F_ALLOW_MULTI);
609 	if (IS_ERR(pl))
610 		return PTR_ERR(pl);
611 
612 	if (bpf_cgroup_storages_alloc(storage, new_storage, type,
613 				      prog ? : link->link.prog, cgrp))
614 		return -ENOMEM;
615 
616 	if (pl) {
617 		old_prog = pl->prog;
618 	} else {
619 		struct hlist_node *last = NULL;
620 
621 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
622 		if (!pl) {
623 			bpf_cgroup_storages_free(new_storage);
624 			return -ENOMEM;
625 		}
626 		if (hlist_empty(progs))
627 			hlist_add_head(&pl->node, progs);
628 		else
629 			hlist_for_each(last, progs) {
630 				if (last->next)
631 					continue;
632 				hlist_add_behind(&pl->node, last);
633 				break;
634 			}
635 	}
636 
637 	pl->prog = prog;
638 	pl->link = link;
639 	bpf_cgroup_storages_assign(pl->storage, storage);
640 	cgrp->bpf.flags[atype] = saved_flags;
641 
642 	if (type == BPF_LSM_CGROUP) {
643 		err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
644 		if (err)
645 			goto cleanup;
646 	}
647 
648 	err = update_effective_progs(cgrp, atype);
649 	if (err)
650 		goto cleanup_trampoline;
651 
652 	if (old_prog) {
653 		if (type == BPF_LSM_CGROUP)
654 			bpf_trampoline_unlink_cgroup_shim(old_prog);
655 		bpf_prog_put(old_prog);
656 	} else {
657 		static_branch_inc(&cgroup_bpf_enabled_key[atype]);
658 	}
659 	bpf_cgroup_storages_link(new_storage, cgrp, type);
660 	return 0;
661 
662 cleanup_trampoline:
663 	if (type == BPF_LSM_CGROUP)
664 		bpf_trampoline_unlink_cgroup_shim(new_prog);
665 
666 cleanup:
667 	if (old_prog) {
668 		pl->prog = old_prog;
669 		pl->link = NULL;
670 	}
671 	bpf_cgroup_storages_free(new_storage);
672 	if (!old_prog) {
673 		hlist_del(&pl->node);
674 		kfree(pl);
675 	}
676 	return err;
677 }
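/* Example (informal user-space sketch, placeholders throughout): the path
 * above is normally reached through the BPF_PROG_ATTACH command, e.g. with
 * libbpf:
 *
 *	int cg_fd = open("/sys/fs/cgroup/foo", O_RDONLY | O_DIRECTORY);
 *
 *	err = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_EGRESS,
 *			      BPF_F_ALLOW_MULTI);
 *
 * The flags argument maps onto the BPF_F_ALLOW_OVERRIDE/MULTI/REPLACE
 * checks at the top of __cgroup_bpf_attach().
 */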
678 
679 static int cgroup_bpf_attach(struct cgroup *cgrp,
680 			     struct bpf_prog *prog, struct bpf_prog *replace_prog,
681 			     struct bpf_cgroup_link *link,
682 			     enum bpf_attach_type type,
683 			     u32 flags)
684 {
685 	int ret;
686 
687 	mutex_lock(&cgroup_mutex);
688 	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
689 	mutex_unlock(&cgroup_mutex);
690 	return ret;
691 }
692 
693 /* Swap updated BPF program for given link in effective program arrays across
694  * all descendant cgroups. This function is guaranteed to succeed.
695  */
696 static void replace_effective_prog(struct cgroup *cgrp,
697 				   enum cgroup_bpf_attach_type atype,
698 				   struct bpf_cgroup_link *link)
699 {
700 	struct bpf_prog_array_item *item;
701 	struct cgroup_subsys_state *css;
702 	struct bpf_prog_array *progs;
703 	struct bpf_prog_list *pl;
704 	struct hlist_head *head;
705 	struct cgroup *cg;
706 	int pos;
707 
708 	css_for_each_descendant_pre(css, &cgrp->self) {
709 		struct cgroup *desc = container_of(css, struct cgroup, self);
710 
711 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
712 			continue;
713 
714 		/* find position of link in effective progs array */
715 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
716 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
717 				continue;
718 
719 			head = &cg->bpf.progs[atype];
720 			hlist_for_each_entry(pl, head, node) {
721 				if (!prog_list_prog(pl))
722 					continue;
723 				if (pl->link == link)
724 					goto found;
725 				pos++;
726 			}
727 		}
728 found:
729 		BUG_ON(!cg);
730 		progs = rcu_dereference_protected(
731 				desc->bpf.effective[atype],
732 				lockdep_is_held(&cgroup_mutex));
733 		item = &progs->items[pos];
734 		WRITE_ONCE(item->prog, link->link.prog);
735 	}
736 }
737 
738 /**
739  * __cgroup_bpf_replace() - Replace link's program and propagate the change
740  *                          to descendants
741  * @cgrp: The cgroup which descendants to traverse
742  * @link: A link for which to replace BPF program
743  * @type: Type of attach operation
744  *
745  * Must be called with cgroup_mutex held.
746  */
747 static int __cgroup_bpf_replace(struct cgroup *cgrp,
748 				struct bpf_cgroup_link *link,
749 				struct bpf_prog *new_prog)
750 {
751 	enum cgroup_bpf_attach_type atype;
752 	struct bpf_prog *old_prog;
753 	struct bpf_prog_list *pl;
754 	struct hlist_head *progs;
755 	bool found = false;
756 
757 	atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
758 	if (atype < 0)
759 		return -EINVAL;
760 
761 	progs = &cgrp->bpf.progs[atype];
762 
763 	if (link->link.prog->type != new_prog->type)
764 		return -EINVAL;
765 
766 	hlist_for_each_entry(pl, progs, node) {
767 		if (pl->link == link) {
768 			found = true;
769 			break;
770 		}
771 	}
772 	if (!found)
773 		return -ENOENT;
774 
775 	old_prog = xchg(&link->link.prog, new_prog);
776 	replace_effective_prog(cgrp, atype, link);
777 	bpf_prog_put(old_prog);
778 	return 0;
779 }
780 
781 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
782 			      struct bpf_prog *old_prog)
783 {
784 	struct bpf_cgroup_link *cg_link;
785 	int ret;
786 
787 	cg_link = container_of(link, struct bpf_cgroup_link, link);
788 
789 	mutex_lock(&cgroup_mutex);
790 	/* link might have been auto-released by dying cgroup, so fail */
791 	if (!cg_link->cgroup) {
792 		ret = -ENOLINK;
793 		goto out_unlock;
794 	}
795 	if (old_prog && link->prog != old_prog) {
796 		ret = -EPERM;
797 		goto out_unlock;
798 	}
799 	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
800 out_unlock:
801 	mutex_unlock(&cgroup_mutex);
802 	return ret;
803 }
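/* Example (informal sketch): this update path is driven from user space by
 * the BPF_LINK_UPDATE command, e.g. via libbpf (link_fd/new_prog_fd are
 * placeholders):
 *
 *	err = bpf_link_update(link_fd, new_prog_fd, NULL);
 *
 * The new program must have the same type as the old one, and the call
 * fails with -ENOLINK if the cgroup already died and auto-detached the
 * link.
 */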
804 
805 static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
806 					       struct bpf_prog *prog,
807 					       struct bpf_cgroup_link *link,
808 					       bool allow_multi)
809 {
810 	struct bpf_prog_list *pl;
811 
812 	if (!allow_multi) {
813 		if (hlist_empty(progs))
814 			/* report error when trying to detach and nothing is attached */
815 			return ERR_PTR(-ENOENT);
816 
817 		/* to maintain backward compatibility NONE and OVERRIDE cgroups
818 		 * allow detaching with invalid FD (prog==NULL) in legacy mode
819 		 */
820 		return hlist_entry(progs->first, typeof(*pl), node);
821 	}
822 
823 	if (!prog && !link)
824 		/* to detach MULTI prog the user has to specify valid FD
825 		 * of the program or link to be detached
826 		 */
827 		return ERR_PTR(-EINVAL);
828 
829 	/* find the prog or link and detach it */
830 	hlist_for_each_entry(pl, progs, node) {
831 		if (pl->prog == prog && pl->link == link)
832 			return pl;
833 	}
834 	return ERR_PTR(-ENOENT);
835 }
836 
837 /**
838  * purge_effective_progs() - After compute_effective_progs fails to alloc new
839  *                           cgrp->bpf.inactive table we can recover by
840  *                           recomputing the array in place.
841  *
842  * @cgrp: The cgroup which descendants to traverse
843  * @prog: A program to detach or NULL
844  * @link: A link to detach or NULL
845  * @atype: Type of detach operation
846  */
847 static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
848 				  struct bpf_cgroup_link *link,
849 				  enum cgroup_bpf_attach_type atype)
850 {
851 	struct cgroup_subsys_state *css;
852 	struct bpf_prog_array *progs;
853 	struct bpf_prog_list *pl;
854 	struct hlist_head *head;
855 	struct cgroup *cg;
856 	int pos;
857 
858 	/* recompute effective prog array in place */
859 	css_for_each_descendant_pre(css, &cgrp->self) {
860 		struct cgroup *desc = container_of(css, struct cgroup, self);
861 
862 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
863 			continue;
864 
865 		/* find position of link or prog in effective progs array */
866 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
867 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
868 				continue;
869 
870 			head = &cg->bpf.progs[atype];
871 			hlist_for_each_entry(pl, head, node) {
872 				if (!prog_list_prog(pl))
873 					continue;
874 				if (pl->prog == prog && pl->link == link)
875 					goto found;
876 				pos++;
877 			}
878 		}
879 found:
880 		BUG_ON(!cg);
881 		progs = rcu_dereference_protected(
882 				desc->bpf.effective[atype],
883 				lockdep_is_held(&cgroup_mutex));
884 
885 		/* Remove the program from the array */
886 		WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
887 			  "Failed to purge a prog from array at index %d", pos);
888 	}
889 }
890 
891 /**
892  * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
893  *                         propagate the change to descendants
894  * @cgrp: The cgroup which descendants to traverse
895  * @prog: A program to detach or NULL
896  * @link: A link to detach or NULL
897  * @type: Type of detach operation
898  *
899  * At most one of @prog or @link can be non-NULL.
900  * Must be called with cgroup_mutex held.
901  */
902 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
903 			       struct bpf_cgroup_link *link, enum bpf_attach_type type)
904 {
905 	enum cgroup_bpf_attach_type atype;
906 	struct bpf_prog *old_prog;
907 	struct bpf_prog_list *pl;
908 	struct hlist_head *progs;
909 	u32 attach_btf_id = 0;
910 	u32 flags;
911 
912 	if (prog)
913 		attach_btf_id = prog->aux->attach_btf_id;
914 	if (link)
915 		attach_btf_id = link->link.prog->aux->attach_btf_id;
916 
917 	atype = bpf_cgroup_atype_find(type, attach_btf_id);
918 	if (atype < 0)
919 		return -EINVAL;
920 
921 	progs = &cgrp->bpf.progs[atype];
922 	flags = cgrp->bpf.flags[atype];
923 
924 	if (prog && link)
925 		/* only one of prog or link can be specified */
926 		return -EINVAL;
927 
928 	pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
929 	if (IS_ERR(pl))
930 		return PTR_ERR(pl);
931 
932 	/* mark it deleted, so it's ignored while recomputing effective */
933 	old_prog = pl->prog;
934 	pl->prog = NULL;
935 	pl->link = NULL;
936 
937 	if (update_effective_progs(cgrp, atype)) {
938 		/* if updating the effective array failed, replace the prog with a dummy prog */
939 		pl->prog = old_prog;
940 		pl->link = link;
941 		purge_effective_progs(cgrp, old_prog, link, atype);
942 	}
943 
944 	/* now can actually delete it from this cgroup list */
945 	hlist_del(&pl->node);
946 
947 	kfree(pl);
948 	if (hlist_empty(progs))
949 		/* last program was detached, reset flags to zero */
950 		cgrp->bpf.flags[atype] = 0;
951 	if (old_prog) {
952 		if (type == BPF_LSM_CGROUP)
953 			bpf_trampoline_unlink_cgroup_shim(old_prog);
954 		bpf_prog_put(old_prog);
955 	}
956 	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
957 	return 0;
958 }
959 
960 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
961 			     enum bpf_attach_type type)
962 {
963 	int ret;
964 
965 	mutex_lock(&cgroup_mutex);
966 	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
967 	mutex_unlock(&cgroup_mutex);
968 	return ret;
969 }
970 
971 /* Must be called with cgroup_mutex held to avoid races. */
972 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
973 			      union bpf_attr __user *uattr)
974 {
975 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
976 	enum bpf_attach_type type = attr->query.attach_type;
977 	enum cgroup_bpf_attach_type atype;
978 	struct bpf_prog_array *effective;
979 	struct hlist_head *progs;
980 	struct bpf_prog *prog;
981 	int cnt, ret = 0, i;
982 	u32 flags;
983 
984 	atype = to_cgroup_bpf_attach_type(type);
985 	if (atype < 0)
986 		return -EINVAL;
987 
988 	progs = &cgrp->bpf.progs[atype];
989 	flags = cgrp->bpf.flags[atype];
990 
991 	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
992 					      lockdep_is_held(&cgroup_mutex));
993 
994 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
995 		cnt = bpf_prog_array_length(effective);
996 	else
997 		cnt = prog_list_length(progs);
998 
999 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
1000 		return -EFAULT;
1001 	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
1002 		return -EFAULT;
1003 	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
1004 		/* return early if user requested only program count + flags */
1005 		return 0;
1006 	if (attr->query.prog_cnt < cnt) {
1007 		cnt = attr->query.prog_cnt;
1008 		ret = -ENOSPC;
1009 	}
1010 
1011 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
1012 		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
1013 	} else {
1014 		struct bpf_prog_list *pl;
1015 		u32 id;
1016 
1017 		i = 0;
1018 		hlist_for_each_entry(pl, progs, node) {
1019 			prog = prog_list_prog(pl);
1020 			id = prog->aux->id;
1021 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
1022 				return -EFAULT;
1023 			if (++i == cnt)
1024 				break;
1025 		}
1026 	}
1027 	return ret;
1028 }
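/* Example (informal sketch): this query path backs the BPF_PROG_QUERY
 * command.  With libbpf, listing the programs effective for a cgroup could
 * look roughly like this (cg_fd and the buffer size are placeholders):
 *
 *	__u32 ids[64], cnt = 64, attach_flags = 0;
 *
 *	err = bpf_prog_query(cg_fd, BPF_CGROUP_INET_INGRESS,
 *			     BPF_F_QUERY_EFFECTIVE, &attach_flags,
 *			     ids, &cnt);
 *
 * On return cnt holds the total program count; -ENOSPC is reported when
 * the ids buffer was too small to hold all of them, matching the logic
 * above.
 */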
1029 
1030 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
1031 			    union bpf_attr __user *uattr)
1032 {
1033 	int ret;
1034 
1035 	mutex_lock(&cgroup_mutex);
1036 	ret = __cgroup_bpf_query(cgrp, attr, uattr);
1037 	mutex_unlock(&cgroup_mutex);
1038 	return ret;
1039 }
1040 
1041 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
1042 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
1043 {
1044 	struct bpf_prog *replace_prog = NULL;
1045 	struct cgroup *cgrp;
1046 	int ret;
1047 
1048 	cgrp = cgroup_get_from_fd(attr->target_fd);
1049 	if (IS_ERR(cgrp))
1050 		return PTR_ERR(cgrp);
1051 
1052 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
1053 	    (attr->attach_flags & BPF_F_REPLACE)) {
1054 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
1055 		if (IS_ERR(replace_prog)) {
1056 			cgroup_put(cgrp);
1057 			return PTR_ERR(replace_prog);
1058 		}
1059 	}
1060 
1061 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
1062 				attr->attach_type, attr->attach_flags);
1063 
1064 	if (replace_prog)
1065 		bpf_prog_put(replace_prog);
1066 	cgroup_put(cgrp);
1067 	return ret;
1068 }
1069 
1070 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
1071 {
1072 	struct bpf_prog *prog;
1073 	struct cgroup *cgrp;
1074 	int ret;
1075 
1076 	cgrp = cgroup_get_from_fd(attr->target_fd);
1077 	if (IS_ERR(cgrp))
1078 		return PTR_ERR(cgrp);
1079 
1080 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
1081 	if (IS_ERR(prog))
1082 		prog = NULL;
1083 
1084 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
1085 	if (prog)
1086 		bpf_prog_put(prog);
1087 
1088 	cgroup_put(cgrp);
1089 	return ret;
1090 }
1091 
1092 static void bpf_cgroup_link_release(struct bpf_link *link)
1093 {
1094 	struct bpf_cgroup_link *cg_link =
1095 		container_of(link, struct bpf_cgroup_link, link);
1096 	struct cgroup *cg;
1097 
1098 	/* link might have been auto-detached by dying cgroup already,
1099 	 * in that case our work is done here
1100 	 */
1101 	if (!cg_link->cgroup)
1102 		return;
1103 
1104 	mutex_lock(&cgroup_mutex);
1105 
1106 	/* re-check cgroup under lock again */
1107 	if (!cg_link->cgroup) {
1108 		mutex_unlock(&cgroup_mutex);
1109 		return;
1110 	}
1111 
1112 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1113 				    cg_link->type));
1114 	if (cg_link->type == BPF_LSM_CGROUP)
1115 		bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
1116 
1117 	cg = cg_link->cgroup;
1118 	cg_link->cgroup = NULL;
1119 
1120 	mutex_unlock(&cgroup_mutex);
1121 
1122 	cgroup_put(cg);
1123 }
1124 
1125 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1126 {
1127 	struct bpf_cgroup_link *cg_link =
1128 		container_of(link, struct bpf_cgroup_link, link);
1129 
1130 	kfree(cg_link);
1131 }
1132 
1133 static int bpf_cgroup_link_detach(struct bpf_link *link)
1134 {
1135 	bpf_cgroup_link_release(link);
1136 
1137 	return 0;
1138 }
1139 
1140 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1141 					struct seq_file *seq)
1142 {
1143 	struct bpf_cgroup_link *cg_link =
1144 		container_of(link, struct bpf_cgroup_link, link);
1145 	u64 cg_id = 0;
1146 
1147 	mutex_lock(&cgroup_mutex);
1148 	if (cg_link->cgroup)
1149 		cg_id = cgroup_id(cg_link->cgroup);
1150 	mutex_unlock(&cgroup_mutex);
1151 
1152 	seq_printf(seq,
1153 		   "cgroup_id:\t%llu\n"
1154 		   "attach_type:\t%d\n",
1155 		   cg_id,
1156 		   cg_link->type);
1157 }
1158 
1159 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1160 					  struct bpf_link_info *info)
1161 {
1162 	struct bpf_cgroup_link *cg_link =
1163 		container_of(link, struct bpf_cgroup_link, link);
1164 	u64 cg_id = 0;
1165 
1166 	mutex_lock(&cgroup_mutex);
1167 	if (cg_link->cgroup)
1168 		cg_id = cgroup_id(cg_link->cgroup);
1169 	mutex_unlock(&cgroup_mutex);
1170 
1171 	info->cgroup.cgroup_id = cg_id;
1172 	info->cgroup.attach_type = cg_link->type;
1173 	return 0;
1174 }
1175 
1176 static const struct bpf_link_ops bpf_cgroup_link_lops = {
1177 	.release = bpf_cgroup_link_release,
1178 	.dealloc = bpf_cgroup_link_dealloc,
1179 	.detach = bpf_cgroup_link_detach,
1180 	.update_prog = cgroup_bpf_replace,
1181 	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1182 	.fill_link_info = bpf_cgroup_link_fill_link_info,
1183 };
1184 
1185 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1186 {
1187 	struct bpf_link_primer link_primer;
1188 	struct bpf_cgroup_link *link;
1189 	struct cgroup *cgrp;
1190 	int err;
1191 
1192 	if (attr->link_create.flags)
1193 		return -EINVAL;
1194 
1195 	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1196 	if (IS_ERR(cgrp))
1197 		return PTR_ERR(cgrp);
1198 
1199 	link = kzalloc(sizeof(*link), GFP_USER);
1200 	if (!link) {
1201 		err = -ENOMEM;
1202 		goto out_put_cgroup;
1203 	}
1204 	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1205 		      prog);
1206 	link->cgroup = cgrp;
1207 	link->type = attr->link_create.attach_type;
1208 
1209 	err = bpf_link_prime(&link->link, &link_primer);
1210 	if (err) {
1211 		kfree(link);
1212 		goto out_put_cgroup;
1213 	}
1214 
1215 	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1216 				link->type, BPF_F_ALLOW_MULTI);
1217 	if (err) {
1218 		bpf_link_cleanup(&link_primer);
1219 		goto out_put_cgroup;
1220 	}
1221 
1222 	return bpf_link_settle(&link_primer);
1223 
1224 out_put_cgroup:
1225 	cgroup_put(cgrp);
1226 	return err;
1227 }
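/* Example (informal sketch): the link-based flavor above corresponds to the
 * BPF_LINK_CREATE command, e.g. via libbpf (prog_fd/cg_fd are
 * placeholders):
 *
 *	int link_fd = bpf_link_create(prog_fd, cg_fd,
 *				      BPF_CGROUP_INET_SOCK_CREATE, NULL);
 *
 * Unlike bpf_prog_attach(), the attachment lives as long as the returned
 * link FD (or the cgroup) and is always multi-attach, as hard-coded via
 * BPF_F_ALLOW_MULTI in the call above.
 */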
1228 
1229 int cgroup_bpf_prog_query(const union bpf_attr *attr,
1230 			  union bpf_attr __user *uattr)
1231 {
1232 	struct cgroup *cgrp;
1233 	int ret;
1234 
1235 	cgrp = cgroup_get_from_fd(attr->query.target_fd);
1236 	if (IS_ERR(cgrp))
1237 		return PTR_ERR(cgrp);
1238 
1239 	ret = cgroup_bpf_query(cgrp, attr, uattr);
1240 
1241 	cgroup_put(cgrp);
1242 	return ret;
1243 }
1244 
1245 /**
1246  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1247  * @sk: The socket sending or receiving traffic
1248  * @skb: The skb that is being sent or received
1249  * @type: The type of program to be executed
1250  *
1251  * If no socket is passed, or the socket is not of type INET or INET6,
1252  * this function does nothing and returns 0.
1253  *
1254  * The program type passed in via @type must be suitable for network
1255  * filtering. No further check is performed to assert that.
1256  *
1257  * For egress packets, this function can return:
1258  *   NET_XMIT_SUCCESS    (0)	- continue with packet output
1259  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
1260  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
1261  *				  to call cwr
1262  *   -err			- drop packet
1263  *
1264  * For ingress packets, this function will return -EPERM if any
1265  * attached program was found and if it returned != 1 during execution.
1266  * Otherwise 0 is returned.
1267  */
1268 int __cgroup_bpf_run_filter_skb(struct sock *sk,
1269 				struct sk_buff *skb,
1270 				enum cgroup_bpf_attach_type atype)
1271 {
1272 	unsigned int offset = skb->data - skb_network_header(skb);
1273 	struct sock *save_sk;
1274 	void *saved_data_end;
1275 	struct cgroup *cgrp;
1276 	int ret;
1277 
1278 	if (!sk || !sk_fullsock(sk))
1279 		return 0;
1280 
1281 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1282 		return 0;
1283 
1284 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1285 	save_sk = skb->sk;
1286 	skb->sk = sk;
1287 	__skb_push(skb, offset);
1288 
1289 	/* compute pointers for the bpf prog */
1290 	bpf_compute_and_save_data_end(skb, &saved_data_end);
1291 
1292 	if (atype == CGROUP_INET_EGRESS) {
1293 		u32 flags = 0;
1294 		bool cn;
1295 
1296 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1297 					    __bpf_prog_run_save_cb, 0, &flags);
1298 
1299 		/* Return values of CGROUP EGRESS BPF programs are:
1300 		 *   0: drop packet
1301 		 *   1: keep packet
1302 		 *   2: drop packet and cn
1303 		 *   3: keep packet and cn
1304 		 *
1305 		 * The returned value is then converted to one of the NET_XMIT
1306 		 * or an error code that is then interpreted as drop packet
1307 		 * (and no cn):
1308 		 *   0: NET_XMIT_SUCCESS  skb should be transmitted
1309 		 *   1: NET_XMIT_DROP     skb should be dropped and cn
1310 		 *   2: NET_XMIT_CN       skb should be transmitted and cn
1311 		 *   3: -err              skb should be dropped
1312 		 */
1313 
1314 		cn = flags & BPF_RET_SET_CN;
1315 		if (ret && !IS_ERR_VALUE((long)ret))
1316 			ret = -EFAULT;
1317 		if (!ret)
1318 			ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1319 		else
1320 			ret = (cn ? NET_XMIT_DROP : ret);
1321 	} else {
1322 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1323 					    skb, __bpf_prog_run_save_cb, 0,
1324 					    NULL);
1325 		if (ret && !IS_ERR_VALUE((long)ret))
1326 			ret = -EFAULT;
1327 	}
1328 	bpf_restore_data_end(skb, saved_data_end);
1329 	__skb_pull(skb, offset);
1330 	skb->sk = save_sk;
1331 
1332 	return ret;
1333 }
1334 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
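/* Worked example of the egress encoding above: a program returning 3
 * (keep + cn) leaves bit 0 set and feeds bit 1 into *flags, so ret stays 0,
 * cn is true and the caller sees NET_XMIT_CN.  A program returning 0 makes
 * run_ctx.retval -EPERM, which maps to NET_XMIT_DROP when cn is set and to
 * the negative error itself otherwise.
 */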
1335 
1336 /**
1337  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1338  * @sk: sock structure to manipulate
1339  * @type: The type of program to be executed
1340  *
1341  * The socket passed is expected to be of type INET or INET6.
1342  *
1343  * The program type passed in via @type must be suitable for sock
1344  * filtering. No further check is performed to assert that.
1345  *
1346  * This function will return %-EPERM if an attached program was found
1347  * and it returned != 1 during execution. In all other cases, 0 is returned.
1348  */
1349 int __cgroup_bpf_run_filter_sk(struct sock *sk,
1350 			       enum cgroup_bpf_attach_type atype)
1351 {
1352 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1353 
1354 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1355 				     NULL);
1356 }
1357 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1358 
1359 /**
1360  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1361  *                                       provided by user sockaddr
1362  * @sk: sock struct that will use sockaddr
1363  * @uaddr: sockaddr struct provided by user
1364  * @type: The type of program to be executed
1365  * @t_ctx: Pointer to attach type specific context
1366  * @flags: Pointer to u32 which contains higher bits of BPF program
1367  *         return value (OR'ed together).
1368  *
1369  * The socket is expected to be of type INET or INET6.
1370  *
1371  * This function will return %-EPERM if an attached program is found and
1372  * returned value != 1 during execution. In all other cases, 0 is returned.
1373  */
1374 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1375 				      struct sockaddr *uaddr,
1376 				      enum cgroup_bpf_attach_type atype,
1377 				      void *t_ctx,
1378 				      u32 *flags)
1379 {
1380 	struct bpf_sock_addr_kern ctx = {
1381 		.sk = sk,
1382 		.uaddr = uaddr,
1383 		.t_ctx = t_ctx,
1384 	};
1385 	struct sockaddr_storage unspec;
1386 	struct cgroup *cgrp;
1387 
1388 	/* Check socket family since not all sockets represent a network
1389 	 * endpoint (e.g. AF_UNIX).
1390 	 */
1391 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1392 		return 0;
1393 
1394 	if (!ctx.uaddr) {
1395 		memset(&unspec, 0, sizeof(unspec));
1396 		ctx.uaddr = (struct sockaddr *)&unspec;
1397 	}
1398 
1399 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1400 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1401 				     0, flags);
1402 }
1403 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1404 
1405 /**
1406  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1407  * @sk: socket to get cgroup from
1408  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1409  * sk with connection information (IP addresses, etc.) May not contain
1410  * cgroup info if it is a req sock.
1411  * @type: The type of program to be executed
1412  *
1413  * The socket passed is expected to be of type INET or INET6.
1414  *
1415  * The program type passed in via @type must be suitable for sock_ops
1416  * filtering. No further check is performed to assert that.
1417  *
1418  * This function will return %-EPERM if an attached program was found
1419  * and it returned != 1 during execution. In all other cases, 0 is returned.
1420  */
1421 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1422 				     struct bpf_sock_ops_kern *sock_ops,
1423 				     enum cgroup_bpf_attach_type atype)
1424 {
1425 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1426 
1427 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1428 				     0, NULL);
1429 }
1430 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1431 
1432 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1433 				      short access, enum cgroup_bpf_attach_type atype)
1434 {
1435 	struct cgroup *cgrp;
1436 	struct bpf_cgroup_dev_ctx ctx = {
1437 		.access_type = (access << 16) | dev_type,
1438 		.major = major,
1439 		.minor = minor,
1440 	};
1441 	int ret;
1442 
1443 	rcu_read_lock();
1444 	cgrp = task_dfl_cgroup(current);
1445 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1446 				    NULL);
1447 	rcu_read_unlock();
1448 
1449 	return ret;
1450 }
1451 
1452 BPF_CALL_0(bpf_get_retval)
1453 {
1454 	struct bpf_cg_run_ctx *ctx =
1455 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1456 
1457 	return ctx->retval;
1458 }
1459 
1460 const struct bpf_func_proto bpf_get_retval_proto = {
1461 	.func		= bpf_get_retval,
1462 	.gpl_only	= false,
1463 	.ret_type	= RET_INTEGER,
1464 };
1465 
1466 BPF_CALL_1(bpf_set_retval, int, retval)
1467 {
1468 	struct bpf_cg_run_ctx *ctx =
1469 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1470 
1471 	ctx->retval = retval;
1472 	return 0;
1473 }
1474 
1475 const struct bpf_func_proto bpf_set_retval_proto = {
1476 	.func		= bpf_set_retval,
1477 	.gpl_only	= false,
1478 	.ret_type	= RET_INTEGER,
1479 	.arg1_type	= ARG_ANYTHING,
1480 };
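/* Example (informal BPF-side sketch): programs run through
 * bpf_prog_run_array_cg() can refine the error stored in run_ctx.retval
 * instead of the default -EPERM on deny, e.g. in a program whose attach
 * type routes to cgroup_base_func_proto() below:
 *
 *	if (deny) {
 *		bpf_set_retval(-EACCES);	// kept instead of -EPERM
 *		return 0;			// deny
 *	}
 *	return 1;				// allow
 *
 * Whether the helper is available to a given attach type depends on that
 * type's get_func_proto wiring.
 */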
1481 
1482 static const struct bpf_func_proto *
1483 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1484 {
1485 	switch (func_id) {
1486 	case BPF_FUNC_get_current_uid_gid:
1487 		return &bpf_get_current_uid_gid_proto;
1488 	case BPF_FUNC_get_local_storage:
1489 		return &bpf_get_local_storage_proto;
1490 	case BPF_FUNC_get_current_cgroup_id:
1491 		return &bpf_get_current_cgroup_id_proto;
1492 	case BPF_FUNC_perf_event_output:
1493 		return &bpf_event_output_data_proto;
1494 	case BPF_FUNC_get_retval:
1495 		return &bpf_get_retval_proto;
1496 	case BPF_FUNC_set_retval:
1497 		return &bpf_set_retval_proto;
1498 	default:
1499 		return bpf_base_func_proto(func_id);
1500 	}
1501 }
1502 
1503 static const struct bpf_func_proto *
1504 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1505 {
1506 	return cgroup_base_func_proto(func_id, prog);
1507 }
1508 
1509 static bool cgroup_dev_is_valid_access(int off, int size,
1510 				       enum bpf_access_type type,
1511 				       const struct bpf_prog *prog,
1512 				       struct bpf_insn_access_aux *info)
1513 {
1514 	const int size_default = sizeof(__u32);
1515 
1516 	if (type == BPF_WRITE)
1517 		return false;
1518 
1519 	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1520 		return false;
1521 	/* The verifier guarantees that size > 0. */
1522 	if (off % size != 0)
1523 		return false;
1524 
1525 	switch (off) {
1526 	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1527 		bpf_ctx_record_field_size(info, size_default);
1528 		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1529 			return false;
1530 		break;
1531 	default:
1532 		if (size != size_default)
1533 			return false;
1534 	}
1535 
1536 	return true;
1537 }
1538 
1539 const struct bpf_prog_ops cg_dev_prog_ops = {
1540 };
1541 
1542 const struct bpf_verifier_ops cg_dev_verifier_ops = {
1543 	.get_func_proto		= cgroup_dev_func_proto,
1544 	.is_valid_access	= cgroup_dev_is_valid_access,
1545 };
1546 
1547 /**
1548  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1549  *
1550  * @head: sysctl table header
1551  * @table: sysctl table
1552  * @write: sysctl is being read (= 0) or written (= 1)
1553  * @buf: pointer to buffer (in and out)
1554  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1555  *	result is size of the new value if the program set one, initial value
1556  *	otherwise
1557  * @ppos: value-result argument: value is position at which read from or write
1558  *	to sysctl is happening, result is new position if program overrode it,
1559  *	initial value otherwise
1560  * @type: type of program to be executed
1561  *
1562  * Program is run when sysctl is being accessed, either read or written, and
1563  * can allow or deny such access.
1564  *
1565  * This function will return %-EPERM if an attached program is found and
1566  * returned value != 1 during execution. In all other cases 0 is returned.
1567  */
1568 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1569 				   struct ctl_table *table, int write,
1570 				   char **buf, size_t *pcount, loff_t *ppos,
1571 				   enum cgroup_bpf_attach_type atype)
1572 {
1573 	struct bpf_sysctl_kern ctx = {
1574 		.head = head,
1575 		.table = table,
1576 		.write = write,
1577 		.ppos = ppos,
1578 		.cur_val = NULL,
1579 		.cur_len = PAGE_SIZE,
1580 		.new_val = NULL,
1581 		.new_len = 0,
1582 		.new_updated = 0,
1583 	};
1584 	struct cgroup *cgrp;
1585 	loff_t pos = 0;
1586 	int ret;
1587 
1588 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1589 	if (!ctx.cur_val ||
1590 	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1591 		/* Let BPF program decide how to proceed. */
1592 		ctx.cur_len = 0;
1593 	}
1594 
1595 	if (write && *buf && *pcount) {
1596 		/* BPF program should be able to override new value with a
1597 		 * buffer bigger than provided by user.
1598 		 */
1599 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1600 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1601 		if (ctx.new_val) {
1602 			memcpy(ctx.new_val, *buf, ctx.new_len);
1603 		} else {
1604 			/* Let BPF program decide how to proceed. */
1605 			ctx.new_len = 0;
1606 		}
1607 	}
1608 
1609 	rcu_read_lock();
1610 	cgrp = task_dfl_cgroup(current);
1611 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1612 				    NULL);
1613 	rcu_read_unlock();
1614 
1615 	kfree(ctx.cur_val);
1616 
1617 	if (ret == 1 && ctx.new_updated) {
1618 		kfree(*buf);
1619 		*buf = ctx.new_val;
1620 		*pcount = ctx.new_len;
1621 	} else {
1622 		kfree(ctx.new_val);
1623 	}
1624 
1625 	return ret;
1626 }
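/* Example (informal BPF-side sketch): a minimal cgroup/sysctl program
 * consuming this hook could make all sysctls read-only for the cgroup
 * (the program and section names are illustrative):
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_ro(struct bpf_sysctl *ctx)
 *	{
 *		return ctx->write ? 0 : 1;	// 0 -> -EPERM above, 1 -> allow
 *	}
 *
 * Name-based filtering would use bpf_sysctl_get_name(), defined further
 * below.
 */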
1627 
1628 #ifdef CONFIG_NET
1629 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1630 			     struct bpf_sockopt_buf *buf)
1631 {
1632 	if (unlikely(max_optlen < 0))
1633 		return -EINVAL;
1634 
1635 	if (unlikely(max_optlen > PAGE_SIZE)) {
1636 		/* We don't expose optvals that are greater than PAGE_SIZE
1637 		 * to the BPF program.
1638 		 */
1639 		max_optlen = PAGE_SIZE;
1640 	}
1641 
1642 	if (max_optlen <= sizeof(buf->data)) {
1643 		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1644 		 * bytes, avoid the cost of kzalloc.
1645 		 */
1646 		ctx->optval = buf->data;
1647 		ctx->optval_end = ctx->optval + max_optlen;
1648 		return max_optlen;
1649 	}
1650 
1651 	ctx->optval = kzalloc(max_optlen, GFP_USER);
1652 	if (!ctx->optval)
1653 		return -ENOMEM;
1654 
1655 	ctx->optval_end = ctx->optval + max_optlen;
1656 
1657 	return max_optlen;
1658 }
1659 
1660 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1661 			     struct bpf_sockopt_buf *buf)
1662 {
1663 	if (ctx->optval == buf->data)
1664 		return;
1665 	kfree(ctx->optval);
1666 }
1667 
1668 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1669 				  struct bpf_sockopt_buf *buf)
1670 {
1671 	return ctx->optval != buf->data;
1672 }
1673 
1674 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
1675 				       int *optname, char __user *optval,
1676 				       int *optlen, char **kernel_optval)
1677 {
1678 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1679 	struct bpf_sockopt_buf buf = {};
1680 	struct bpf_sockopt_kern ctx = {
1681 		.sk = sk,
1682 		.level = *level,
1683 		.optname = *optname,
1684 	};
1685 	int ret, max_optlen;
1686 
1687 	/* Allocate a bit more than the initial user buffer for
1688 	 * BPF program. The canonical use case is overriding
1689 	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1690 	 */
1691 	max_optlen = max_t(int, 16, *optlen);
1692 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1693 	if (max_optlen < 0)
1694 		return max_optlen;
1695 
1696 	ctx.optlen = *optlen;
1697 
1698 	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
1699 		ret = -EFAULT;
1700 		goto out;
1701 	}
1702 
1703 	lock_sock(sk);
1704 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
1705 				    &ctx, bpf_prog_run, 0, NULL);
1706 	release_sock(sk);
1707 
1708 	if (ret)
1709 		goto out;
1710 
1711 	if (ctx.optlen == -1) {
1712 		/* optlen set to -1, bypass kernel */
1713 		ret = 1;
1714 	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1715 		/* optlen is out of bounds */
1716 		ret = -EFAULT;
1717 	} else {
1718 		/* optlen within bounds, run kernel handler */
1719 		ret = 0;
1720 
1721 		/* export any potential modifications */
1722 		*level = ctx.level;
1723 		*optname = ctx.optname;
1724 
1725 		/* optlen == 0 from BPF indicates that we should
1726 		 * use original userspace data.
1727 		 */
1728 		if (ctx.optlen != 0) {
1729 			*optlen = ctx.optlen;
1730 			/* We've used bpf_sockopt_kern->buf as an intermediary
1731 			 * storage, but the BPF program indicates that we need
1732 			 * to pass this data to the kernel setsockopt handler.
1733 			 * No way to export on-stack buf, have to allocate a
1734 			 * new buffer.
1735 			 */
1736 			if (!sockopt_buf_allocated(&ctx, &buf)) {
1737 				void *p = kmalloc(ctx.optlen, GFP_USER);
1738 
1739 				if (!p) {
1740 					ret = -ENOMEM;
1741 					goto out;
1742 				}
1743 				memcpy(p, ctx.optval, ctx.optlen);
1744 				*kernel_optval = p;
1745 			} else {
1746 				*kernel_optval = ctx.optval;
1747 			}
1748 			/* export and don't free sockopt buf */
1749 			return 0;
1750 		}
1751 	}
1752 
1753 out:
1754 	sockopt_free_buf(&ctx, &buf);
1755 	return ret;
1756 }
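/* Summary of the ctx.optlen contract implemented above, with an informal
 * BPF-side sketch: a cgroup/setsockopt program may set optlen to -1 to
 * bypass the kernel handler, leave it at 0 to keep the original user
 * buffer, or rewrite optval/optlen (within max_optlen) to feed the kernel
 * a modified value.  The program and constants below are illustrative:
 *
 *	SEC("cgroup/setsockopt")
 *	int block_mark(struct bpf_sockopt *ctx)
 *	{
 *		if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK)
 *			return 0;	// reject: setsockopt() fails with -EPERM
 *		ctx->optlen = 0;	// value untouched, use original user data
 *		return 1;		// allow
 *	}
 */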
1757 
1758 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1759 				       int optname, char __user *optval,
1760 				       int __user *optlen, int max_optlen,
1761 				       int retval)
1762 {
1763 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1764 	struct bpf_sockopt_buf buf = {};
1765 	struct bpf_sockopt_kern ctx = {
1766 		.sk = sk,
1767 		.level = level,
1768 		.optname = optname,
1769 		.current_task = current,
1770 	};
1771 	int ret;
1772 
1773 	ctx.optlen = max_optlen;
1774 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1775 	if (max_optlen < 0)
1776 		return max_optlen;
1777 
1778 	if (!retval) {
1779 		/* If kernel getsockopt finished successfully,
1780 		 * copy whatever was returned to the user back
1781 		 * into our temporary buffer. Set optlen to the
1782 		 * one that kernel returned as well to let
1783 		 * BPF programs inspect the value.
1784 		 */
1785 
1786 		if (get_user(ctx.optlen, optlen)) {
1787 			ret = -EFAULT;
1788 			goto out;
1789 		}
1790 
1791 		if (ctx.optlen < 0) {
1792 			ret = -EFAULT;
1793 			goto out;
1794 		}
1795 
1796 		if (copy_from_user(ctx.optval, optval,
1797 				   min(ctx.optlen, max_optlen)) != 0) {
1798 			ret = -EFAULT;
1799 			goto out;
1800 		}
1801 	}
1802 
1803 	lock_sock(sk);
1804 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1805 				    &ctx, bpf_prog_run, retval, NULL);
1806 	release_sock(sk);
1807 
1808 	if (ret < 0)
1809 		goto out;
1810 
1811 	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
1812 		ret = -EFAULT;
1813 		goto out;
1814 	}
1815 
1816 	if (ctx.optlen != 0) {
1817 		if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1818 		    put_user(ctx.optlen, optlen)) {
1819 			ret = -EFAULT;
1820 			goto out;
1821 		}
1822 	}
1823 
1824 out:
1825 	sockopt_free_buf(&ctx, &buf);
1826 	return ret;
1827 }
1828 
1829 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1830 					    int optname, void *optval,
1831 					    int *optlen, int retval)
1832 {
1833 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1834 	struct bpf_sockopt_kern ctx = {
1835 		.sk = sk,
1836 		.level = level,
1837 		.optname = optname,
1838 		.optlen = *optlen,
1839 		.optval = optval,
1840 		.optval_end = optval + *optlen,
1841 		.current_task = current,
1842 	};
1843 	int ret;
1844 
1845 	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
1846 	 * user data back into BPF buffer when retval != 0. This is
1847 	 * done as an optimization to avoid extra copy, assuming
1848 	 * kernel won't populate the data in case of an error.
1849 	 * Here we always pass the data and memset() should
1850 	 * be called if that data shouldn't be "exported".
1851 	 */
1852 
1853 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1854 				    &ctx, bpf_prog_run, retval, NULL);
1855 	if (ret < 0)
1856 		return ret;
1857 
1858 	if (ctx.optlen > *optlen)
1859 		return -EFAULT;
1860 
1861 	/* BPF programs can shrink the buffer; export the modified length.
1862 	 */
1863 	if (ctx.optlen != 0)
1864 		*optlen = ctx.optlen;
1865 
1866 	return ret;
1867 }
1868 #endif
1869 
1870 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1871 			      size_t *lenp)
1872 {
1873 	ssize_t tmp_ret = 0, ret;
1874 
1875 	if (dir->header.parent) {
1876 		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1877 		if (tmp_ret < 0)
1878 			return tmp_ret;
1879 	}
1880 
1881 	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1882 	if (ret < 0)
1883 		return ret;
1884 	*bufp += ret;
1885 	*lenp -= ret;
1886 	ret += tmp_ret;
1887 
1888 	/* Avoid leading slash. */
1889 	if (!ret)
1890 		return ret;
1891 
1892 	tmp_ret = strscpy(*bufp, "/", *lenp);
1893 	if (tmp_ret < 0)
1894 		return tmp_ret;
1895 	*bufp += tmp_ret;
1896 	*lenp -= tmp_ret;
1897 
1898 	return ret + tmp_ret;
1899 }
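
/* Editorial note: sysctl_cpy_dir() recurses to the root ctl_dir first and
 * appends each directory's procname plus a '/' on the way back, so the
 * path is emitted root-first.  A hedged worked example: for an access to
 * /proc/sys/net/ipv4/tcp_mem, the directory chain yields "net/ipv4/" in
 * *bufp (the root directory contributes an empty name, and the "Avoid
 * leading slash" check above suppresses its trailing '/'), after which
 * bpf_sysctl_get_name() below appends the table name "tcp_mem".
 */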
1900 
1901 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1902 	   size_t, buf_len, u64, flags)
1903 {
1904 	ssize_t tmp_ret = 0, ret;
1905 
1906 	if (!buf)
1907 		return -EINVAL;
1908 
1909 	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1910 		if (!ctx->head)
1911 			return -EINVAL;
1912 		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1913 		if (tmp_ret < 0)
1914 			return tmp_ret;
1915 	}
1916 
1917 	ret = strscpy(buf, ctx->table->procname, buf_len);
1918 
1919 	return ret < 0 ? ret : tmp_ret + ret;
1920 }
1921 
1922 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1923 	.func		= bpf_sysctl_get_name,
1924 	.gpl_only	= false,
1925 	.ret_type	= RET_INTEGER,
1926 	.arg1_type	= ARG_PTR_TO_CTX,
1927 	.arg2_type	= ARG_PTR_TO_MEM,
1928 	.arg3_type	= ARG_CONST_SIZE,
1929 	.arg4_type	= ARG_ANYTHING,
1930 };
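
/* Editorial note: a hedged, libbpf-style sketch of how a cgroup sysctl
 * program might use this helper; the program name is made up for
 * illustration:
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_filter(struct bpf_sysctl *ctx)
 *	{
 *		char name[64];
 *		int len;
 *
 *		// Path relative to /proc/sys, e.g. "net/ipv4/tcp_mem";
 *		// pass BPF_F_SYSCTL_BASE_NAME to get just "tcp_mem".
 *		len = bpf_sysctl_get_name(ctx, name, sizeof(name), 0);
 *		if (len < 0)
 *			return 1;	// allow on error
 *		...
 *		return 1;		// 1 = allow, 0 = reject with -EPERM
 *	}
 */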
1931 
1932 static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
1933 			     size_t src_len)
1934 {
1935 	if (!dst)
1936 		return -EINVAL;
1937 
1938 	if (!dst_len)
1939 		return -E2BIG;
1940 
1941 	if (!src || !src_len) {
1942 		memset(dst, 0, dst_len);
1943 		return -EINVAL;
1944 	}
1945 
1946 	memcpy(dst, src, min(dst_len, src_len));
1947 
1948 	if (dst_len > src_len) {
1949 		memset(dst + src_len, '\0', dst_len - src_len);
1950 		return src_len;
1951 	}
1952 
1953 	dst[dst_len - 1] = '\0';
1954 
1955 	return -E2BIG;
1956 }
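
/* Editorial note: return convention of copy_sysctl_value(), restated from
 * the code above: with no source value the destination is zeroed and
 * -EINVAL is returned; if the source fits, the copy is zero-padded and the
 * source length is returned; otherwise the copy is truncated, NUL
 * terminated and -E2BIG is returned.  Both getters below share it.
 */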
1957 
1958 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1959 	   char *, buf, size_t, buf_len)
1960 {
1961 	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1962 }
1963 
1964 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1965 	.func		= bpf_sysctl_get_current_value,
1966 	.gpl_only	= false,
1967 	.ret_type	= RET_INTEGER,
1968 	.arg1_type	= ARG_PTR_TO_CTX,
1969 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1970 	.arg3_type	= ARG_CONST_SIZE,
1971 };
1972 
1973 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1974 	   size_t, buf_len)
1975 {
1976 	if (!ctx->write) {
1977 		if (buf && buf_len)
1978 			memset(buf, '\0', buf_len);
1979 		return -EINVAL;
1980 	}
1981 	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1982 }
1983 
1984 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1985 	.func		= bpf_sysctl_get_new_value,
1986 	.gpl_only	= false,
1987 	.ret_type	= RET_INTEGER,
1988 	.arg1_type	= ARG_PTR_TO_CTX,
1989 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1990 	.arg3_type	= ARG_CONST_SIZE,
1991 };
1992 
1993 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1994 	   const char *, buf, size_t, buf_len)
1995 {
1996 	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1997 		return -EINVAL;
1998 
1999 	if (buf_len > PAGE_SIZE - 1)
2000 		return -E2BIG;
2001 
2002 	memcpy(ctx->new_val, buf, buf_len);
2003 	ctx->new_len = buf_len;
2004 	ctx->new_updated = 1;
2005 
2006 	return 0;
2007 }
2008 
2009 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
2010 	.func		= bpf_sysctl_set_new_value,
2011 	.gpl_only	= false,
2012 	.ret_type	= RET_INTEGER,
2013 	.arg1_type	= ARG_PTR_TO_CTX,
2014 	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
2015 	.arg3_type	= ARG_CONST_SIZE,
2016 };
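
/* Editorial note: a hedged sketch of overriding a sysctl write with the
 * helpers above; the buffer size and the replacement value are arbitrary:
 *
 *	char new[16];
 *
 *	if (ctx->write &&
 *	    bpf_sysctl_get_new_value(ctx, new, sizeof(new)) >= 0) {
 *		const char capped[] = "100";
 *
 *		// Replace whatever userspace wrote; the new value must fit
 *		// in PAGE_SIZE - 1 bytes (see bpf_sysctl_set_new_value()).
 *		bpf_sysctl_set_new_value(ctx, capped, sizeof(capped) - 1);
 *	}
 */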
2017 
2018 static const struct bpf_func_proto *
2019 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2020 {
2021 	switch (func_id) {
2022 	case BPF_FUNC_strtol:
2023 		return &bpf_strtol_proto;
2024 	case BPF_FUNC_strtoul:
2025 		return &bpf_strtoul_proto;
2026 	case BPF_FUNC_sysctl_get_name:
2027 		return &bpf_sysctl_get_name_proto;
2028 	case BPF_FUNC_sysctl_get_current_value:
2029 		return &bpf_sysctl_get_current_value_proto;
2030 	case BPF_FUNC_sysctl_get_new_value:
2031 		return &bpf_sysctl_get_new_value_proto;
2032 	case BPF_FUNC_sysctl_set_new_value:
2033 		return &bpf_sysctl_set_new_value_proto;
2034 	case BPF_FUNC_ktime_get_coarse_ns:
2035 		return &bpf_ktime_get_coarse_ns_proto;
2036 	default:
2037 		return cgroup_base_func_proto(func_id, prog);
2038 	}
2039 }
2040 
2041 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
2042 				   const struct bpf_prog *prog,
2043 				   struct bpf_insn_access_aux *info)
2044 {
2045 	const int size_default = sizeof(__u32);
2046 
2047 	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
2048 		return false;
2049 
2050 	switch (off) {
2051 	case bpf_ctx_range(struct bpf_sysctl, write):
2052 		if (type != BPF_READ)
2053 			return false;
2054 		bpf_ctx_record_field_size(info, size_default);
2055 		return bpf_ctx_narrow_access_ok(off, size, size_default);
2056 	case bpf_ctx_range(struct bpf_sysctl, file_pos):
2057 		if (type == BPF_READ) {
2058 			bpf_ctx_record_field_size(info, size_default);
2059 			return bpf_ctx_narrow_access_ok(off, size, size_default);
2060 		} else {
2061 			return size == size_default;
2062 		}
2063 	default:
2064 		return false;
2065 	}
2066 }
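
/* Editorial note, derived from sysctl_is_valid_access() above: only the
 * write and file_pos fields of struct bpf_sysctl are accessible.  write is
 * read-only and allows narrow (1/2-byte) loads; file_pos allows narrow
 * loads but may only be stored at its full 4-byte width.
 */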
2067 
2068 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
2069 				     const struct bpf_insn *si,
2070 				     struct bpf_insn *insn_buf,
2071 				     struct bpf_prog *prog, u32 *target_size)
2072 {
2073 	struct bpf_insn *insn = insn_buf;
2074 	u32 read_size;
2075 
2076 	switch (si->off) {
2077 	case offsetof(struct bpf_sysctl, write):
2078 		*insn++ = BPF_LDX_MEM(
2079 			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
2080 			bpf_target_off(struct bpf_sysctl_kern, write,
2081 				       sizeof_field(struct bpf_sysctl_kern,
2082 						    write),
2083 				       target_size));
2084 		break;
2085 	case offsetof(struct bpf_sysctl, file_pos):
2086 		/* ppos is a pointer so it should be accessed via indirect
2087 		 * loads and stores. For stores, an additional temporary
2088 		 * register is also used, since neither src_reg nor dst_reg
2089 		 * can be overridden.
2090 		 */
2091 		if (type == BPF_WRITE) {
2092 			int treg = BPF_REG_9;
2093 
2094 			if (si->src_reg == treg || si->dst_reg == treg)
2095 				--treg;
2096 			if (si->src_reg == treg || si->dst_reg == treg)
2097 				--treg;
2098 			*insn++ = BPF_STX_MEM(
2099 				BPF_DW, si->dst_reg, treg,
2100 				offsetof(struct bpf_sysctl_kern, tmp_reg));
2101 			*insn++ = BPF_LDX_MEM(
2102 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2103 				treg, si->dst_reg,
2104 				offsetof(struct bpf_sysctl_kern, ppos));
2105 			*insn++ = BPF_STX_MEM(
2106 				BPF_SIZEOF(u32), treg, si->src_reg,
2107 				bpf_ctx_narrow_access_offset(
2108 					0, sizeof(u32), sizeof(loff_t)));
2109 			*insn++ = BPF_LDX_MEM(
2110 				BPF_DW, treg, si->dst_reg,
2111 				offsetof(struct bpf_sysctl_kern, tmp_reg));
2112 		} else {
2113 			*insn++ = BPF_LDX_MEM(
2114 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2115 				si->dst_reg, si->src_reg,
2116 				offsetof(struct bpf_sysctl_kern, ppos));
2117 			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
2118 			*insn++ = BPF_LDX_MEM(
2119 				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
2120 				bpf_ctx_narrow_access_offset(
2121 					0, read_size, sizeof(loff_t)));
2122 		}
2123 		*target_size = sizeof(u32);
2124 		break;
2125 	}
2126 
2127 	return insn - insn_buf;
2128 }
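
/* Editorial note on the file_pos rewrite above: bpf_sysctl_kern only holds
 * a loff_t *ppos, so ctx->file_pos is translated into an indirect access.
 * For a store, a scratch register (BPF_REG_9, or a lower one if that
 * collides with src_reg/dst_reg) is spilled to tmp_reg, used to load ppos
 * and store the 32-bit value at the right offset within the loff_t, and
 * then restored.  Loads simply dereference ppos with a narrowed read.
 */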
2129 
2130 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
2131 	.get_func_proto		= sysctl_func_proto,
2132 	.is_valid_access	= sysctl_is_valid_access,
2133 	.convert_ctx_access	= sysctl_convert_ctx_access,
2134 };
2135 
2136 const struct bpf_prog_ops cg_sysctl_prog_ops = {
2137 };
2138 
2139 #ifdef CONFIG_NET
2140 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
2141 {
2142 	const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
2143 
2144 	return net->net_cookie;
2145 }
2146 
2147 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
2148 	.func		= bpf_get_netns_cookie_sockopt,
2149 	.gpl_only	= false,
2150 	.ret_type	= RET_INTEGER,
2151 	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
2152 };
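
/* Editorial note: ARG_PTR_TO_CTX_OR_NULL above means a sockopt program may
 * pass a NULL context, in which case the cookie of init_net is returned.
 * A hedged one-line sketch of typical use from a cgroup/{get,set}sockopt
 * program:
 *
 *	__u64 cookie = bpf_get_netns_cookie(ctx);
 */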
2153 #endif
2154 
2155 static const struct bpf_func_proto *
2156 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2157 {
2158 	switch (func_id) {
2159 #ifdef CONFIG_NET
2160 	case BPF_FUNC_get_netns_cookie:
2161 		return &bpf_get_netns_cookie_sockopt_proto;
2162 	case BPF_FUNC_sk_storage_get:
2163 		return &bpf_sk_storage_get_proto;
2164 	case BPF_FUNC_sk_storage_delete:
2165 		return &bpf_sk_storage_delete_proto;
2166 	case BPF_FUNC_setsockopt:
2167 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2168 			return &bpf_sk_setsockopt_proto;
2169 		return NULL;
2170 	case BPF_FUNC_getsockopt:
2171 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2172 			return &bpf_sk_getsockopt_proto;
2173 		return NULL;
2174 #endif
2175 #ifdef CONFIG_INET
2176 	case BPF_FUNC_tcp_sock:
2177 		return &bpf_tcp_sock_proto;
2178 #endif
2179 	default:
2180 		return cgroup_base_func_proto(func_id, prog);
2181 	}
2182 }
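
/* Editorial note, restating the switch above: the bpf_setsockopt() and
 * bpf_getsockopt() helpers are only handed out when the program's
 * expected_attach_type is BPF_CGROUP_SETSOCKOPT; at this revision,
 * getsockopt-attached programs get neither.  The other NET/INET helpers
 * are offered unconditionally here, and everything else falls back to
 * cgroup_base_func_proto().
 */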
2183 
2184 static bool cg_sockopt_is_valid_access(int off, int size,
2185 				       enum bpf_access_type type,
2186 				       const struct bpf_prog *prog,
2187 				       struct bpf_insn_access_aux *info)
2188 {
2189 	const int size_default = sizeof(__u32);
2190 
2191 	if (off < 0 || off >= sizeof(struct bpf_sockopt))
2192 		return false;
2193 
2194 	if (off % size != 0)
2195 		return false;
2196 
2197 	if (type == BPF_WRITE) {
2198 		switch (off) {
2199 		case offsetof(struct bpf_sockopt, retval):
2200 			if (size != size_default)
2201 				return false;
2202 			return prog->expected_attach_type ==
2203 				BPF_CGROUP_GETSOCKOPT;
2204 		case offsetof(struct bpf_sockopt, optname):
2205 			fallthrough;
2206 		case offsetof(struct bpf_sockopt, level):
2207 			if (size != size_default)
2208 				return false;
2209 			return prog->expected_attach_type ==
2210 				BPF_CGROUP_SETSOCKOPT;
2211 		case offsetof(struct bpf_sockopt, optlen):
2212 			return size == size_default;
2213 		default:
2214 			return false;
2215 		}
2216 	}
2217 
2218 	switch (off) {
2219 	case offsetof(struct bpf_sockopt, sk):
2220 		if (size != sizeof(__u64))
2221 			return false;
2222 		info->reg_type = PTR_TO_SOCKET;
2223 		break;
2224 	case offsetof(struct bpf_sockopt, optval):
2225 		if (size != sizeof(__u64))
2226 			return false;
2227 		info->reg_type = PTR_TO_PACKET;
2228 		break;
2229 	case offsetof(struct bpf_sockopt, optval_end):
2230 		if (size != sizeof(__u64))
2231 			return false;
2232 		info->reg_type = PTR_TO_PACKET_END;
2233 		break;
2234 	case offsetof(struct bpf_sockopt, retval):
2235 		if (size != size_default)
2236 			return false;
2237 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2238 	default:
2239 		if (size != size_default)
2240 			return false;
2241 		break;
2242 	}
2243 	return true;
2244 }
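
/* Editorial note: optval/optval_end are exposed as PTR_TO_PACKET /
 * PTR_TO_PACKET_END above, so programs must bounds-check before touching
 * option data, much like skb data pointers.  Writes to retval are only
 * valid for getsockopt programs, and writes to level/optname only for
 * setsockopt programs, matching the checks above.  A hedged, libbpf-style
 * sketch of the usual bounds-check pattern (names made up):
 *
 *	SEC("cgroup/getsockopt")
 *	int getsockopt_filter(struct bpf_sockopt *ctx)
 *	{
 *		__u8 *optval = ctx->optval;
 *		__u8 *optval_end = ctx->optval_end;
 *
 *		if (optval + 1 > optval_end)
 *			return 1;	// not enough data, let it through
 *
 *		// optval[0] may now be read (and written) safely.
 *		return 1;		// 1 = allow, 0 = reject with -EPERM
 *	}
 */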
2245 
2246 #define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
2247 	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
2248 	  si->dst_reg, si->src_reg,					\
2249 	  offsetof(struct bpf_sockopt_kern, F))
2250 
2251 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
2252 					 const struct bpf_insn *si,
2253 					 struct bpf_insn *insn_buf,
2254 					 struct bpf_prog *prog,
2255 					 u32 *target_size)
2256 {
2257 	struct bpf_insn *insn = insn_buf;
2258 
2259 	switch (si->off) {
2260 	case offsetof(struct bpf_sockopt, sk):
2261 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
2262 		break;
2263 	case offsetof(struct bpf_sockopt, level):
2264 		if (type == BPF_WRITE)
2265 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
2266 		else
2267 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
2268 		break;
2269 	case offsetof(struct bpf_sockopt, optname):
2270 		if (type == BPF_WRITE)
2271 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
2272 		else
2273 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
2274 		break;
2275 	case offsetof(struct bpf_sockopt, optlen):
2276 		if (type == BPF_WRITE)
2277 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
2278 		else
2279 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
2280 		break;
2281 	case offsetof(struct bpf_sockopt, retval):
2282 		BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
2283 
2284 		if (type == BPF_WRITE) {
2285 			int treg = BPF_REG_9;
2286 
2287 			if (si->src_reg == treg || si->dst_reg == treg)
2288 				--treg;
2289 			if (si->src_reg == treg || si->dst_reg == treg)
2290 				--treg;
2291 			*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
2292 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2293 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2294 					      treg, si->dst_reg,
2295 					      offsetof(struct bpf_sockopt_kern, current_task));
2296 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2297 					      treg, treg,
2298 					      offsetof(struct task_struct, bpf_ctx));
2299 			*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2300 					      treg, si->src_reg,
2301 					      offsetof(struct bpf_cg_run_ctx, retval));
2302 			*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
2303 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2304 		} else {
2305 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2306 					      si->dst_reg, si->src_reg,
2307 					      offsetof(struct bpf_sockopt_kern, current_task));
2308 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2309 					      si->dst_reg, si->dst_reg,
2310 					      offsetof(struct task_struct, bpf_ctx));
2311 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2312 					      si->dst_reg, si->dst_reg,
2313 					      offsetof(struct bpf_cg_run_ctx, retval));
2314 		}
2315 		break;
2316 	case offsetof(struct bpf_sockopt, optval):
2317 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
2318 		break;
2319 	case offsetof(struct bpf_sockopt, optval_end):
2320 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
2321 		break;
2322 	}
2323 
2324 	return insn - insn_buf;
2325 }
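
/* Editorial note on the retval rewrite above: struct bpf_sockopt_kern has
 * no retval member.  Accesses to struct bpf_sockopt::retval are therefore
 * redirected through ctx->current_task->bpf_ctx, which points at the
 * bpf_cg_run_ctx installed by bpf_prog_run_array_cg(); the BUILD_BUG_ON
 * relies on run_ctx being its first member so that the container can be
 * reached without an extra offset.  Writes additionally need the tmp_reg
 * spill/fill dance because a scratch register is required and neither
 * src_reg nor dst_reg may be clobbered.
 */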
2326 
2327 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
2328 				   bool direct_write,
2329 				   const struct bpf_prog *prog)
2330 {
2331 	/* Nothing to do for the sockopt argument. The data is kzalloc()'ed.
2332 	 */
2333 	return 0;
2334 }
2335 
2336 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2337 	.get_func_proto		= cg_sockopt_func_proto,
2338 	.is_valid_access	= cg_sockopt_is_valid_access,
2339 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
2340 	.gen_prologue		= cg_sockopt_get_prologue,
2341 };
2342 
2343 const struct bpf_prog_ops cg_sockopt_prog_ops = {
2344 };
2345