xref: /openbmc/linux/kernel/bpf/cgroup.c (revision 83b41bb27b25b4b972d03b593e8da7e3dd5735aa)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Functions to manage eBPF programs attached to cgroups
4  *
5  * Copyright (c) 2016 Daniel Mack
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <net/sock.h>
18 #include <net/bpf_sk_storage.h>
19 
20 #include "../cgroup/cgroup-internal.h"
21 
/* Per-attach-type static keys. Within this file a key is incremented when a
 * new entry is attached to a cgroup's program list and decremented when an
 * entry is detached or released.
 */
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 
25 /* __always_inline is necessary to prevent indirect call through run_prog
26  * function pointer.
27  */
28 static __always_inline int
29 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
30 		      enum cgroup_bpf_attach_type atype,
31 		      const void *ctx, bpf_prog_run_fn run_prog,
32 		      int retval, u32 *ret_flags)
33 {
34 	const struct bpf_prog_array_item *item;
35 	const struct bpf_prog *prog;
36 	const struct bpf_prog_array *array;
37 	struct bpf_run_ctx *old_run_ctx;
38 	struct bpf_cg_run_ctx run_ctx;
39 	u32 func_ret;
40 
41 	run_ctx.retval = retval;
42 	migrate_disable();
43 	rcu_read_lock();
44 	array = rcu_dereference(cgrp->effective[atype]);
45 	item = &array->items[0];
46 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
47 	while ((prog = READ_ONCE(item->prog))) {
48 		run_ctx.prog_item = item;
49 		func_ret = run_prog(prog, ctx);
50 		if (ret_flags) {
51 			*(ret_flags) |= (func_ret >> 1);
52 			func_ret &= 1;
53 		}
54 		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
55 			run_ctx.retval = -EPERM;
56 		item++;
57 	}
58 	bpf_reset_run_ctx(old_run_ctx);
59 	rcu_read_unlock();
60 	migrate_enable();
61 	return run_ctx.retval;
62 }
63 
64 void cgroup_bpf_offline(struct cgroup *cgrp)
65 {
66 	cgroup_get(cgrp);
67 	percpu_ref_kill(&cgrp->bpf.refcnt);
68 }
69 
70 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
71 {
72 	enum bpf_cgroup_storage_type stype;
73 
74 	for_each_cgroup_storage_type(stype)
75 		bpf_cgroup_storage_free(storages[stype]);
76 }
77 
78 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
79 				     struct bpf_cgroup_storage *new_storages[],
80 				     enum bpf_attach_type type,
81 				     struct bpf_prog *prog,
82 				     struct cgroup *cgrp)
83 {
84 	enum bpf_cgroup_storage_type stype;
85 	struct bpf_cgroup_storage_key key;
86 	struct bpf_map *map;
87 
88 	key.cgroup_inode_id = cgroup_id(cgrp);
89 	key.attach_type = type;
90 
91 	for_each_cgroup_storage_type(stype) {
92 		map = prog->aux->cgroup_storage[stype];
93 		if (!map)
94 			continue;
95 
96 		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
97 		if (storages[stype])
98 			continue;
99 
100 		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
101 		if (IS_ERR(storages[stype])) {
102 			bpf_cgroup_storages_free(new_storages);
103 			return -ENOMEM;
104 		}
105 
106 		new_storages[stype] = storages[stype];
107 	}
108 
109 	return 0;
110 }
111 
112 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
113 				       struct bpf_cgroup_storage *src[])
114 {
115 	enum bpf_cgroup_storage_type stype;
116 
117 	for_each_cgroup_storage_type(stype)
118 		dst[stype] = src[stype];
119 }
120 
121 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
122 				     struct cgroup *cgrp,
123 				     enum bpf_attach_type attach_type)
124 {
125 	enum bpf_cgroup_storage_type stype;
126 
127 	for_each_cgroup_storage_type(stype)
128 		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
129 }
130 
131 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
132  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
133  * doesn't free link memory, which will eventually be done by bpf_link's
134  * release() callback, when its last FD is closed.
135  */
136 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
137 {
138 	cgroup_put(link->cgroup);
139 	link->cgroup = NULL;
140 }
141 
142 /**
143  * cgroup_bpf_release() - put references of all bpf programs and
144  *                        release all cgroup bpf data
145  * @work: work structure embedded into the cgroup to modify
146  */
147 static void cgroup_bpf_release(struct work_struct *work)
148 {
149 	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
150 					       bpf.release_work);
151 	struct bpf_prog_array *old_array;
152 	struct list_head *storages = &cgrp->bpf.storages;
153 	struct bpf_cgroup_storage *storage, *stmp;
154 
155 	unsigned int atype;
156 
157 	mutex_lock(&cgroup_mutex);
158 
159 	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
160 		struct list_head *progs = &cgrp->bpf.progs[atype];
161 		struct bpf_prog_list *pl, *pltmp;
162 
163 		list_for_each_entry_safe(pl, pltmp, progs, node) {
164 			list_del(&pl->node);
165 			if (pl->prog)
166 				bpf_prog_put(pl->prog);
167 			if (pl->link)
168 				bpf_cgroup_link_auto_detach(pl->link);
169 			kfree(pl);
170 			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
171 		}
172 		old_array = rcu_dereference_protected(
173 				cgrp->bpf.effective[atype],
174 				lockdep_is_held(&cgroup_mutex));
175 		bpf_prog_array_free(old_array);
176 	}
177 
178 	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
179 		bpf_cgroup_storage_unlink(storage);
180 		bpf_cgroup_storage_free(storage);
181 	}
182 
183 	mutex_unlock(&cgroup_mutex);
184 
185 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
186 		cgroup_bpf_put(p);
187 
188 	percpu_ref_exit(&cgrp->bpf.refcnt);
189 	cgroup_put(cgrp);
190 }
191 
192 /**
193  * cgroup_bpf_release_fn() - callback used to schedule releasing
194  *                           of bpf cgroup data
195  * @ref: percpu ref counter structure
196  */
197 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
198 {
199 	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
200 
201 	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
202 	queue_work(system_wq, &cgrp->bpf.release_work);
203 }
204 
205 /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
206  * link or direct prog.
207  */
208 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
209 {
210 	if (pl->prog)
211 		return pl->prog;
212 	if (pl->link)
213 		return pl->link->link.prog;
214 	return NULL;
215 }
216 
217 /* count number of elements in the list.
218  * it's slow but the list cannot be long
219  */
220 static u32 prog_list_length(struct list_head *head)
221 {
222 	struct bpf_prog_list *pl;
223 	u32 cnt = 0;
224 
225 	list_for_each_entry(pl, head, node) {
226 		if (!prog_list_prog(pl))
227 			continue;
228 		cnt++;
229 	}
230 	return cnt;
231 }
232 
233 /* if parent has non-overridable prog attached,
234  * disallow attaching new programs to the descendent cgroup.
235  * if parent has overridable or multi-prog, allow attaching
236  */
237 static bool hierarchy_allows_attach(struct cgroup *cgrp,
238 				    enum cgroup_bpf_attach_type atype)
239 {
240 	struct cgroup *p;
241 
242 	p = cgroup_parent(cgrp);
243 	if (!p)
244 		return true;
245 	do {
246 		u32 flags = p->bpf.flags[atype];
247 		u32 cnt;
248 
249 		if (flags & BPF_F_ALLOW_MULTI)
250 			return true;
251 		cnt = prog_list_length(&p->bpf.progs[atype]);
252 		WARN_ON_ONCE(cnt > 1);
253 		if (cnt == 1)
254 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
255 		p = cgroup_parent(p);
256 	} while (p);
257 	return true;
258 }
259 
260 /* compute a chain of effective programs for a given cgroup:
261  * start from the list of programs in this cgroup and add
262  * all parent programs.
263  * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
264  * to programs in this cgroup
265  */
266 static int compute_effective_progs(struct cgroup *cgrp,
267 				   enum cgroup_bpf_attach_type atype,
268 				   struct bpf_prog_array **array)
269 {
270 	struct bpf_prog_array_item *item;
271 	struct bpf_prog_array *progs;
272 	struct bpf_prog_list *pl;
273 	struct cgroup *p = cgrp;
274 	int cnt = 0;
275 
276 	/* count number of effective programs by walking parents */
277 	do {
278 		if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
279 			cnt += prog_list_length(&p->bpf.progs[atype]);
280 		p = cgroup_parent(p);
281 	} while (p);
282 
283 	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
284 	if (!progs)
285 		return -ENOMEM;
286 
287 	/* populate the array with effective progs */
288 	cnt = 0;
289 	p = cgrp;
290 	do {
291 		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
292 			continue;
293 
294 		list_for_each_entry(pl, &p->bpf.progs[atype], node) {
295 			if (!prog_list_prog(pl))
296 				continue;
297 
298 			item = &progs->items[cnt];
299 			item->prog = prog_list_prog(pl);
300 			bpf_cgroup_storages_assign(item->cgroup_storage,
301 						   pl->storage);
302 			cnt++;
303 		}
304 	} while ((p = cgroup_parent(p)));
305 
306 	*array = progs;
307 	return 0;
308 }
309 
310 static void activate_effective_progs(struct cgroup *cgrp,
311 				     enum cgroup_bpf_attach_type atype,
312 				     struct bpf_prog_array *old_array)
313 {
314 	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
315 					lockdep_is_held(&cgroup_mutex));
316 	/* free prog array after grace period, since __cgroup_bpf_run_*()
317 	 * might be still walking the array
318 	 */
319 	bpf_prog_array_free(old_array);
320 }
321 
322 /**
323  * cgroup_bpf_inherit() - inherit effective programs from parent
324  * @cgrp: the cgroup to modify
325  */
326 int cgroup_bpf_inherit(struct cgroup *cgrp)
327 {
328 /* has to use marco instead of const int, since compiler thinks
329  * that array below is variable length
330  */
331 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
332 	struct bpf_prog_array *arrays[NR] = {};
333 	struct cgroup *p;
334 	int ret, i;
335 
336 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
337 			      GFP_KERNEL);
338 	if (ret)
339 		return ret;
340 
341 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
342 		cgroup_bpf_get(p);
343 
344 	for (i = 0; i < NR; i++)
345 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
346 
347 	INIT_LIST_HEAD(&cgrp->bpf.storages);
348 
349 	for (i = 0; i < NR; i++)
350 		if (compute_effective_progs(cgrp, i, &arrays[i]))
351 			goto cleanup;
352 
353 	for (i = 0; i < NR; i++)
354 		activate_effective_progs(cgrp, i, arrays[i]);
355 
356 	return 0;
357 cleanup:
358 	for (i = 0; i < NR; i++)
359 		bpf_prog_array_free(arrays[i]);
360 
361 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
362 		cgroup_bpf_put(p);
363 
364 	percpu_ref_exit(&cgrp->bpf.refcnt);
365 
366 	return -ENOMEM;
367 }
368 
369 static int update_effective_progs(struct cgroup *cgrp,
370 				  enum cgroup_bpf_attach_type atype)
371 {
372 	struct cgroup_subsys_state *css;
373 	int err;
374 
375 	/* allocate and recompute effective prog arrays */
376 	css_for_each_descendant_pre(css, &cgrp->self) {
377 		struct cgroup *desc = container_of(css, struct cgroup, self);
378 
379 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
380 			continue;
381 
382 		err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
383 		if (err)
384 			goto cleanup;
385 	}
386 
387 	/* all allocations were successful. Activate all prog arrays */
388 	css_for_each_descendant_pre(css, &cgrp->self) {
389 		struct cgroup *desc = container_of(css, struct cgroup, self);
390 
391 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
392 			if (unlikely(desc->bpf.inactive)) {
393 				bpf_prog_array_free(desc->bpf.inactive);
394 				desc->bpf.inactive = NULL;
395 			}
396 			continue;
397 		}
398 
399 		activate_effective_progs(desc, atype, desc->bpf.inactive);
400 		desc->bpf.inactive = NULL;
401 	}
402 
403 	return 0;
404 
405 cleanup:
406 	/* oom while computing effective. Free all computed effective arrays
407 	 * since they were not activated
408 	 */
409 	css_for_each_descendant_pre(css, &cgrp->self) {
410 		struct cgroup *desc = container_of(css, struct cgroup, self);
411 
412 		bpf_prog_array_free(desc->bpf.inactive);
413 		desc->bpf.inactive = NULL;
414 	}
415 
416 	return err;
417 }
418 
/* Cap on the number of programs in one cgroup's per-attach-type list;
 * __cgroup_bpf_attach() fails with -E2BIG once the list has reached it.
 */
#define BPF_CGROUP_MAX_PROGS 64
420 
421 static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
422 					       struct bpf_prog *prog,
423 					       struct bpf_cgroup_link *link,
424 					       struct bpf_prog *replace_prog,
425 					       bool allow_multi)
426 {
427 	struct bpf_prog_list *pl;
428 
429 	/* single-attach case */
430 	if (!allow_multi) {
431 		if (list_empty(progs))
432 			return NULL;
433 		return list_first_entry(progs, typeof(*pl), node);
434 	}
435 
436 	list_for_each_entry(pl, progs, node) {
437 		if (prog && pl->prog == prog && prog != replace_prog)
438 			/* disallow attaching the same prog twice */
439 			return ERR_PTR(-EINVAL);
440 		if (link && pl->link == link)
441 			/* disallow attaching the same link twice */
442 			return ERR_PTR(-EINVAL);
443 	}
444 
445 	/* direct prog multi-attach w/ replacement case */
446 	if (replace_prog) {
447 		list_for_each_entry(pl, progs, node) {
448 			if (pl->prog == replace_prog)
449 				/* a match found */
450 				return pl;
451 		}
452 		/* prog to replace not found for cgroup */
453 		return ERR_PTR(-ENOENT);
454 	}
455 
456 	return NULL;
457 }
458 
459 /**
460  * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
461  *                         propagate the change to descendants
462  * @cgrp: The cgroup which descendants to traverse
463  * @prog: A program to attach
464  * @link: A link to attach
465  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
466  * @type: Type of attach operation
467  * @flags: Option flags
468  *
469  * Exactly one of @prog or @link can be non-null.
470  * Must be called with cgroup_mutex held.
471  */
472 static int __cgroup_bpf_attach(struct cgroup *cgrp,
473 			       struct bpf_prog *prog, struct bpf_prog *replace_prog,
474 			       struct bpf_cgroup_link *link,
475 			       enum bpf_attach_type type, u32 flags)
476 {
477 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
478 	struct bpf_prog *old_prog = NULL;
479 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
480 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
481 	enum cgroup_bpf_attach_type atype;
482 	struct bpf_prog_list *pl;
483 	struct list_head *progs;
484 	int err;
485 
486 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
487 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
488 		/* invalid combination */
489 		return -EINVAL;
490 	if (link && (prog || replace_prog))
491 		/* only either link or prog/replace_prog can be specified */
492 		return -EINVAL;
493 	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
494 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
495 		return -EINVAL;
496 
497 	atype = to_cgroup_bpf_attach_type(type);
498 	if (atype < 0)
499 		return -EINVAL;
500 
501 	progs = &cgrp->bpf.progs[atype];
502 
503 	if (!hierarchy_allows_attach(cgrp, atype))
504 		return -EPERM;
505 
506 	if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
507 		/* Disallow attaching non-overridable on top
508 		 * of existing overridable in this cgroup.
509 		 * Disallow attaching multi-prog if overridable or none
510 		 */
511 		return -EPERM;
512 
513 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
514 		return -E2BIG;
515 
516 	pl = find_attach_entry(progs, prog, link, replace_prog,
517 			       flags & BPF_F_ALLOW_MULTI);
518 	if (IS_ERR(pl))
519 		return PTR_ERR(pl);
520 
521 	if (bpf_cgroup_storages_alloc(storage, new_storage, type,
522 				      prog ? : link->link.prog, cgrp))
523 		return -ENOMEM;
524 
525 	if (pl) {
526 		old_prog = pl->prog;
527 	} else {
528 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
529 		if (!pl) {
530 			bpf_cgroup_storages_free(new_storage);
531 			return -ENOMEM;
532 		}
533 		list_add_tail(&pl->node, progs);
534 	}
535 
536 	pl->prog = prog;
537 	pl->link = link;
538 	bpf_cgroup_storages_assign(pl->storage, storage);
539 	cgrp->bpf.flags[atype] = saved_flags;
540 
541 	err = update_effective_progs(cgrp, atype);
542 	if (err)
543 		goto cleanup;
544 
545 	if (old_prog)
546 		bpf_prog_put(old_prog);
547 	else
548 		static_branch_inc(&cgroup_bpf_enabled_key[atype]);
549 	bpf_cgroup_storages_link(new_storage, cgrp, type);
550 	return 0;
551 
552 cleanup:
553 	if (old_prog) {
554 		pl->prog = old_prog;
555 		pl->link = NULL;
556 	}
557 	bpf_cgroup_storages_free(new_storage);
558 	if (!old_prog) {
559 		list_del(&pl->node);
560 		kfree(pl);
561 	}
562 	return err;
563 }
564 
565 static int cgroup_bpf_attach(struct cgroup *cgrp,
566 			     struct bpf_prog *prog, struct bpf_prog *replace_prog,
567 			     struct bpf_cgroup_link *link,
568 			     enum bpf_attach_type type,
569 			     u32 flags)
570 {
571 	int ret;
572 
573 	mutex_lock(&cgroup_mutex);
574 	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
575 	mutex_unlock(&cgroup_mutex);
576 	return ret;
577 }
578 
579 /* Swap updated BPF program for given link in effective program arrays across
580  * all descendant cgroups. This function is guaranteed to succeed.
581  */
582 static void replace_effective_prog(struct cgroup *cgrp,
583 				   enum cgroup_bpf_attach_type atype,
584 				   struct bpf_cgroup_link *link)
585 {
586 	struct bpf_prog_array_item *item;
587 	struct cgroup_subsys_state *css;
588 	struct bpf_prog_array *progs;
589 	struct bpf_prog_list *pl;
590 	struct list_head *head;
591 	struct cgroup *cg;
592 	int pos;
593 
594 	css_for_each_descendant_pre(css, &cgrp->self) {
595 		struct cgroup *desc = container_of(css, struct cgroup, self);
596 
597 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
598 			continue;
599 
600 		/* find position of link in effective progs array */
601 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
602 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
603 				continue;
604 
605 			head = &cg->bpf.progs[atype];
606 			list_for_each_entry(pl, head, node) {
607 				if (!prog_list_prog(pl))
608 					continue;
609 				if (pl->link == link)
610 					goto found;
611 				pos++;
612 			}
613 		}
614 found:
615 		BUG_ON(!cg);
616 		progs = rcu_dereference_protected(
617 				desc->bpf.effective[atype],
618 				lockdep_is_held(&cgroup_mutex));
619 		item = &progs->items[pos];
620 		WRITE_ONCE(item->prog, link->link.prog);
621 	}
622 }
623 
624 /**
625  * __cgroup_bpf_replace() - Replace link's program and propagate the change
626  *                          to descendants
627  * @cgrp: The cgroup which descendants to traverse
628  * @link: A link for which to replace BPF program
629  * @type: Type of attach operation
630  *
631  * Must be called with cgroup_mutex held.
632  */
633 static int __cgroup_bpf_replace(struct cgroup *cgrp,
634 				struct bpf_cgroup_link *link,
635 				struct bpf_prog *new_prog)
636 {
637 	enum cgroup_bpf_attach_type atype;
638 	struct bpf_prog *old_prog;
639 	struct bpf_prog_list *pl;
640 	struct list_head *progs;
641 	bool found = false;
642 
643 	atype = to_cgroup_bpf_attach_type(link->type);
644 	if (atype < 0)
645 		return -EINVAL;
646 
647 	progs = &cgrp->bpf.progs[atype];
648 
649 	if (link->link.prog->type != new_prog->type)
650 		return -EINVAL;
651 
652 	list_for_each_entry(pl, progs, node) {
653 		if (pl->link == link) {
654 			found = true;
655 			break;
656 		}
657 	}
658 	if (!found)
659 		return -ENOENT;
660 
661 	old_prog = xchg(&link->link.prog, new_prog);
662 	replace_effective_prog(cgrp, atype, link);
663 	bpf_prog_put(old_prog);
664 	return 0;
665 }
666 
667 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
668 			      struct bpf_prog *old_prog)
669 {
670 	struct bpf_cgroup_link *cg_link;
671 	int ret;
672 
673 	cg_link = container_of(link, struct bpf_cgroup_link, link);
674 
675 	mutex_lock(&cgroup_mutex);
676 	/* link might have been auto-released by dying cgroup, so fail */
677 	if (!cg_link->cgroup) {
678 		ret = -ENOLINK;
679 		goto out_unlock;
680 	}
681 	if (old_prog && link->prog != old_prog) {
682 		ret = -EPERM;
683 		goto out_unlock;
684 	}
685 	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
686 out_unlock:
687 	mutex_unlock(&cgroup_mutex);
688 	return ret;
689 }
690 
691 static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
692 					       struct bpf_prog *prog,
693 					       struct bpf_cgroup_link *link,
694 					       bool allow_multi)
695 {
696 	struct bpf_prog_list *pl;
697 
698 	if (!allow_multi) {
699 		if (list_empty(progs))
700 			/* report error when trying to detach and nothing is attached */
701 			return ERR_PTR(-ENOENT);
702 
703 		/* to maintain backward compatibility NONE and OVERRIDE cgroups
704 		 * allow detaching with invalid FD (prog==NULL) in legacy mode
705 		 */
706 		return list_first_entry(progs, typeof(*pl), node);
707 	}
708 
709 	if (!prog && !link)
710 		/* to detach MULTI prog the user has to specify valid FD
711 		 * of the program or link to be detached
712 		 */
713 		return ERR_PTR(-EINVAL);
714 
715 	/* find the prog or link and detach it */
716 	list_for_each_entry(pl, progs, node) {
717 		if (pl->prog == prog && pl->link == link)
718 			return pl;
719 	}
720 	return ERR_PTR(-ENOENT);
721 }
722 
723 /**
724  * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
725  *                         propagate the change to descendants
726  * @cgrp: The cgroup which descendants to traverse
727  * @prog: A program to detach or NULL
728  * @link: A link to detach or NULL
729  * @type: Type of detach operation
730  *
731  * At most one of @prog or @link can be non-NULL.
732  * Must be called with cgroup_mutex held.
733  */
734 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
735 			       struct bpf_cgroup_link *link, enum bpf_attach_type type)
736 {
737 	enum cgroup_bpf_attach_type atype;
738 	struct bpf_prog *old_prog;
739 	struct bpf_prog_list *pl;
740 	struct list_head *progs;
741 	u32 flags;
742 	int err;
743 
744 	atype = to_cgroup_bpf_attach_type(type);
745 	if (atype < 0)
746 		return -EINVAL;
747 
748 	progs = &cgrp->bpf.progs[atype];
749 	flags = cgrp->bpf.flags[atype];
750 
751 	if (prog && link)
752 		/* only one of prog or link can be specified */
753 		return -EINVAL;
754 
755 	pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
756 	if (IS_ERR(pl))
757 		return PTR_ERR(pl);
758 
759 	/* mark it deleted, so it's ignored while recomputing effective */
760 	old_prog = pl->prog;
761 	pl->prog = NULL;
762 	pl->link = NULL;
763 
764 	err = update_effective_progs(cgrp, atype);
765 	if (err)
766 		goto cleanup;
767 
768 	/* now can actually delete it from this cgroup list */
769 	list_del(&pl->node);
770 	kfree(pl);
771 	if (list_empty(progs))
772 		/* last program was detached, reset flags to zero */
773 		cgrp->bpf.flags[atype] = 0;
774 	if (old_prog)
775 		bpf_prog_put(old_prog);
776 	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
777 	return 0;
778 
779 cleanup:
780 	/* restore back prog or link */
781 	pl->prog = old_prog;
782 	pl->link = link;
783 	return err;
784 }
785 
786 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
787 			     enum bpf_attach_type type)
788 {
789 	int ret;
790 
791 	mutex_lock(&cgroup_mutex);
792 	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
793 	mutex_unlock(&cgroup_mutex);
794 	return ret;
795 }
796 
797 /* Must be called with cgroup_mutex held to avoid races. */
798 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
799 			      union bpf_attr __user *uattr)
800 {
801 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
802 	enum bpf_attach_type type = attr->query.attach_type;
803 	enum cgroup_bpf_attach_type atype;
804 	struct bpf_prog_array *effective;
805 	struct list_head *progs;
806 	struct bpf_prog *prog;
807 	int cnt, ret = 0, i;
808 	u32 flags;
809 
810 	atype = to_cgroup_bpf_attach_type(type);
811 	if (atype < 0)
812 		return -EINVAL;
813 
814 	progs = &cgrp->bpf.progs[atype];
815 	flags = cgrp->bpf.flags[atype];
816 
817 	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
818 					      lockdep_is_held(&cgroup_mutex));
819 
820 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
821 		cnt = bpf_prog_array_length(effective);
822 	else
823 		cnt = prog_list_length(progs);
824 
825 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
826 		return -EFAULT;
827 	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
828 		return -EFAULT;
829 	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
830 		/* return early if user requested only program count + flags */
831 		return 0;
832 	if (attr->query.prog_cnt < cnt) {
833 		cnt = attr->query.prog_cnt;
834 		ret = -ENOSPC;
835 	}
836 
837 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
838 		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
839 	} else {
840 		struct bpf_prog_list *pl;
841 		u32 id;
842 
843 		i = 0;
844 		list_for_each_entry(pl, progs, node) {
845 			prog = prog_list_prog(pl);
846 			id = prog->aux->id;
847 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
848 				return -EFAULT;
849 			if (++i == cnt)
850 				break;
851 		}
852 	}
853 	return ret;
854 }
855 
856 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
857 			    union bpf_attr __user *uattr)
858 {
859 	int ret;
860 
861 	mutex_lock(&cgroup_mutex);
862 	ret = __cgroup_bpf_query(cgrp, attr, uattr);
863 	mutex_unlock(&cgroup_mutex);
864 	return ret;
865 }
866 
867 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
868 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
869 {
870 	struct bpf_prog *replace_prog = NULL;
871 	struct cgroup *cgrp;
872 	int ret;
873 
874 	cgrp = cgroup_get_from_fd(attr->target_fd);
875 	if (IS_ERR(cgrp))
876 		return PTR_ERR(cgrp);
877 
878 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
879 	    (attr->attach_flags & BPF_F_REPLACE)) {
880 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
881 		if (IS_ERR(replace_prog)) {
882 			cgroup_put(cgrp);
883 			return PTR_ERR(replace_prog);
884 		}
885 	}
886 
887 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
888 				attr->attach_type, attr->attach_flags);
889 
890 	if (replace_prog)
891 		bpf_prog_put(replace_prog);
892 	cgroup_put(cgrp);
893 	return ret;
894 }
895 
896 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
897 {
898 	struct bpf_prog *prog;
899 	struct cgroup *cgrp;
900 	int ret;
901 
902 	cgrp = cgroup_get_from_fd(attr->target_fd);
903 	if (IS_ERR(cgrp))
904 		return PTR_ERR(cgrp);
905 
906 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
907 	if (IS_ERR(prog))
908 		prog = NULL;
909 
910 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
911 	if (prog)
912 		bpf_prog_put(prog);
913 
914 	cgroup_put(cgrp);
915 	return ret;
916 }
917 
918 static void bpf_cgroup_link_release(struct bpf_link *link)
919 {
920 	struct bpf_cgroup_link *cg_link =
921 		container_of(link, struct bpf_cgroup_link, link);
922 	struct cgroup *cg;
923 
924 	/* link might have been auto-detached by dying cgroup already,
925 	 * in that case our work is done here
926 	 */
927 	if (!cg_link->cgroup)
928 		return;
929 
930 	mutex_lock(&cgroup_mutex);
931 
932 	/* re-check cgroup under lock again */
933 	if (!cg_link->cgroup) {
934 		mutex_unlock(&cgroup_mutex);
935 		return;
936 	}
937 
938 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
939 				    cg_link->type));
940 
941 	cg = cg_link->cgroup;
942 	cg_link->cgroup = NULL;
943 
944 	mutex_unlock(&cgroup_mutex);
945 
946 	cgroup_put(cg);
947 }
948 
949 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
950 {
951 	struct bpf_cgroup_link *cg_link =
952 		container_of(link, struct bpf_cgroup_link, link);
953 
954 	kfree(cg_link);
955 }
956 
/* bpf_link detach callback: same work as release; always returns 0. */
static int bpf_cgroup_link_detach(struct bpf_link *link)
{
	bpf_cgroup_link_release(link);
	return 0;
}
963 
964 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
965 					struct seq_file *seq)
966 {
967 	struct bpf_cgroup_link *cg_link =
968 		container_of(link, struct bpf_cgroup_link, link);
969 	u64 cg_id = 0;
970 
971 	mutex_lock(&cgroup_mutex);
972 	if (cg_link->cgroup)
973 		cg_id = cgroup_id(cg_link->cgroup);
974 	mutex_unlock(&cgroup_mutex);
975 
976 	seq_printf(seq,
977 		   "cgroup_id:\t%llu\n"
978 		   "attach_type:\t%d\n",
979 		   cg_id,
980 		   cg_link->type);
981 }
982 
983 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
984 					  struct bpf_link_info *info)
985 {
986 	struct bpf_cgroup_link *cg_link =
987 		container_of(link, struct bpf_cgroup_link, link);
988 	u64 cg_id = 0;
989 
990 	mutex_lock(&cgroup_mutex);
991 	if (cg_link->cgroup)
992 		cg_id = cgroup_id(cg_link->cgroup);
993 	mutex_unlock(&cgroup_mutex);
994 
995 	info->cgroup.cgroup_id = cg_id;
996 	info->cgroup.attach_type = cg_link->type;
997 	return 0;
998 }
999 
1000 static const struct bpf_link_ops bpf_cgroup_link_lops = {
1001 	.release = bpf_cgroup_link_release,
1002 	.dealloc = bpf_cgroup_link_dealloc,
1003 	.detach = bpf_cgroup_link_detach,
1004 	.update_prog = cgroup_bpf_replace,
1005 	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1006 	.fill_link_info = bpf_cgroup_link_fill_link_info,
1007 };
1008 
1009 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1010 {
1011 	struct bpf_link_primer link_primer;
1012 	struct bpf_cgroup_link *link;
1013 	struct cgroup *cgrp;
1014 	int err;
1015 
1016 	if (attr->link_create.flags)
1017 		return -EINVAL;
1018 
1019 	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1020 	if (IS_ERR(cgrp))
1021 		return PTR_ERR(cgrp);
1022 
1023 	link = kzalloc(sizeof(*link), GFP_USER);
1024 	if (!link) {
1025 		err = -ENOMEM;
1026 		goto out_put_cgroup;
1027 	}
1028 	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1029 		      prog);
1030 	link->cgroup = cgrp;
1031 	link->type = attr->link_create.attach_type;
1032 
1033 	err = bpf_link_prime(&link->link, &link_primer);
1034 	if (err) {
1035 		kfree(link);
1036 		goto out_put_cgroup;
1037 	}
1038 
1039 	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1040 				link->type, BPF_F_ALLOW_MULTI);
1041 	if (err) {
1042 		bpf_link_cleanup(&link_primer);
1043 		goto out_put_cgroup;
1044 	}
1045 
1046 	return bpf_link_settle(&link_primer);
1047 
1048 out_put_cgroup:
1049 	cgroup_put(cgrp);
1050 	return err;
1051 }
1052 
1053 int cgroup_bpf_prog_query(const union bpf_attr *attr,
1054 			  union bpf_attr __user *uattr)
1055 {
1056 	struct cgroup *cgrp;
1057 	int ret;
1058 
1059 	cgrp = cgroup_get_from_fd(attr->query.target_fd);
1060 	if (IS_ERR(cgrp))
1061 		return PTR_ERR(cgrp);
1062 
1063 	ret = cgroup_bpf_query(cgrp, attr, uattr);
1064 
1065 	cgroup_put(cgrp);
1066 	return ret;
1067 }
1068 
/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @atype: The type of program to be executed
 *
 * If no socket is passed, the socket is not a full socket, or the socket
 * is not of type INET or INET6, this function does nothing and returns 0.
 *
 * The program type passed in via @atype must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * For egress packets, this function can return:
 *   NET_XMIT_SUCCESS    (0)	- continue with packet output
 *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
 *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
 *				  to call cwr
 *   -err			- drop packet
 *
 * For ingress packets, this function will return -EPERM if any
 * attached program was found and if it returned != 1 during execution.
 * Otherwise 0 is returned.
 *
 * In both directions a non-zero run result that is not an error value
 * is replaced with -EFAULT before it is interpreted.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
				struct sk_buff *skb,
				enum cgroup_bpf_attach_type atype)
{
	/* Distance between the current data pointer and the network header;
	 * the skb is pushed by this much for the programs and pulled back
	 * afterwards.
	 */
	unsigned int offset = skb->data - skb_network_header(skb);
	struct sock *save_sk;
	void *saved_data_end;
	struct cgroup *cgrp;
	int ret;

	if (!sk || !sk_fullsock(sk))
		return 0;

	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	/* Temporarily point skb->sk at @sk; the old value is restored below. */
	save_sk = skb->sk;
	skb->sk = sk;
	__skb_push(skb, offset);

	/* compute pointers for the bpf prog */
	bpf_compute_and_save_data_end(skb, &saved_data_end);

	if (atype == CGROUP_INET_EGRESS) {
		u32 flags = 0;
		bool cn;

		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
					    __bpf_prog_run_save_cb, 0, &flags);

		/* Return values of CGROUP EGRESS BPF programs are:
		 *   0: drop packet
		 *   1: keep packet
		 *   2: drop packet and cn
		 *   3: keep packet and cn
		 *
		 * Bit 0 of each program's return value is the keep/drop
		 * verdict; the higher bits are OR'ed into @flags by
		 * bpf_prog_run_array_cg().
		 *
		 * The combined result is then converted as follows:
		 *   run result 0, no cn:     NET_XMIT_SUCCESS (transmit)
		 *   run result 0, cn:        NET_XMIT_CN      (transmit and cn)
		 *   run result != 0, cn:     NET_XMIT_DROP    (drop and cn)
		 *   run result != 0, no cn:  -err             (drop, no cn)
		 */

		cn = flags & BPF_RET_SET_CN;
		/* A positive, non-error run result is reported as -EFAULT. */
		if (ret && !IS_ERR_VALUE((long)ret))
			ret = -EFAULT;
		if (!ret)
			ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
		else
			ret = (cn ? NET_XMIT_DROP : ret);
	} else {
		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
					    skb, __bpf_prog_run_save_cb, 0,
					    NULL);
		/* A positive, non-error run result is reported as -EFAULT. */
		if (ret && !IS_ERR_VALUE((long)ret))
			ret = -EFAULT;
	}
	/* Undo the adjustments made for the programs, in reverse order. */
	bpf_restore_data_end(skb, saved_data_end);
	__skb_pull(skb, offset);
	skb->sk = save_sk;

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1159 
1160 /**
1161  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1162  * @sk: sock structure to manipulate
1163  * @type: The type of program to be executed
1164  *
1165  * socket is passed is expected to be of type INET or INET6.
1166  *
1167  * The program type passed in via @type must be suitable for sock
1168  * filtering. No further check is performed to assert that.
1169  *
1170  * This function will return %-EPERM if any if an attached program was found
1171  * and if it returned != 1 during execution. In all other cases, 0 is returned.
1172  */
1173 int __cgroup_bpf_run_filter_sk(struct sock *sk,
1174 			       enum cgroup_bpf_attach_type atype)
1175 {
1176 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1177 
1178 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1179 				     NULL);
1180 }
1181 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1182 
1183 /**
1184  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1185  *                                       provided by user sockaddr
1186  * @sk: sock struct that will use sockaddr
1187  * @uaddr: sockaddr struct provided by user
1188  * @type: The type of program to be executed
1189  * @t_ctx: Pointer to attach type specific context
1190  * @flags: Pointer to u32 which contains higher bits of BPF program
1191  *         return value (OR'ed together).
1192  *
1193  * socket is expected to be of type INET or INET6.
1194  *
1195  * This function will return %-EPERM if an attached program is found and
1196  * returned value != 1 during execution. In all other cases, 0 is returned.
1197  */
1198 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1199 				      struct sockaddr *uaddr,
1200 				      enum cgroup_bpf_attach_type atype,
1201 				      void *t_ctx,
1202 				      u32 *flags)
1203 {
1204 	struct bpf_sock_addr_kern ctx = {
1205 		.sk = sk,
1206 		.uaddr = uaddr,
1207 		.t_ctx = t_ctx,
1208 	};
1209 	struct sockaddr_storage unspec;
1210 	struct cgroup *cgrp;
1211 
1212 	/* Check socket family since not all sockets represent network
1213 	 * endpoint (e.g. AF_UNIX).
1214 	 */
1215 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1216 		return 0;
1217 
1218 	if (!ctx.uaddr) {
1219 		memset(&unspec, 0, sizeof(unspec));
1220 		ctx.uaddr = (struct sockaddr *)&unspec;
1221 	}
1222 
1223 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1224 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1225 				     0, flags);
1226 }
1227 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1228 
1229 /**
1230  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1231  * @sk: socket to get cgroup from
1232  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1233  * sk with connection information (IP addresses, etc.) May not contain
1234  * cgroup info if it is a req sock.
1235  * @type: The type of program to be executed
1236  *
1237  * socket passed is expected to be of type INET or INET6.
1238  *
1239  * The program type passed in via @type must be suitable for sock_ops
1240  * filtering. No further check is performed to assert that.
1241  *
1242  * This function will return %-EPERM if any if an attached program was found
1243  * and if it returned != 1 during execution. In all other cases, 0 is returned.
1244  */
1245 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1246 				     struct bpf_sock_ops_kern *sock_ops,
1247 				     enum cgroup_bpf_attach_type atype)
1248 {
1249 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1250 
1251 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1252 				     0, NULL);
1253 }
1254 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1255 
1256 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1257 				      short access, enum cgroup_bpf_attach_type atype)
1258 {
1259 	struct cgroup *cgrp;
1260 	struct bpf_cgroup_dev_ctx ctx = {
1261 		.access_type = (access << 16) | dev_type,
1262 		.major = major,
1263 		.minor = minor,
1264 	};
1265 	int ret;
1266 
1267 	rcu_read_lock();
1268 	cgrp = task_dfl_cgroup(current);
1269 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1270 				    NULL);
1271 	rcu_read_unlock();
1272 
1273 	return ret;
1274 }
1275 
1276 BPF_CALL_0(bpf_get_retval)
1277 {
1278 	struct bpf_cg_run_ctx *ctx =
1279 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1280 
1281 	return ctx->retval;
1282 }
1283 
1284 static const struct bpf_func_proto bpf_get_retval_proto = {
1285 	.func		= bpf_get_retval,
1286 	.gpl_only	= false,
1287 	.ret_type	= RET_INTEGER,
1288 };
1289 
1290 BPF_CALL_1(bpf_set_retval, int, retval)
1291 {
1292 	struct bpf_cg_run_ctx *ctx =
1293 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1294 
1295 	ctx->retval = retval;
1296 	return 0;
1297 }
1298 
1299 static const struct bpf_func_proto bpf_set_retval_proto = {
1300 	.func		= bpf_set_retval,
1301 	.gpl_only	= false,
1302 	.ret_type	= RET_INTEGER,
1303 	.arg1_type	= ARG_ANYTHING,
1304 };
1305 
1306 static const struct bpf_func_proto *
1307 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1308 {
1309 	switch (func_id) {
1310 	case BPF_FUNC_get_current_uid_gid:
1311 		return &bpf_get_current_uid_gid_proto;
1312 	case BPF_FUNC_get_local_storage:
1313 		return &bpf_get_local_storage_proto;
1314 	case BPF_FUNC_get_current_cgroup_id:
1315 		return &bpf_get_current_cgroup_id_proto;
1316 	case BPF_FUNC_perf_event_output:
1317 		return &bpf_event_output_data_proto;
1318 	case BPF_FUNC_get_retval:
1319 		return &bpf_get_retval_proto;
1320 	case BPF_FUNC_set_retval:
1321 		return &bpf_set_retval_proto;
1322 	default:
1323 		return bpf_base_func_proto(func_id);
1324 	}
1325 }
1326 
1327 static const struct bpf_func_proto *
1328 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1329 {
1330 	return cgroup_base_func_proto(func_id, prog);
1331 }
1332 
1333 static bool cgroup_dev_is_valid_access(int off, int size,
1334 				       enum bpf_access_type type,
1335 				       const struct bpf_prog *prog,
1336 				       struct bpf_insn_access_aux *info)
1337 {
1338 	const int size_default = sizeof(__u32);
1339 
1340 	if (type == BPF_WRITE)
1341 		return false;
1342 
1343 	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1344 		return false;
1345 	/* The verifier guarantees that size > 0. */
1346 	if (off % size != 0)
1347 		return false;
1348 
1349 	switch (off) {
1350 	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1351 		bpf_ctx_record_field_size(info, size_default);
1352 		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1353 			return false;
1354 		break;
1355 	default:
1356 		if (size != size_default)
1357 			return false;
1358 	}
1359 
1360 	return true;
1361 }
1362 
/* Program ops for device cgroup programs: no callbacks are set. */
const struct bpf_prog_ops cg_dev_prog_ops = {
};

/* Verifier ops for device cgroup programs: helper lookup and context
 * access validation only.
 */
const struct bpf_verifier_ops cg_dev_verifier_ops = {
	.get_func_proto		= cgroup_dev_func_proto,
	.is_valid_access	= cgroup_dev_is_valid_access,
};
1370 
1371 /**
1372  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1373  *
1374  * @head: sysctl table header
1375  * @table: sysctl table
1376  * @write: sysctl is being read (= 0) or written (= 1)
1377  * @buf: pointer to buffer (in and out)
1378  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1379  *	result is size of @new_buf if program set new value, initial value
1380  *	otherwise
1381  * @ppos: value-result argument: value is position at which read from or write
1382  *	to sysctl is happening, result is new position if program overrode it,
1383  *	initial value otherwise
1384  * @type: type of program to be executed
1385  *
1386  * Program is run when sysctl is being accessed, either read or written, and
1387  * can allow or deny such access.
1388  *
1389  * This function will return %-EPERM if an attached program is found and
1390  * returned value != 1 during execution. In all other cases 0 is returned.
1391  */
1392 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1393 				   struct ctl_table *table, int write,
1394 				   char **buf, size_t *pcount, loff_t *ppos,
1395 				   enum cgroup_bpf_attach_type atype)
1396 {
1397 	struct bpf_sysctl_kern ctx = {
1398 		.head = head,
1399 		.table = table,
1400 		.write = write,
1401 		.ppos = ppos,
1402 		.cur_val = NULL,
1403 		.cur_len = PAGE_SIZE,
1404 		.new_val = NULL,
1405 		.new_len = 0,
1406 		.new_updated = 0,
1407 	};
1408 	struct cgroup *cgrp;
1409 	loff_t pos = 0;
1410 	int ret;
1411 
1412 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1413 	if (!ctx.cur_val ||
1414 	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1415 		/* Let BPF program decide how to proceed. */
1416 		ctx.cur_len = 0;
1417 	}
1418 
1419 	if (write && *buf && *pcount) {
1420 		/* BPF program should be able to override new value with a
1421 		 * buffer bigger than provided by user.
1422 		 */
1423 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1424 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1425 		if (ctx.new_val) {
1426 			memcpy(ctx.new_val, *buf, ctx.new_len);
1427 		} else {
1428 			/* Let BPF program decide how to proceed. */
1429 			ctx.new_len = 0;
1430 		}
1431 	}
1432 
1433 	rcu_read_lock();
1434 	cgrp = task_dfl_cgroup(current);
1435 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1436 				    NULL);
1437 	rcu_read_unlock();
1438 
1439 	kfree(ctx.cur_val);
1440 
1441 	if (ret == 1 && ctx.new_updated) {
1442 		kfree(*buf);
1443 		*buf = ctx.new_val;
1444 		*pcount = ctx.new_len;
1445 	} else {
1446 		kfree(ctx.new_val);
1447 	}
1448 
1449 	return ret;
1450 }
1451 
1452 #ifdef CONFIG_NET
1453 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1454 			     struct bpf_sockopt_buf *buf)
1455 {
1456 	if (unlikely(max_optlen < 0))
1457 		return -EINVAL;
1458 
1459 	if (unlikely(max_optlen > PAGE_SIZE)) {
1460 		/* We don't expose optvals that are greater than PAGE_SIZE
1461 		 * to the BPF program.
1462 		 */
1463 		max_optlen = PAGE_SIZE;
1464 	}
1465 
1466 	if (max_optlen <= sizeof(buf->data)) {
1467 		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1468 		 * bytes avoid the cost of kzalloc.
1469 		 */
1470 		ctx->optval = buf->data;
1471 		ctx->optval_end = ctx->optval + max_optlen;
1472 		return max_optlen;
1473 	}
1474 
1475 	ctx->optval = kzalloc(max_optlen, GFP_USER);
1476 	if (!ctx->optval)
1477 		return -ENOMEM;
1478 
1479 	ctx->optval_end = ctx->optval + max_optlen;
1480 
1481 	return max_optlen;
1482 }
1483 
1484 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1485 			     struct bpf_sockopt_buf *buf)
1486 {
1487 	if (ctx->optval == buf->data)
1488 		return;
1489 	kfree(ctx->optval);
1490 }
1491 
1492 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1493 				  struct bpf_sockopt_buf *buf)
1494 {
1495 	return ctx->optval != buf->data;
1496 }
1497 
/* Run the CGROUP_SETSOCKOPT programs of @sk's cgroup on a setsockopt()
 * request.
 *
 * @sk: socket the option is being set on
 * @level: in/out option level; updated if the programs changed it
 * @optname: in/out option name; updated if the programs changed it
 * @optval: user-space option value
 * @optlen: in/out option length; updated if the programs set a non-zero
 *	length
 * @kernel_optval: out; set to a kernel buffer holding the (possibly
 *	rewritten) option value when the programs left a non-zero optlen.
 *	The buffer is not freed here (presumably the caller owns it from
 *	then on - confirm against the caller).
 *
 * Returns:
 *   0       - continue with the kernel setsockopt handler
 *   1       - programs set optlen to -1: bypass the kernel handler
 *   -EFAULT - copying from user space failed, or the programs left optlen
 *             out of bounds
 *   -ENOMEM - buffer allocation failed
 *   other   - non-zero result of running the programs, or an error from
 *             sockopt_alloc_buf()
 */
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
				       int *optname, char __user *optval,
				       int *optlen, char **kernel_optval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_buf buf = {};
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = *level,
		.optname = *optname,
	};
	int ret, max_optlen;

	/* Allocate a bit more than the initial user buffer for
	 * BPF program. The canonical use case is overriding
	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
	 */
	max_optlen = max_t(int, 16, *optlen);
	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
	if (max_optlen < 0)
		return max_optlen;

	ctx.optlen = *optlen;

	/* Only as much as fits into the (possibly capped) buffer is copied. */
	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
		ret = -EFAULT;
		goto out;
	}

	lock_sock(sk);
	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
				    &ctx, bpf_prog_run, 0, NULL);
	release_sock(sk);

	if (ret)
		goto out;

	if (ctx.optlen == -1) {
		/* optlen set to -1, bypass kernel */
		ret = 1;
	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
		/* optlen is out of bounds */
		ret = -EFAULT;
	} else {
		/* optlen within bounds, run kernel handler */
		ret = 0;

		/* export any potential modifications */
		*level = ctx.level;
		*optname = ctx.optname;

		/* optlen == 0 from BPF indicates that we should
		 * use original userspace data.
		 */
		if (ctx.optlen != 0) {
			*optlen = ctx.optlen;
			/* We've used bpf_sockopt_kern->buf as an intermediary
			 * storage, but the BPF program indicates that we need
			 * to pass this data to the kernel setsockopt handler.
			 * No way to export on-stack buf, have to allocate a
			 * new buffer.
			 */
			if (!sockopt_buf_allocated(&ctx, &buf)) {
				void *p = kmalloc(ctx.optlen, GFP_USER);

				if (!p) {
					ret = -ENOMEM;
					goto out;
				}
				memcpy(p, ctx.optval, ctx.optlen);
				*kernel_optval = p;
			} else {
				*kernel_optval = ctx.optval;
			}
			/* export and don't free sockopt buf */
			return 0;
		}
	}

out:
	sockopt_free_buf(&ctx, &buf);
	return ret;
}
1581 
/* Run the CGROUP_GETSOCKOPT programs of @sk's cgroup after a getsockopt()
 * call.
 *
 * @sk: socket the option was read from
 * @level: option level
 * @optname: option name
 * @optval: user-space buffer holding the option value
 * @optlen: user-space pointer to the option length
 * @max_optlen: size used for the temporary kernel buffer (capped at
 *	PAGE_SIZE by sockopt_alloc_buf())
 * @retval: result of the getsockopt call that already ran; used as the
 *	initial return value for the programs
 *
 * When @retval is 0 the value and length are copied in from user space
 * so the programs can inspect them. After the programs ran, a non-zero
 * optlen causes the buffer and length to be copied back to user space.
 *
 * Returns the (non-negative) result of running the programs on success,
 * a negative result from the programs, -EFAULT on a failed user copy or
 * an out-of-bounds optlen, or an error from sockopt_alloc_buf().
 */
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
				       int optname, char __user *optval,
				       int __user *optlen, int max_optlen,
				       int retval)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_sockopt_buf buf = {};
	struct bpf_sockopt_kern ctx = {
		.sk = sk,
		.level = level,
		.optname = optname,
		.current_task = current,
	};
	int ret;

	/* optlen keeps the caller's uncapped value; max_optlen is replaced
	 * by the size of the buffer actually set up.
	 */
	ctx.optlen = max_optlen;
	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
	if (max_optlen < 0)
		return max_optlen;

	if (!retval) {
		/* If kernel getsockopt finished successfully,
		 * copy whatever was returned to the user back
		 * into our temporary buffer. Set optlen to the
		 * one that kernel returned as well to let
		 * BPF programs inspect the value.
		 */

		if (get_user(ctx.optlen, optlen)) {
			ret = -EFAULT;
			goto out;
		}

		if (ctx.optlen < 0) {
			ret = -EFAULT;
			goto out;
		}

		if (copy_from_user(ctx.optval, optval,
				   min(ctx.optlen, max_optlen)) != 0) {
			ret = -EFAULT;
			goto out;
		}
	}

	lock_sock(sk);
	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
				    &ctx, bpf_prog_run, retval, NULL);
	release_sock(sk);

	if (ret < 0)
		goto out;

	/* The programs must leave optlen within the temporary buffer. */
	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
		ret = -EFAULT;
		goto out;
	}

	/* optlen == 0 skips the copy back to user space. */
	if (ctx.optlen != 0) {
		if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
		    put_user(ctx.optlen, optlen)) {
			ret = -EFAULT;
			goto out;
		}
	}

out:
	sockopt_free_buf(&ctx, &buf);
	return ret;
}
1652 
1653 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1654 					    int optname, void *optval,
1655 					    int *optlen, int retval)
1656 {
1657 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1658 	struct bpf_sockopt_kern ctx = {
1659 		.sk = sk,
1660 		.level = level,
1661 		.optname = optname,
1662 		.optlen = *optlen,
1663 		.optval = optval,
1664 		.optval_end = optval + *optlen,
1665 		.current_task = current,
1666 	};
1667 	int ret;
1668 
1669 	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
1670 	 * user data back into BPF buffer when reval != 0. This is
1671 	 * done as an optimization to avoid extra copy, assuming
1672 	 * kernel won't populate the data in case of an error.
1673 	 * Here we always pass the data and memset() should
1674 	 * be called if that data shouldn't be "exported".
1675 	 */
1676 
1677 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1678 				    &ctx, bpf_prog_run, retval, NULL);
1679 	if (ret < 0)
1680 		return ret;
1681 
1682 	if (ctx.optlen > *optlen)
1683 		return -EFAULT;
1684 
1685 	/* BPF programs can shrink the buffer, export the modifications.
1686 	 */
1687 	if (ctx.optlen != 0)
1688 		*optlen = ctx.optlen;
1689 
1690 	return ret;
1691 }
1692 #endif
1693 
1694 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1695 			      size_t *lenp)
1696 {
1697 	ssize_t tmp_ret = 0, ret;
1698 
1699 	if (dir->header.parent) {
1700 		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1701 		if (tmp_ret < 0)
1702 			return tmp_ret;
1703 	}
1704 
1705 	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1706 	if (ret < 0)
1707 		return ret;
1708 	*bufp += ret;
1709 	*lenp -= ret;
1710 	ret += tmp_ret;
1711 
1712 	/* Avoid leading slash. */
1713 	if (!ret)
1714 		return ret;
1715 
1716 	tmp_ret = strscpy(*bufp, "/", *lenp);
1717 	if (tmp_ret < 0)
1718 		return tmp_ret;
1719 	*bufp += tmp_ret;
1720 	*lenp -= tmp_ret;
1721 
1722 	return ret + tmp_ret;
1723 }
1724 
1725 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1726 	   size_t, buf_len, u64, flags)
1727 {
1728 	ssize_t tmp_ret = 0, ret;
1729 
1730 	if (!buf)
1731 		return -EINVAL;
1732 
1733 	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1734 		if (!ctx->head)
1735 			return -EINVAL;
1736 		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1737 		if (tmp_ret < 0)
1738 			return tmp_ret;
1739 	}
1740 
1741 	ret = strscpy(buf, ctx->table->procname, buf_len);
1742 
1743 	return ret < 0 ? ret : tmp_ret + ret;
1744 }
1745 
1746 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1747 	.func		= bpf_sysctl_get_name,
1748 	.gpl_only	= false,
1749 	.ret_type	= RET_INTEGER,
1750 	.arg1_type	= ARG_PTR_TO_CTX,
1751 	.arg2_type	= ARG_PTR_TO_MEM,
1752 	.arg3_type	= ARG_CONST_SIZE,
1753 	.arg4_type	= ARG_ANYTHING,
1754 };
1755 
/* Copy a sysctl value of @src_len bytes from @src into @dst (@dst_len
 * bytes), always leaving @dst fully initialized.
 *
 * Returns @src_len when the value fits with room to spare (the rest of
 * @dst is zero-filled), -E2BIG when @dst is empty or the value had to be
 * truncated (the last byte of @dst is then forced to '\0'), and -EINVAL
 * when @dst is NULL or there is no source value (@dst is zeroed in the
 * latter case).
 */
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	size_t ncopy;

	if (!dst)
		return -EINVAL;

	if (!dst_len)
		return -E2BIG;

	if (!src || !src_len) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	ncopy = dst_len < src_len ? dst_len : src_len;
	memcpy(dst, src, ncopy);

	if (dst_len <= src_len) {
		/* No spare byte for the terminator: truncate. */
		dst[dst_len - 1] = '\0';
		return -E2BIG;
	}

	memset(dst + src_len, '\0', dst_len - src_len);

	return src_len;
}
1781 
1782 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1783 	   char *, buf, size_t, buf_len)
1784 {
1785 	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1786 }
1787 
1788 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1789 	.func		= bpf_sysctl_get_current_value,
1790 	.gpl_only	= false,
1791 	.ret_type	= RET_INTEGER,
1792 	.arg1_type	= ARG_PTR_TO_CTX,
1793 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1794 	.arg3_type	= ARG_CONST_SIZE,
1795 };
1796 
1797 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1798 	   size_t, buf_len)
1799 {
1800 	if (!ctx->write) {
1801 		if (buf && buf_len)
1802 			memset(buf, '\0', buf_len);
1803 		return -EINVAL;
1804 	}
1805 	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1806 }
1807 
1808 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1809 	.func		= bpf_sysctl_get_new_value,
1810 	.gpl_only	= false,
1811 	.ret_type	= RET_INTEGER,
1812 	.arg1_type	= ARG_PTR_TO_CTX,
1813 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1814 	.arg3_type	= ARG_CONST_SIZE,
1815 };
1816 
1817 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1818 	   const char *, buf, size_t, buf_len)
1819 {
1820 	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1821 		return -EINVAL;
1822 
1823 	if (buf_len > PAGE_SIZE - 1)
1824 		return -E2BIG;
1825 
1826 	memcpy(ctx->new_val, buf, buf_len);
1827 	ctx->new_len = buf_len;
1828 	ctx->new_updated = 1;
1829 
1830 	return 0;
1831 }
1832 
1833 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
1834 	.func		= bpf_sysctl_set_new_value,
1835 	.gpl_only	= false,
1836 	.ret_type	= RET_INTEGER,
1837 	.arg1_type	= ARG_PTR_TO_CTX,
1838 	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
1839 	.arg3_type	= ARG_CONST_SIZE,
1840 };
1841 
1842 static const struct bpf_func_proto *
1843 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1844 {
1845 	switch (func_id) {
1846 	case BPF_FUNC_strtol:
1847 		return &bpf_strtol_proto;
1848 	case BPF_FUNC_strtoul:
1849 		return &bpf_strtoul_proto;
1850 	case BPF_FUNC_sysctl_get_name:
1851 		return &bpf_sysctl_get_name_proto;
1852 	case BPF_FUNC_sysctl_get_current_value:
1853 		return &bpf_sysctl_get_current_value_proto;
1854 	case BPF_FUNC_sysctl_get_new_value:
1855 		return &bpf_sysctl_get_new_value_proto;
1856 	case BPF_FUNC_sysctl_set_new_value:
1857 		return &bpf_sysctl_set_new_value_proto;
1858 	case BPF_FUNC_ktime_get_coarse_ns:
1859 		return &bpf_ktime_get_coarse_ns_proto;
1860 	default:
1861 		return cgroup_base_func_proto(func_id, prog);
1862 	}
1863 }
1864 
1865 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
1866 				   const struct bpf_prog *prog,
1867 				   struct bpf_insn_access_aux *info)
1868 {
1869 	const int size_default = sizeof(__u32);
1870 
1871 	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
1872 		return false;
1873 
1874 	switch (off) {
1875 	case bpf_ctx_range(struct bpf_sysctl, write):
1876 		if (type != BPF_READ)
1877 			return false;
1878 		bpf_ctx_record_field_size(info, size_default);
1879 		return bpf_ctx_narrow_access_ok(off, size, size_default);
1880 	case bpf_ctx_range(struct bpf_sysctl, file_pos):
1881 		if (type == BPF_READ) {
1882 			bpf_ctx_record_field_size(info, size_default);
1883 			return bpf_ctx_narrow_access_ok(off, size, size_default);
1884 		} else {
1885 			return size == size_default;
1886 		}
1887 	default:
1888 		return false;
1889 	}
1890 }
1891 
/* Verifier callback: rewrite an access to struct bpf_sysctl (the
 * program-visible context) into instructions operating on struct
 * bpf_sysctl_kern.
 *
 * The replacement instructions are written to @insn_buf and their count
 * is returned; offsets not handled here produce no instructions.
 */
static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	u32 read_size;

	switch (si->off) {
	case offsetof(struct bpf_sysctl, write):
		/* Direct load from the kernel context's write field. */
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct bpf_sysctl_kern, write,
				       sizeof_field(struct bpf_sysctl_kern,
						    write),
				       target_size));
		break;
	case offsetof(struct bpf_sysctl, file_pos):
		/* ppos is a pointer so it should be accessed via indirect
		 * loads and stores. Also for stores additional temporary
		 * register is used since neither src_reg nor dst_reg can be
		 * overridden.
		 */
		if (type == BPF_WRITE) {
			int treg = BPF_REG_9;

			/* Pick a scratch register (R9, else R8, else R7)
			 * that is neither src_reg nor dst_reg.
			 */
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			/* Save the scratch register in ctx->tmp_reg. */
			*insn++ = BPF_STX_MEM(
				BPF_DW, si->dst_reg, treg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
			/* treg = ctx->ppos */
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			/* Store the u32 source value through ppos. */
			*insn++ = BPF_STX_MEM(
				BPF_SIZEOF(u32), treg, si->src_reg,
				bpf_ctx_narrow_access_offset(
					0, sizeof(u32), sizeof(loff_t)));
			/* Restore the scratch register from ctx->tmp_reg. */
			*insn++ = BPF_LDX_MEM(
				BPF_DW, treg, si->dst_reg,
				offsetof(struct bpf_sysctl_kern, tmp_reg));
		} else {
			/* dst_reg = ctx->ppos, then load through it with the
			 * size requested by the program.
			 */
			*insn++ = BPF_LDX_MEM(
				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
				si->dst_reg, si->src_reg,
				offsetof(struct bpf_sysctl_kern, ppos));
			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
			*insn++ = BPF_LDX_MEM(
				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
				bpf_ctx_narrow_access_offset(
					0, read_size, sizeof(loff_t)));
		}
		*target_size = sizeof(u32);
		break;
	}

	return insn - insn_buf;
}
1953 
/* Verifier ops for sysctl cgroup programs: helper lookup, context access
 * validation and context access rewriting.
 */
const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
	.get_func_proto		= sysctl_func_proto,
	.is_valid_access	= sysctl_is_valid_access,
	.convert_ctx_access	= sysctl_convert_ctx_access,
};

/* Program ops for sysctl cgroup programs: no callbacks are set. */
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
1962 
1963 #ifdef CONFIG_NET
1964 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
1965 {
1966 	const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
1967 
1968 	return net->net_cookie;
1969 }
1970 
1971 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
1972 	.func		= bpf_get_netns_cookie_sockopt,
1973 	.gpl_only	= false,
1974 	.ret_type	= RET_INTEGER,
1975 	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
1976 };
1977 #endif
1978 
1979 static const struct bpf_func_proto *
1980 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1981 {
1982 	switch (func_id) {
1983 #ifdef CONFIG_NET
1984 	case BPF_FUNC_get_netns_cookie:
1985 		return &bpf_get_netns_cookie_sockopt_proto;
1986 	case BPF_FUNC_sk_storage_get:
1987 		return &bpf_sk_storage_get_proto;
1988 	case BPF_FUNC_sk_storage_delete:
1989 		return &bpf_sk_storage_delete_proto;
1990 	case BPF_FUNC_setsockopt:
1991 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
1992 			return &bpf_sk_setsockopt_proto;
1993 		return NULL;
1994 	case BPF_FUNC_getsockopt:
1995 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
1996 			return &bpf_sk_getsockopt_proto;
1997 		return NULL;
1998 #endif
1999 #ifdef CONFIG_INET
2000 	case BPF_FUNC_tcp_sock:
2001 		return &bpf_tcp_sock_proto;
2002 #endif
2003 	default:
2004 		return cgroup_base_func_proto(func_id, prog);
2005 	}
2006 }
2007 
2008 static bool cg_sockopt_is_valid_access(int off, int size,
2009 				       enum bpf_access_type type,
2010 				       const struct bpf_prog *prog,
2011 				       struct bpf_insn_access_aux *info)
2012 {
2013 	const int size_default = sizeof(__u32);
2014 
2015 	if (off < 0 || off >= sizeof(struct bpf_sockopt))
2016 		return false;
2017 
2018 	if (off % size != 0)
2019 		return false;
2020 
2021 	if (type == BPF_WRITE) {
2022 		switch (off) {
2023 		case offsetof(struct bpf_sockopt, retval):
2024 			if (size != size_default)
2025 				return false;
2026 			return prog->expected_attach_type ==
2027 				BPF_CGROUP_GETSOCKOPT;
2028 		case offsetof(struct bpf_sockopt, optname):
2029 			fallthrough;
2030 		case offsetof(struct bpf_sockopt, level):
2031 			if (size != size_default)
2032 				return false;
2033 			return prog->expected_attach_type ==
2034 				BPF_CGROUP_SETSOCKOPT;
2035 		case offsetof(struct bpf_sockopt, optlen):
2036 			return size == size_default;
2037 		default:
2038 			return false;
2039 		}
2040 	}
2041 
2042 	switch (off) {
2043 	case offsetof(struct bpf_sockopt, sk):
2044 		if (size != sizeof(__u64))
2045 			return false;
2046 		info->reg_type = PTR_TO_SOCKET;
2047 		break;
2048 	case offsetof(struct bpf_sockopt, optval):
2049 		if (size != sizeof(__u64))
2050 			return false;
2051 		info->reg_type = PTR_TO_PACKET;
2052 		break;
2053 	case offsetof(struct bpf_sockopt, optval_end):
2054 		if (size != sizeof(__u64))
2055 			return false;
2056 		info->reg_type = PTR_TO_PACKET_END;
2057 		break;
2058 	case offsetof(struct bpf_sockopt, retval):
2059 		if (size != size_default)
2060 			return false;
2061 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2062 	default:
2063 		if (size != size_default)
2064 			return false;
2065 		break;
2066 	}
2067 	return true;
2068 }
2069 
/* Build one load/store instruction (T is BPF_LDX_MEM or BPF_STX_MEM) that
 * accesses field F of struct bpf_sockopt_kern using the dst/src registers
 * of the instruction being rewritten. Expects a local 'si' in scope.
 */
#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
	  si->dst_reg, si->src_reg,					\
	  offsetof(struct bpf_sockopt_kern, F))

/* Verifier callback: rewrite an access to struct bpf_sockopt (the
 * program-visible context) into instructions operating on struct
 * bpf_sockopt_kern.
 *
 * The replacement instructions are written to @insn_buf and their count
 * is returned; offsets not handled here produce no instructions.
 */
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
					 const struct bpf_insn *si,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog,
					 u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sockopt, sk):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
		break;
	case offsetof(struct bpf_sockopt, level):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
		break;
	case offsetof(struct bpf_sockopt, optname):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
		break;
	case offsetof(struct bpf_sockopt, optlen):
		if (type == BPF_WRITE)
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
		else
			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
		break;
	case offsetof(struct bpf_sockopt, retval):
		/* retval is not stored in bpf_sockopt_kern; it is reached via
		 * ctx->current_task->bpf_ctx, which is used directly as a
		 * struct bpf_cg_run_ctx pointer. That only works while
		 * run_ctx is the first member, hence the build-time check.
		 */
		BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);

		if (type == BPF_WRITE) {
			int treg = BPF_REG_9;

			/* Pick a scratch register (R9, else R8, else R7)
			 * that is neither src_reg nor dst_reg.
			 */
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			if (si->src_reg == treg || si->dst_reg == treg)
				--treg;
			/* Save the scratch register in ctx->tmp_reg. */
			*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
					      offsetof(struct bpf_sockopt_kern, tmp_reg));
			/* treg = ctx->current_task */
			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
					      treg, si->dst_reg,
					      offsetof(struct bpf_sockopt_kern, current_task));
			/* treg = treg->bpf_ctx */
			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
					      treg, treg,
					      offsetof(struct task_struct, bpf_ctx));
			/* run_ctx->retval = src_reg */
			*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
					      treg, si->src_reg,
					      offsetof(struct bpf_cg_run_ctx, retval));
			/* Restore the scratch register from ctx->tmp_reg. */
			*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
					      offsetof(struct bpf_sockopt_kern, tmp_reg));
		} else {
			/* dst_reg = ctx->current_task->bpf_ctx->retval, using
			 * dst_reg itself for the intermediate pointers.
			 */
			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
					      si->dst_reg, si->src_reg,
					      offsetof(struct bpf_sockopt_kern, current_task));
			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
					      si->dst_reg, si->dst_reg,
					      offsetof(struct task_struct, bpf_ctx));
			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
					      si->dst_reg, si->dst_reg,
					      offsetof(struct bpf_cg_run_ctx, retval));
		}
		break;
	case offsetof(struct bpf_sockopt, optval):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
		break;
	case offsetof(struct bpf_sockopt, optval_end):
		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
		break;
	}

	return insn - insn_buf;
}
2150 
/*
 * Prologue generator for cgroup sockopt programs.
 *
 * No prologue is required for the sockopt argument (the data is
 * allocated with kzalloc), so nothing is written to @insn_buf and the
 * reported instruction count is always 0. @direct_write and @prog are
 * not consulted.
 */
static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
				   bool direct_write,
				   const struct bpf_prog *prog)
{
	return 0;
}
2159 
2160 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2161 	.get_func_proto		= cg_sockopt_func_proto,
2162 	.is_valid_access	= cg_sockopt_is_valid_access,
2163 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
2164 	.gen_prologue		= cg_sockopt_get_prologue,
2165 };
2166 
2167 const struct bpf_prog_ops cg_sockopt_prog_ops = {
2168 };
2169