xref: /openbmc/linux/kernel/bpf/cgroup.c (revision 00442143)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Functions to manage eBPF programs attached to cgroups
4  *
5  * Copyright (c) 2016 Daniel Mack
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <net/sock.h>
18 #include <net/bpf_sk_storage.h>
19 
20 #include "../cgroup/cgroup-internal.h"
21 
22 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
23 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 
25 /* __always_inline is necessary to prevent indirect call through run_prog
26  * function pointer.
27  */
28 static __always_inline int
29 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
30 		      enum cgroup_bpf_attach_type atype,
31 		      const void *ctx, bpf_prog_run_fn run_prog,
32 		      int retval, u32 *ret_flags)
33 {
34 	const struct bpf_prog_array_item *item;
35 	const struct bpf_prog *prog;
36 	const struct bpf_prog_array *array;
37 	struct bpf_run_ctx *old_run_ctx;
38 	struct bpf_cg_run_ctx run_ctx;
39 	u32 func_ret;
40 
41 	run_ctx.retval = retval;
42 	migrate_disable();
43 	rcu_read_lock();
44 	array = rcu_dereference(cgrp->effective[atype]);
45 	item = &array->items[0];
46 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
47 	while ((prog = READ_ONCE(item->prog))) {
48 		run_ctx.prog_item = item;
49 		func_ret = run_prog(prog, ctx);
50 		if (ret_flags) {
51 			*(ret_flags) |= (func_ret >> 1);
52 			func_ret &= 1;
53 		}
54 		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
55 			run_ctx.retval = -EPERM;
56 		item++;
57 	}
58 	bpf_reset_run_ctx(old_run_ctx);
59 	rcu_read_unlock();
60 	migrate_enable();
61 	return run_ctx.retval;
62 }
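
/* A note on the return convention above, derived from the loop itself: each
 * program's return value has bit 0 treated as allow/deny, while the remaining
 * upper bits are OR'ed into *ret_flags when the caller passes one (see the
 * CGROUP_INET_EGRESS handling in __cgroup_bpf_run_filter_skb()). A deny
 * forces run_ctx.retval to -EPERM unless retval already holds an error, e.g.
 * one stored by a program via bpf_set_retval(). A minimal caller sketch,
 * modelled on __cgroup_bpf_run_filter_sk() further down in this file:
 *
 *	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run,
 *				    0, NULL);
 *	if (ret)	// 0 means every attached program allowed the operation
 *		return ret;
 */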
63 
64 void cgroup_bpf_offline(struct cgroup *cgrp)
65 {
66 	cgroup_get(cgrp);
67 	percpu_ref_kill(&cgrp->bpf.refcnt);
68 }
69 
70 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
71 {
72 	enum bpf_cgroup_storage_type stype;
73 
74 	for_each_cgroup_storage_type(stype)
75 		bpf_cgroup_storage_free(storages[stype]);
76 }
77 
78 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
79 				     struct bpf_cgroup_storage *new_storages[],
80 				     enum bpf_attach_type type,
81 				     struct bpf_prog *prog,
82 				     struct cgroup *cgrp)
83 {
84 	enum bpf_cgroup_storage_type stype;
85 	struct bpf_cgroup_storage_key key;
86 	struct bpf_map *map;
87 
88 	key.cgroup_inode_id = cgroup_id(cgrp);
89 	key.attach_type = type;
90 
91 	for_each_cgroup_storage_type(stype) {
92 		map = prog->aux->cgroup_storage[stype];
93 		if (!map)
94 			continue;
95 
96 		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
97 		if (storages[stype])
98 			continue;
99 
100 		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
101 		if (IS_ERR(storages[stype])) {
102 			bpf_cgroup_storages_free(new_storages);
103 			return -ENOMEM;
104 		}
105 
106 		new_storages[stype] = storages[stype];
107 	}
108 
109 	return 0;
110 }
111 
112 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
113 				       struct bpf_cgroup_storage *src[])
114 {
115 	enum bpf_cgroup_storage_type stype;
116 
117 	for_each_cgroup_storage_type(stype)
118 		dst[stype] = src[stype];
119 }
120 
121 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
122 				     struct cgroup *cgrp,
123 				     enum bpf_attach_type attach_type)
124 {
125 	enum bpf_cgroup_storage_type stype;
126 
127 	for_each_cgroup_storage_type(stype)
128 		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
129 }
130 
131 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
132  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
133  * doesn't free link memory, which will eventually be done by bpf_link's
134  * release() callback, when its last FD is closed.
135  */
136 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
137 {
138 	cgroup_put(link->cgroup);
139 	link->cgroup = NULL;
140 }
141 
142 /**
143  * cgroup_bpf_release() - put references of all bpf programs and
144  *                        release all cgroup bpf data
145  * @work: work structure embedded into the cgroup to modify
146  */
147 static void cgroup_bpf_release(struct work_struct *work)
148 {
149 	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
150 					       bpf.release_work);
151 	struct bpf_prog_array *old_array;
152 	struct list_head *storages = &cgrp->bpf.storages;
153 	struct bpf_cgroup_storage *storage, *stmp;
154 
155 	unsigned int atype;
156 
157 	mutex_lock(&cgroup_mutex);
158 
159 	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
160 		struct hlist_head *progs = &cgrp->bpf.progs[atype];
161 		struct bpf_prog_list *pl;
162 		struct hlist_node *pltmp;
163 
164 		hlist_for_each_entry_safe(pl, pltmp, progs, node) {
165 			hlist_del(&pl->node);
166 			if (pl->prog)
167 				bpf_prog_put(pl->prog);
168 			if (pl->link)
169 				bpf_cgroup_link_auto_detach(pl->link);
170 			kfree(pl);
171 			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
172 		}
173 		old_array = rcu_dereference_protected(
174 				cgrp->bpf.effective[atype],
175 				lockdep_is_held(&cgroup_mutex));
176 		bpf_prog_array_free(old_array);
177 	}
178 
179 	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
180 		bpf_cgroup_storage_unlink(storage);
181 		bpf_cgroup_storage_free(storage);
182 	}
183 
184 	mutex_unlock(&cgroup_mutex);
185 
186 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
187 		cgroup_bpf_put(p);
188 
189 	percpu_ref_exit(&cgrp->bpf.refcnt);
190 	cgroup_put(cgrp);
191 }
192 
193 /**
194  * cgroup_bpf_release_fn() - callback used to schedule releasing
195  *                           of bpf cgroup data
196  * @ref: percpu ref counter structure
197  */
198 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
199 {
200 	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
201 
202 	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
203 	queue_work(system_wq, &cgrp->bpf.release_work);
204 }
205 
206 /* Get the underlying bpf_prog of a bpf_prog_list entry, regardless of
207  * whether it's attached through a link or directly as a prog.
208  */
209 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
210 {
211 	if (pl->prog)
212 		return pl->prog;
213 	if (pl->link)
214 		return pl->link->link.prog;
215 	return NULL;
216 }
217 
218 /* Count the number of elements in the list.
219  * It's slow, but the list cannot be long.
220  */
221 static u32 prog_list_length(struct hlist_head *head)
222 {
223 	struct bpf_prog_list *pl;
224 	u32 cnt = 0;
225 
226 	hlist_for_each_entry(pl, head, node) {
227 		if (!prog_list_prog(pl))
228 			continue;
229 		cnt++;
230 	}
231 	return cnt;
232 }
233 
234 /* If the parent has a non-overridable prog attached, disallow attaching
235  * new programs to the descendant cgroup.
236  * If the parent has an overridable or multi-prog attached, allow attaching.
237  */
238 static bool hierarchy_allows_attach(struct cgroup *cgrp,
239 				    enum cgroup_bpf_attach_type atype)
240 {
241 	struct cgroup *p;
242 
243 	p = cgroup_parent(cgrp);
244 	if (!p)
245 		return true;
246 	do {
247 		u32 flags = p->bpf.flags[atype];
248 		u32 cnt;
249 
250 		if (flags & BPF_F_ALLOW_MULTI)
251 			return true;
252 		cnt = prog_list_length(&p->bpf.progs[atype]);
253 		WARN_ON_ONCE(cnt > 1);
254 		if (cnt == 1)
255 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
256 		p = cgroup_parent(p);
257 	} while (p);
258 	return true;
259 }
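
/* A worked example of the rule above, looking only at the ancestors of the
 * cgroup being attached to:
 *
 *	ancestor state				attach to descendant?
 *	BPF_F_ALLOW_MULTI			allowed
 *	one prog, BPF_F_ALLOW_OVERRIDE		allowed
 *	one prog, no flags			rejected (-EPERM in the caller)
 *	no prog attached			keep walking toward the root
 */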
260 
261 /* compute a chain of effective programs for a given cgroup:
262  * start from the list of programs in this cgroup and add
263  * all parent programs.
264  * Note that a parent's F_ALLOW_OVERRIDE-type program yields to
265  * programs in this cgroup.
266  */
267 static int compute_effective_progs(struct cgroup *cgrp,
268 				   enum cgroup_bpf_attach_type atype,
269 				   struct bpf_prog_array **array)
270 {
271 	struct bpf_prog_array_item *item;
272 	struct bpf_prog_array *progs;
273 	struct bpf_prog_list *pl;
274 	struct cgroup *p = cgrp;
275 	int cnt = 0;
276 
277 	/* count number of effective programs by walking parents */
278 	do {
279 		if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
280 			cnt += prog_list_length(&p->bpf.progs[atype]);
281 		p = cgroup_parent(p);
282 	} while (p);
283 
284 	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
285 	if (!progs)
286 		return -ENOMEM;
287 
288 	/* populate the array with effective progs */
289 	cnt = 0;
290 	p = cgrp;
291 	do {
292 		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
293 			continue;
294 
295 		hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
296 			if (!prog_list_prog(pl))
297 				continue;
298 
299 			item = &progs->items[cnt];
300 			item->prog = prog_list_prog(pl);
301 			bpf_cgroup_storages_assign(item->cgroup_storage,
302 						   pl->storage);
303 			cnt++;
304 		}
305 	} while ((p = cgroup_parent(p)));
306 
307 	*array = progs;
308 	return 0;
309 }
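
/* Layout of the resulting array, as built by the loop above: the cgroup's own
 * programs occupy the lowest indices, followed by its parent's, and so on up
 * to the root, and bpf_prog_run_array_cg() executes them in that order. For
 * example, with progs [A, B] attached to the root with BPF_F_ALLOW_MULTI and
 * prog [C] attached to a child, the child's effective array is [C, A, B].
 */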
310 
311 static void activate_effective_progs(struct cgroup *cgrp,
312 				     enum cgroup_bpf_attach_type atype,
313 				     struct bpf_prog_array *old_array)
314 {
315 	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
316 					lockdep_is_held(&cgroup_mutex));
317 	/* free prog array after grace period, since __cgroup_bpf_run_*()
318 	 * might still be walking the array
319 	 */
320 	bpf_prog_array_free(old_array);
321 }
322 
323 /**
324  * cgroup_bpf_inherit() - inherit effective programs from parent
325  * @cgrp: the cgroup to modify
326  */
327 int cgroup_bpf_inherit(struct cgroup *cgrp)
328 {
329 /* has to use a macro instead of a const int, since the compiler thinks
330  * that the array below is variable length
331  */
332 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
333 	struct bpf_prog_array *arrays[NR] = {};
334 	struct cgroup *p;
335 	int ret, i;
336 
337 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
338 			      GFP_KERNEL);
339 	if (ret)
340 		return ret;
341 
342 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
343 		cgroup_bpf_get(p);
344 
345 	for (i = 0; i < NR; i++)
346 		INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
347 
348 	INIT_LIST_HEAD(&cgrp->bpf.storages);
349 
350 	for (i = 0; i < NR; i++)
351 		if (compute_effective_progs(cgrp, i, &arrays[i]))
352 			goto cleanup;
353 
354 	for (i = 0; i < NR; i++)
355 		activate_effective_progs(cgrp, i, arrays[i]);
356 
357 	return 0;
358 cleanup:
359 	for (i = 0; i < NR; i++)
360 		bpf_prog_array_free(arrays[i]);
361 
362 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
363 		cgroup_bpf_put(p);
364 
365 	percpu_ref_exit(&cgrp->bpf.refcnt);
366 
367 	return -ENOMEM;
368 }
369 
370 static int update_effective_progs(struct cgroup *cgrp,
371 				  enum cgroup_bpf_attach_type atype)
372 {
373 	struct cgroup_subsys_state *css;
374 	int err;
375 
376 	/* allocate and recompute effective prog arrays */
377 	css_for_each_descendant_pre(css, &cgrp->self) {
378 		struct cgroup *desc = container_of(css, struct cgroup, self);
379 
380 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
381 			continue;
382 
383 		err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
384 		if (err)
385 			goto cleanup;
386 	}
387 
388 	/* all allocations were successful. Activate all prog arrays */
389 	css_for_each_descendant_pre(css, &cgrp->self) {
390 		struct cgroup *desc = container_of(css, struct cgroup, self);
391 
392 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
393 			if (unlikely(desc->bpf.inactive)) {
394 				bpf_prog_array_free(desc->bpf.inactive);
395 				desc->bpf.inactive = NULL;
396 			}
397 			continue;
398 		}
399 
400 		activate_effective_progs(desc, atype, desc->bpf.inactive);
401 		desc->bpf.inactive = NULL;
402 	}
403 
404 	return 0;
405 
406 cleanup:
407 	/* oom while computing effective. Free all computed effective arrays
408 	 * since they were not activated
409 	 */
410 	css_for_each_descendant_pre(css, &cgrp->self) {
411 		struct cgroup *desc = container_of(css, struct cgroup, self);
412 
413 		bpf_prog_array_free(desc->bpf.inactive);
414 		desc->bpf.inactive = NULL;
415 	}
416 
417 	return err;
418 }
419 
420 #define BPF_CGROUP_MAX_PROGS 64
421 
422 static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
423 					       struct bpf_prog *prog,
424 					       struct bpf_cgroup_link *link,
425 					       struct bpf_prog *replace_prog,
426 					       bool allow_multi)
427 {
428 	struct bpf_prog_list *pl;
429 
430 	/* single-attach case */
431 	if (!allow_multi) {
432 		if (hlist_empty(progs))
433 			return NULL;
434 		return hlist_entry(progs->first, typeof(*pl), node);
435 	}
436 
437 	hlist_for_each_entry(pl, progs, node) {
438 		if (prog && pl->prog == prog && prog != replace_prog)
439 			/* disallow attaching the same prog twice */
440 			return ERR_PTR(-EINVAL);
441 		if (link && pl->link == link)
442 			/* disallow attaching the same link twice */
443 			return ERR_PTR(-EINVAL);
444 	}
445 
446 	/* direct prog multi-attach w/ replacement case */
447 	if (replace_prog) {
448 		hlist_for_each_entry(pl, progs, node) {
449 			if (pl->prog == replace_prog)
450 				/* a match found */
451 				return pl;
452 		}
453 		/* prog to replace not found for cgroup */
454 		return ERR_PTR(-ENOENT);
455 	}
456 
457 	return NULL;
458 }
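
/* find_attach_entry() has three outcomes for __cgroup_bpf_attach():
 *   NULL      - no existing slot; a new bpf_prog_list entry is allocated and
 *               appended to the list
 *   valid pl  - an existing entry to update in place (single-attach mode, or
 *               BPF_F_REPLACE with a matching replace_prog)
 *   ERR_PTR   - -EINVAL for a duplicate prog/link, -ENOENT when the requested
 *               replace_prog is not attached to this cgroup
 */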
459 
460 /**
461  * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
462  *                         propagate the change to descendants
463  * @cgrp: The cgroup which descendants to traverse
464  * @prog: A program to attach
465  * @link: A link to attach
466  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
467  * @type: Type of attach operation
468  * @flags: Option flags
469  *
470  * Exactly one of @prog or @link can be non-null.
471  * Must be called with cgroup_mutex held.
472  */
473 static int __cgroup_bpf_attach(struct cgroup *cgrp,
474 			       struct bpf_prog *prog, struct bpf_prog *replace_prog,
475 			       struct bpf_cgroup_link *link,
476 			       enum bpf_attach_type type, u32 flags)
477 {
478 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
479 	struct bpf_prog *old_prog = NULL;
480 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
481 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
482 	enum cgroup_bpf_attach_type atype;
483 	struct bpf_prog_list *pl;
484 	struct hlist_head *progs;
485 	int err;
486 
487 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
488 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
489 		/* invalid combination */
490 		return -EINVAL;
491 	if (link && (prog || replace_prog))
492 		/* only either link or prog/replace_prog can be specified */
493 		return -EINVAL;
494 	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
495 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
496 		return -EINVAL;
497 
498 	atype = to_cgroup_bpf_attach_type(type);
499 	if (atype < 0)
500 		return -EINVAL;
501 
502 	progs = &cgrp->bpf.progs[atype];
503 
504 	if (!hierarchy_allows_attach(cgrp, atype))
505 		return -EPERM;
506 
507 	if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
508 		/* Disallow attaching a non-overridable prog on top of an existing
509 		 * overridable one in this cgroup. Disallow attaching a multi-prog
510 		 * if an overridable or plain (no-flag) prog is already attached
511 		 */
512 		return -EPERM;
513 
514 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
515 		return -E2BIG;
516 
517 	pl = find_attach_entry(progs, prog, link, replace_prog,
518 			       flags & BPF_F_ALLOW_MULTI);
519 	if (IS_ERR(pl))
520 		return PTR_ERR(pl);
521 
522 	if (bpf_cgroup_storages_alloc(storage, new_storage, type,
523 				      prog ? : link->link.prog, cgrp))
524 		return -ENOMEM;
525 
526 	if (pl) {
527 		old_prog = pl->prog;
528 	} else {
529 		struct hlist_node *last = NULL;
530 
531 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
532 		if (!pl) {
533 			bpf_cgroup_storages_free(new_storage);
534 			return -ENOMEM;
535 		}
536 		if (hlist_empty(progs))
537 			hlist_add_head(&pl->node, progs);
538 		else
539 			hlist_for_each(last, progs) {
540 				if (last->next)
541 					continue;
542 				hlist_add_behind(&pl->node, last);
543 				break;
544 			}
545 	}
546 
547 	pl->prog = prog;
548 	pl->link = link;
549 	bpf_cgroup_storages_assign(pl->storage, storage);
550 	cgrp->bpf.flags[atype] = saved_flags;
551 
552 	err = update_effective_progs(cgrp, atype);
553 	if (err)
554 		goto cleanup;
555 
556 	if (old_prog)
557 		bpf_prog_put(old_prog);
558 	else
559 		static_branch_inc(&cgroup_bpf_enabled_key[atype]);
560 	bpf_cgroup_storages_link(new_storage, cgrp, type);
561 	return 0;
562 
563 cleanup:
564 	if (old_prog) {
565 		pl->prog = old_prog;
566 		pl->link = NULL;
567 	}
568 	bpf_cgroup_storages_free(new_storage);
569 	if (!old_prog) {
570 		hlist_del(&pl->node);
571 		kfree(pl);
572 	}
573 	return err;
574 }
575 
576 static int cgroup_bpf_attach(struct cgroup *cgrp,
577 			     struct bpf_prog *prog, struct bpf_prog *replace_prog,
578 			     struct bpf_cgroup_link *link,
579 			     enum bpf_attach_type type,
580 			     u32 flags)
581 {
582 	int ret;
583 
584 	mutex_lock(&cgroup_mutex);
585 	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
586 	mutex_unlock(&cgroup_mutex);
587 	return ret;
588 }
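
/* cgroup_bpf_attach() is reached both from the BPF_PROG_ATTACH command (see
 * cgroup_bpf_prog_attach() below) and from link creation. A minimal userspace
 * sketch of the legacy prog-based path via libbpf, with prog_fd a loaded
 * program FD, "/sys/fs/cgroup/foo" a stand-in cgroup path, and error handling
 * elided:
 *
 *	int cg_fd = open("/sys/fs/cgroup/foo", O_RDONLY);
 *
 *	err = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_EGRESS,
 *			      BPF_F_ALLOW_MULTI);
 */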
589 
590 /* Swap updated BPF program for given link in effective program arrays across
591  * all descendant cgroups. This function is guaranteed to succeed.
592  */
593 static void replace_effective_prog(struct cgroup *cgrp,
594 				   enum cgroup_bpf_attach_type atype,
595 				   struct bpf_cgroup_link *link)
596 {
597 	struct bpf_prog_array_item *item;
598 	struct cgroup_subsys_state *css;
599 	struct bpf_prog_array *progs;
600 	struct bpf_prog_list *pl;
601 	struct hlist_head *head;
602 	struct cgroup *cg;
603 	int pos;
604 
605 	css_for_each_descendant_pre(css, &cgrp->self) {
606 		struct cgroup *desc = container_of(css, struct cgroup, self);
607 
608 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
609 			continue;
610 
611 		/* find position of link in effective progs array */
612 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
613 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
614 				continue;
615 
616 			head = &cg->bpf.progs[atype];
617 			hlist_for_each_entry(pl, head, node) {
618 				if (!prog_list_prog(pl))
619 					continue;
620 				if (pl->link == link)
621 					goto found;
622 				pos++;
623 			}
624 		}
625 found:
626 		BUG_ON(!cg);
627 		progs = rcu_dereference_protected(
628 				desc->bpf.effective[atype],
629 				lockdep_is_held(&cgroup_mutex));
630 		item = &progs->items[pos];
631 		WRITE_ONCE(item->prog, link->link.prog);
632 	}
633 }
634 
635 /**
636  * __cgroup_bpf_replace() - Replace link's program and propagate the change
637  *                          to descendants
638  * @cgrp: The cgroup which descendants to traverse
639  * @link: A link for which to replace BPF program
640  * @type: Type of attach operation
641  *
642  * Must be called with cgroup_mutex held.
643  */
644 static int __cgroup_bpf_replace(struct cgroup *cgrp,
645 				struct bpf_cgroup_link *link,
646 				struct bpf_prog *new_prog)
647 {
648 	enum cgroup_bpf_attach_type atype;
649 	struct bpf_prog *old_prog;
650 	struct bpf_prog_list *pl;
651 	struct hlist_head *progs;
652 	bool found = false;
653 
654 	atype = to_cgroup_bpf_attach_type(link->type);
655 	if (atype < 0)
656 		return -EINVAL;
657 
658 	progs = &cgrp->bpf.progs[atype];
659 
660 	if (link->link.prog->type != new_prog->type)
661 		return -EINVAL;
662 
663 	hlist_for_each_entry(pl, progs, node) {
664 		if (pl->link == link) {
665 			found = true;
666 			break;
667 		}
668 	}
669 	if (!found)
670 		return -ENOENT;
671 
672 	old_prog = xchg(&link->link.prog, new_prog);
673 	replace_effective_prog(cgrp, atype, link);
674 	bpf_prog_put(old_prog);
675 	return 0;
676 }
677 
678 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
679 			      struct bpf_prog *old_prog)
680 {
681 	struct bpf_cgroup_link *cg_link;
682 	int ret;
683 
684 	cg_link = container_of(link, struct bpf_cgroup_link, link);
685 
686 	mutex_lock(&cgroup_mutex);
687 	/* link might have been auto-released by dying cgroup, so fail */
688 	if (!cg_link->cgroup) {
689 		ret = -ENOLINK;
690 		goto out_unlock;
691 	}
692 	if (old_prog && link->prog != old_prog) {
693 		ret = -EPERM;
694 		goto out_unlock;
695 	}
696 	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
697 out_unlock:
698 	mutex_unlock(&cgroup_mutex);
699 	return ret;
700 }
701 
702 static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
703 					       struct bpf_prog *prog,
704 					       struct bpf_cgroup_link *link,
705 					       bool allow_multi)
706 {
707 	struct bpf_prog_list *pl;
708 
709 	if (!allow_multi) {
710 		if (hlist_empty(progs))
711 			/* report error when trying to detach and nothing is attached */
712 			return ERR_PTR(-ENOENT);
713 
714 		/* To maintain backward compatibility, NONE and OVERRIDE cgroups
715 		 * allow detaching with an invalid FD (prog == NULL) in legacy mode
716 		 */
717 		return hlist_entry(progs->first, typeof(*pl), node);
718 	}
719 
720 	if (!prog && !link)
721 		/* to detach MULTI prog the user has to specify valid FD
722 		 * of the program or link to be detached
723 		 */
724 		return ERR_PTR(-EINVAL);
725 
726 	/* find the prog or link and detach it */
727 	hlist_for_each_entry(pl, progs, node) {
728 		if (pl->prog == prog && pl->link == link)
729 			return pl;
730 	}
731 	return ERR_PTR(-ENOENT);
732 }
733 
734 /**
735  * purge_effective_progs() - After compute_effective_progs fails to alloc new
736  *                           cgrp->bpf.inactive table we can recover by
737  *                           recomputing the array in place.
738  *
739  * @cgrp: The cgroup which descendants to traverse
740  * @prog: A program to detach or NULL
741  * @link: A link to detach or NULL
742  * @atype: Type of detach operation
743  */
744 static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
745 				  struct bpf_cgroup_link *link,
746 				  enum cgroup_bpf_attach_type atype)
747 {
748 	struct cgroup_subsys_state *css;
749 	struct bpf_prog_array *progs;
750 	struct bpf_prog_list *pl;
751 	struct hlist_head *head;
752 	struct cgroup *cg;
753 	int pos;
754 
755 	/* recompute effective prog array in place */
756 	css_for_each_descendant_pre(css, &cgrp->self) {
757 		struct cgroup *desc = container_of(css, struct cgroup, self);
758 
759 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
760 			continue;
761 
762 		/* find position of link or prog in effective progs array */
763 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
764 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
765 				continue;
766 
767 			head = &cg->bpf.progs[atype];
768 			hlist_for_each_entry(pl, head, node) {
769 				if (!prog_list_prog(pl))
770 					continue;
771 				if (pl->prog == prog && pl->link == link)
772 					goto found;
773 				pos++;
774 			}
775 		}
776 found:
777 		BUG_ON(!cg);
778 		progs = rcu_dereference_protected(
779 				desc->bpf.effective[atype],
780 				lockdep_is_held(&cgroup_mutex));
781 
782 		/* Remove the program from the array */
783 		WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
784 			  "Failed to purge a prog from array at index %d", pos);
785 	}
786 }
787 
788 /**
789  * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
790  *                         propagate the change to descendants
791  * @cgrp: The cgroup which descendants to traverse
792  * @prog: A program to detach or NULL
793  * @link: A link to detach or NULL
794  * @type: Type of detach operation
795  *
796  * At most one of @prog or @link can be non-NULL.
797  * Must be called with cgroup_mutex held.
798  */
799 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
800 			       struct bpf_cgroup_link *link, enum bpf_attach_type type)
801 {
802 	enum cgroup_bpf_attach_type atype;
803 	struct bpf_prog *old_prog;
804 	struct bpf_prog_list *pl;
805 	struct hlist_head *progs;
806 	u32 flags;
807 
808 	atype = to_cgroup_bpf_attach_type(type);
809 	if (atype < 0)
810 		return -EINVAL;
811 
812 	progs = &cgrp->bpf.progs[atype];
813 	flags = cgrp->bpf.flags[atype];
814 
815 	if (prog && link)
816 		/* only one of prog or link can be specified */
817 		return -EINVAL;
818 
819 	pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
820 	if (IS_ERR(pl))
821 		return PTR_ERR(pl);
822 
823 	/* mark it deleted, so it's ignored while recomputing effective */
824 	old_prog = pl->prog;
825 	pl->prog = NULL;
826 	pl->link = NULL;
827 
828 	if (update_effective_progs(cgrp, atype)) {
829 		/* if updating the effective array failed, restore the entry and purge the prog in place */
830 		pl->prog = old_prog;
831 		pl->link = link;
832 		purge_effective_progs(cgrp, old_prog, link, atype);
833 	}
834 
835 	/* now can actually delete it from this cgroup list */
836 	hlist_del(&pl->node);
837 
838 	kfree(pl);
839 	if (hlist_empty(progs))
840 		/* last program was detached, reset flags to zero */
841 		cgrp->bpf.flags[atype] = 0;
842 	if (old_prog)
843 		bpf_prog_put(old_prog);
844 	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
845 	return 0;
846 }
847 
848 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
849 			     enum bpf_attach_type type)
850 {
851 	int ret;
852 
853 	mutex_lock(&cgroup_mutex);
854 	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
855 	mutex_unlock(&cgroup_mutex);
856 	return ret;
857 }
858 
859 /* Must be called with cgroup_mutex held to avoid races. */
860 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
861 			      union bpf_attr __user *uattr)
862 {
863 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
864 	enum bpf_attach_type type = attr->query.attach_type;
865 	enum cgroup_bpf_attach_type atype;
866 	struct bpf_prog_array *effective;
867 	struct hlist_head *progs;
868 	struct bpf_prog *prog;
869 	int cnt, ret = 0, i;
870 	u32 flags;
871 
872 	atype = to_cgroup_bpf_attach_type(type);
873 	if (atype < 0)
874 		return -EINVAL;
875 
876 	progs = &cgrp->bpf.progs[atype];
877 	flags = cgrp->bpf.flags[atype];
878 
879 	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
880 					      lockdep_is_held(&cgroup_mutex));
881 
882 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
883 		cnt = bpf_prog_array_length(effective);
884 	else
885 		cnt = prog_list_length(progs);
886 
887 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
888 		return -EFAULT;
889 	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
890 		return -EFAULT;
891 	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
892 		/* return early if user requested only program count + flags */
893 		return 0;
894 	if (attr->query.prog_cnt < cnt) {
895 		cnt = attr->query.prog_cnt;
896 		ret = -ENOSPC;
897 	}
898 
899 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
900 		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
901 	} else {
902 		struct bpf_prog_list *pl;
903 		u32 id;
904 
905 		i = 0;
906 		hlist_for_each_entry(pl, progs, node) {
907 			prog = prog_list_prog(pl);
908 			id = prog->aux->id;
909 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
910 				return -EFAULT;
911 			if (++i == cnt)
912 				break;
913 		}
914 	}
915 	return ret;
916 }
917 
918 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
919 			    union bpf_attr __user *uattr)
920 {
921 	int ret;
922 
923 	mutex_lock(&cgroup_mutex);
924 	ret = __cgroup_bpf_query(cgrp, attr, uattr);
925 	mutex_unlock(&cgroup_mutex);
926 	return ret;
927 }
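
/* The query path above backs the BPF_PROG_QUERY command. A minimal userspace
 * sketch via libbpf, with cg_fd an open cgroup directory FD and error
 * handling elided:
 *
 *	__u32 ids[64], cnt = 64, attach_flags = 0;
 *
 *	err = bpf_prog_query(cg_fd, BPF_CGROUP_INET_EGRESS,
 *			     BPF_F_QUERY_EFFECTIVE, &attach_flags, ids, &cnt);
 *	// if the array is too small the kernel still fills the first cnt
 *	// entries but the query fails with ENOSPC
 */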
928 
929 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
930 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
931 {
932 	struct bpf_prog *replace_prog = NULL;
933 	struct cgroup *cgrp;
934 	int ret;
935 
936 	cgrp = cgroup_get_from_fd(attr->target_fd);
937 	if (IS_ERR(cgrp))
938 		return PTR_ERR(cgrp);
939 
940 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
941 	    (attr->attach_flags & BPF_F_REPLACE)) {
942 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
943 		if (IS_ERR(replace_prog)) {
944 			cgroup_put(cgrp);
945 			return PTR_ERR(replace_prog);
946 		}
947 	}
948 
949 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
950 				attr->attach_type, attr->attach_flags);
951 
952 	if (replace_prog)
953 		bpf_prog_put(replace_prog);
954 	cgroup_put(cgrp);
955 	return ret;
956 }
957 
958 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
959 {
960 	struct bpf_prog *prog;
961 	struct cgroup *cgrp;
962 	int ret;
963 
964 	cgrp = cgroup_get_from_fd(attr->target_fd);
965 	if (IS_ERR(cgrp))
966 		return PTR_ERR(cgrp);
967 
968 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
969 	if (IS_ERR(prog))
970 		prog = NULL;
971 
972 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
973 	if (prog)
974 		bpf_prog_put(prog);
975 
976 	cgroup_put(cgrp);
977 	return ret;
978 }
979 
980 static void bpf_cgroup_link_release(struct bpf_link *link)
981 {
982 	struct bpf_cgroup_link *cg_link =
983 		container_of(link, struct bpf_cgroup_link, link);
984 	struct cgroup *cg;
985 
986 	/* link might have been auto-detached by dying cgroup already,
987 	 * in that case our work is done here
988 	 */
989 	if (!cg_link->cgroup)
990 		return;
991 
992 	mutex_lock(&cgroup_mutex);
993 
994 	/* re-check cgroup under lock again */
995 	if (!cg_link->cgroup) {
996 		mutex_unlock(&cgroup_mutex);
997 		return;
998 	}
999 
1000 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
1001 				    cg_link->type));
1002 
1003 	cg = cg_link->cgroup;
1004 	cg_link->cgroup = NULL;
1005 
1006 	mutex_unlock(&cgroup_mutex);
1007 
1008 	cgroup_put(cg);
1009 }
1010 
1011 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1012 {
1013 	struct bpf_cgroup_link *cg_link =
1014 		container_of(link, struct bpf_cgroup_link, link);
1015 
1016 	kfree(cg_link);
1017 }
1018 
1019 static int bpf_cgroup_link_detach(struct bpf_link *link)
1020 {
1021 	bpf_cgroup_link_release(link);
1022 
1023 	return 0;
1024 }
1025 
1026 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1027 					struct seq_file *seq)
1028 {
1029 	struct bpf_cgroup_link *cg_link =
1030 		container_of(link, struct bpf_cgroup_link, link);
1031 	u64 cg_id = 0;
1032 
1033 	mutex_lock(&cgroup_mutex);
1034 	if (cg_link->cgroup)
1035 		cg_id = cgroup_id(cg_link->cgroup);
1036 	mutex_unlock(&cgroup_mutex);
1037 
1038 	seq_printf(seq,
1039 		   "cgroup_id:\t%llu\n"
1040 		   "attach_type:\t%d\n",
1041 		   cg_id,
1042 		   cg_link->type);
1043 }
1044 
1045 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1046 					  struct bpf_link_info *info)
1047 {
1048 	struct bpf_cgroup_link *cg_link =
1049 		container_of(link, struct bpf_cgroup_link, link);
1050 	u64 cg_id = 0;
1051 
1052 	mutex_lock(&cgroup_mutex);
1053 	if (cg_link->cgroup)
1054 		cg_id = cgroup_id(cg_link->cgroup);
1055 	mutex_unlock(&cgroup_mutex);
1056 
1057 	info->cgroup.cgroup_id = cg_id;
1058 	info->cgroup.attach_type = cg_link->type;
1059 	return 0;
1060 }
1061 
1062 static const struct bpf_link_ops bpf_cgroup_link_lops = {
1063 	.release = bpf_cgroup_link_release,
1064 	.dealloc = bpf_cgroup_link_dealloc,
1065 	.detach = bpf_cgroup_link_detach,
1066 	.update_prog = cgroup_bpf_replace,
1067 	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1068 	.fill_link_info = bpf_cgroup_link_fill_link_info,
1069 };
1070 
1071 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1072 {
1073 	struct bpf_link_primer link_primer;
1074 	struct bpf_cgroup_link *link;
1075 	struct cgroup *cgrp;
1076 	int err;
1077 
1078 	if (attr->link_create.flags)
1079 		return -EINVAL;
1080 
1081 	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1082 	if (IS_ERR(cgrp))
1083 		return PTR_ERR(cgrp);
1084 
1085 	link = kzalloc(sizeof(*link), GFP_USER);
1086 	if (!link) {
1087 		err = -ENOMEM;
1088 		goto out_put_cgroup;
1089 	}
1090 	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1091 		      prog);
1092 	link->cgroup = cgrp;
1093 	link->type = attr->link_create.attach_type;
1094 
1095 	err = bpf_link_prime(&link->link, &link_primer);
1096 	if (err) {
1097 		kfree(link);
1098 		goto out_put_cgroup;
1099 	}
1100 
1101 	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1102 				link->type, BPF_F_ALLOW_MULTI);
1103 	if (err) {
1104 		bpf_link_cleanup(&link_primer);
1105 		goto out_put_cgroup;
1106 	}
1107 
1108 	return bpf_link_settle(&link_primer);
1109 
1110 out_put_cgroup:
1111 	cgroup_put(cgrp);
1112 	return err;
1113 }
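
/* The link-based attach above is what BPF_LINK_CREATE with a cgroup target
 * boils down to. A minimal userspace sketch via libbpf, with prog being a
 * loaded struct bpf_program and error handling elided:
 *
 *	struct bpf_link *l = bpf_program__attach_cgroup(prog, cg_fd);
 *
 * Unlike legacy attach, the program stays attached only while the link (its
 * FD or a pinned path) is alive, and a dying cgroup auto-detaches it via
 * bpf_cgroup_link_auto_detach() above.
 */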
1114 
1115 int cgroup_bpf_prog_query(const union bpf_attr *attr,
1116 			  union bpf_attr __user *uattr)
1117 {
1118 	struct cgroup *cgrp;
1119 	int ret;
1120 
1121 	cgrp = cgroup_get_from_fd(attr->query.target_fd);
1122 	if (IS_ERR(cgrp))
1123 		return PTR_ERR(cgrp);
1124 
1125 	ret = cgroup_bpf_query(cgrp, attr, uattr);
1126 
1127 	cgroup_put(cgrp);
1128 	return ret;
1129 }
1130 
1131 /**
1132  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1133  * @sk: The socket sending or receiving traffic
1134  * @skb: The skb that is being sent or received
1135  * @atype: The type of program to be executed
1136  *
1137  * If no socket is passed, or the socket is not of type INET or INET6,
1138  * this function does nothing and returns 0.
1139  *
1140  * The program type passed in via @atype must be suitable for network
1141  * filtering. No further check is performed to assert that.
1142  *
1143  * For egress packets, this function can return:
1144  *   NET_XMIT_SUCCESS    (0)	- continue with packet output
1145  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
1146  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
1147  *				  to call cwr
1148  *   -err			- drop packet
1149  *
1150  * For ingress packets, this function will return -EPERM if any
1151  * attached program was found and if it returned != 1 during execution.
1152  * Otherwise 0 is returned.
1153  */
1154 int __cgroup_bpf_run_filter_skb(struct sock *sk,
1155 				struct sk_buff *skb,
1156 				enum cgroup_bpf_attach_type atype)
1157 {
1158 	unsigned int offset = skb->data - skb_network_header(skb);
1159 	struct sock *save_sk;
1160 	void *saved_data_end;
1161 	struct cgroup *cgrp;
1162 	int ret;
1163 
1164 	if (!sk || !sk_fullsock(sk))
1165 		return 0;
1166 
1167 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1168 		return 0;
1169 
1170 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1171 	save_sk = skb->sk;
1172 	skb->sk = sk;
1173 	__skb_push(skb, offset);
1174 
1175 	/* compute pointers for the bpf prog */
1176 	bpf_compute_and_save_data_end(skb, &saved_data_end);
1177 
1178 	if (atype == CGROUP_INET_EGRESS) {
1179 		u32 flags = 0;
1180 		bool cn;
1181 
1182 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1183 					    __bpf_prog_run_save_cb, 0, &flags);
1184 
1185 		/* Return values of CGROUP EGRESS BPF programs are:
1186 		 *   0: drop packet
1187 		 *   1: keep packet
1188 		 *   2: drop packet and cn
1189 		 *   3: keep packet and cn
1190 		 *
1191 		 * The returned value is then converted to one of the NET_XMIT
1192 		 * or an error code that is then interpreted as drop packet
1193 		 * (and no cn):
1194 		 *   0: NET_XMIT_SUCCESS  skb should be transmitted
1195 		 *   1: NET_XMIT_DROP     skb should be dropped and cn
1196 		 *   2: NET_XMIT_CN       skb should be transmitted and cn
1197 		 *   3: -err              skb should be dropped
1198 		 */
1199 
1200 		cn = flags & BPF_RET_SET_CN;
1201 		if (ret && !IS_ERR_VALUE((long)ret))
1202 			ret = -EFAULT;
1203 		if (!ret)
1204 			ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1205 		else
1206 			ret = (cn ? NET_XMIT_DROP : ret);
1207 	} else {
1208 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1209 					    skb, __bpf_prog_run_save_cb, 0,
1210 					    NULL);
1211 		if (ret && !IS_ERR_VALUE((long)ret))
1212 			ret = -EFAULT;
1213 	}
1214 	bpf_restore_data_end(skb, saved_data_end);
1215 	__skb_pull(skb, offset);
1216 	skb->sk = save_sk;
1217 
1218 	return ret;
1219 }
1220 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
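
/* Worked example of the egress mapping above: a single program returning 3
 * (keep packet and cn) leaves retval at 0 and sets the cn flag, so the skb is
 * transmitted with NET_XMIT_CN; a program returning 0 turns retval into
 * -EPERM, which becomes NET_XMIT_DROP if another program requested cn, or is
 * passed up as the error itself (and the skb dropped) otherwise.
 */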
1221 
1222 /**
1223  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1224  * @sk: sock structure to manipulate
1225  * @atype: The type of program to be executed
1226  *
1227  * The socket passed is expected to be of type INET or INET6.
1228  *
1229  * The program type passed in via @atype must be suitable for sock
1230  * filtering. No further check is performed to assert that.
1231  *
1232  * This function will return %-EPERM if an attached program was found
1233  * and it returned != 1 during execution. In all other cases, 0 is returned.
1234  */
1235 int __cgroup_bpf_run_filter_sk(struct sock *sk,
1236 			       enum cgroup_bpf_attach_type atype)
1237 {
1238 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1239 
1240 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1241 				     NULL);
1242 }
1243 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1244 
1245 /**
1246  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1247  *                                       provided by user sockaddr
1248  * @sk: sock struct that will use sockaddr
1249  * @uaddr: sockaddr struct provided by user
1250  * @atype: The type of program to be executed
1251  * @t_ctx: Pointer to attach type specific context
1252  * @flags: Pointer to u32 which contains higher bits of BPF program
1253  *         return value (OR'ed together).
1254  *
1255  * The socket is expected to be of type INET or INET6.
1256  *
1257  * This function will return %-EPERM if an attached program is found and
1258  * returned value != 1 during execution. In all other cases, 0 is returned.
1259  */
1260 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1261 				      struct sockaddr *uaddr,
1262 				      enum cgroup_bpf_attach_type atype,
1263 				      void *t_ctx,
1264 				      u32 *flags)
1265 {
1266 	struct bpf_sock_addr_kern ctx = {
1267 		.sk = sk,
1268 		.uaddr = uaddr,
1269 		.t_ctx = t_ctx,
1270 	};
1271 	struct sockaddr_storage unspec;
1272 	struct cgroup *cgrp;
1273 
1274 	/* Check socket family since not all sockets represent network
1275 	 * endpoint (e.g. AF_UNIX).
1276 	 */
1277 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1278 		return 0;
1279 
1280 	if (!ctx.uaddr) {
1281 		memset(&unspec, 0, sizeof(unspec));
1282 		ctx.uaddr = (struct sockaddr *)&unspec;
1283 	}
1284 
1285 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1286 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1287 				     0, flags);
1288 }
1289 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1290 
1291 /**
1292  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1293  * @sk: socket to get cgroup from
1294  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1295  * sk with connection information (IP addresses, etc.). May not contain
1296  * cgroup info if it is a req sock.
1297  * @atype: The type of program to be executed
1298  *
1299  * The socket passed is expected to be of type INET or INET6.
1300  *
1301  * The program type passed in via @atype must be suitable for sock_ops
1302  * filtering. No further check is performed to assert that.
1303  *
1304  * This function will return %-EPERM if an attached program was found
1305  * and it returned != 1 during execution. In all other cases, 0 is returned.
1306  */
1307 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1308 				     struct bpf_sock_ops_kern *sock_ops,
1309 				     enum cgroup_bpf_attach_type atype)
1310 {
1311 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1312 
1313 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1314 				     0, NULL);
1315 }
1316 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1317 
1318 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1319 				      short access, enum cgroup_bpf_attach_type atype)
1320 {
1321 	struct cgroup *cgrp;
1322 	struct bpf_cgroup_dev_ctx ctx = {
1323 		.access_type = (access << 16) | dev_type,
1324 		.major = major,
1325 		.minor = minor,
1326 	};
1327 	int ret;
1328 
1329 	rcu_read_lock();
1330 	cgrp = task_dfl_cgroup(current);
1331 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1332 				    NULL);
1333 	rcu_read_unlock();
1334 
1335 	return ret;
1336 }
1337 
1338 BPF_CALL_0(bpf_get_retval)
1339 {
1340 	struct bpf_cg_run_ctx *ctx =
1341 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1342 
1343 	return ctx->retval;
1344 }
1345 
1346 static const struct bpf_func_proto bpf_get_retval_proto = {
1347 	.func		= bpf_get_retval,
1348 	.gpl_only	= false,
1349 	.ret_type	= RET_INTEGER,
1350 };
1351 
1352 BPF_CALL_1(bpf_set_retval, int, retval)
1353 {
1354 	struct bpf_cg_run_ctx *ctx =
1355 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1356 
1357 	ctx->retval = retval;
1358 	return 0;
1359 }
1360 
1361 static const struct bpf_func_proto bpf_set_retval_proto = {
1362 	.func		= bpf_set_retval,
1363 	.gpl_only	= false,
1364 	.ret_type	= RET_INTEGER,
1365 	.arg1_type	= ARG_ANYTHING,
1366 };
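
/* A minimal BPF-side sketch of the two helpers above; the program below is
 * hypothetical, but any cgroup attach type whose proto resolver falls back to
 * cgroup_base_func_proto() can call bpf_get_retval()/bpf_set_retval():
 *
 *	SEC("cgroup/setsockopt")
 *	int deny_with_eacces(struct bpf_sockopt *ctx)
 *	{
 *		bpf_set_retval(-EACCES);
 *		return 0;	// deny; bpf_prog_run_array_cg() keeps the
 *				// stored -EACCES instead of forcing -EPERM
 *	}
 */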
1367 
1368 static const struct bpf_func_proto *
1369 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1370 {
1371 	switch (func_id) {
1372 	case BPF_FUNC_get_current_uid_gid:
1373 		return &bpf_get_current_uid_gid_proto;
1374 	case BPF_FUNC_get_local_storage:
1375 		return &bpf_get_local_storage_proto;
1376 	case BPF_FUNC_get_current_cgroup_id:
1377 		return &bpf_get_current_cgroup_id_proto;
1378 	case BPF_FUNC_perf_event_output:
1379 		return &bpf_event_output_data_proto;
1380 	case BPF_FUNC_get_retval:
1381 		return &bpf_get_retval_proto;
1382 	case BPF_FUNC_set_retval:
1383 		return &bpf_set_retval_proto;
1384 	default:
1385 		return bpf_base_func_proto(func_id);
1386 	}
1387 }
1388 
1389 static const struct bpf_func_proto *
1390 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1391 {
1392 	return cgroup_base_func_proto(func_id, prog);
1393 }
1394 
1395 static bool cgroup_dev_is_valid_access(int off, int size,
1396 				       enum bpf_access_type type,
1397 				       const struct bpf_prog *prog,
1398 				       struct bpf_insn_access_aux *info)
1399 {
1400 	const int size_default = sizeof(__u32);
1401 
1402 	if (type == BPF_WRITE)
1403 		return false;
1404 
1405 	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1406 		return false;
1407 	/* The verifier guarantees that size > 0. */
1408 	if (off % size != 0)
1409 		return false;
1410 
1411 	switch (off) {
1412 	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1413 		bpf_ctx_record_field_size(info, size_default);
1414 		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1415 			return false;
1416 		break;
1417 	default:
1418 		if (size != size_default)
1419 			return false;
1420 	}
1421 
1422 	return true;
1423 }
1424 
1425 const struct bpf_prog_ops cg_dev_prog_ops = {
1426 };
1427 
1428 const struct bpf_verifier_ops cg_dev_verifier_ops = {
1429 	.get_func_proto		= cgroup_dev_func_proto,
1430 	.is_valid_access	= cgroup_dev_is_valid_access,
1431 };
1432 
1433 /**
1434  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1435  *
1436  * @head: sysctl table header
1437  * @table: sysctl table
1438  * @write: sysctl is being read (= 0) or written (= 1)
1439  * @buf: pointer to buffer (in and out)
1440  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1441  *	result is size of the new value if the program set one, initial value
1442  *	otherwise
1443  * @ppos: value-result argument: value is position at which read from or write
1444  *	to sysctl is happening, result is new position if program overrode it,
1445  *	initial value otherwise
1446  * @atype: type of program to be executed
1447  *
1448  * Program is run when sysctl is being accessed, either read or written, and
1449  * can allow or deny such access.
1450  *
1451  * This function will return %-EPERM if an attached program is found and
1452  * returned value != 1 during execution. In all other cases 0 is returned.
1453  */
1454 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1455 				   struct ctl_table *table, int write,
1456 				   char **buf, size_t *pcount, loff_t *ppos,
1457 				   enum cgroup_bpf_attach_type atype)
1458 {
1459 	struct bpf_sysctl_kern ctx = {
1460 		.head = head,
1461 		.table = table,
1462 		.write = write,
1463 		.ppos = ppos,
1464 		.cur_val = NULL,
1465 		.cur_len = PAGE_SIZE,
1466 		.new_val = NULL,
1467 		.new_len = 0,
1468 		.new_updated = 0,
1469 	};
1470 	struct cgroup *cgrp;
1471 	loff_t pos = 0;
1472 	int ret;
1473 
1474 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1475 	if (!ctx.cur_val ||
1476 	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1477 		/* Let BPF program decide how to proceed. */
1478 		ctx.cur_len = 0;
1479 	}
1480 
1481 	if (write && *buf && *pcount) {
1482 		/* BPF program should be able to override new value with a
1483 		 * buffer bigger than provided by user.
1484 		 */
1485 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1486 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1487 		if (ctx.new_val) {
1488 			memcpy(ctx.new_val, *buf, ctx.new_len);
1489 		} else {
1490 			/* Let BPF program decide how to proceed. */
1491 			ctx.new_len = 0;
1492 		}
1493 	}
1494 
1495 	rcu_read_lock();
1496 	cgrp = task_dfl_cgroup(current);
1497 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1498 				    NULL);
1499 	rcu_read_unlock();
1500 
1501 	kfree(ctx.cur_val);
1502 
1503 	if (ret == 1 && ctx.new_updated) {
1504 		kfree(*buf);
1505 		*buf = ctx.new_val;
1506 		*pcount = ctx.new_len;
1507 	} else {
1508 		kfree(ctx.new_val);
1509 	}
1510 
1511 	return ret;
1512 }
1513 
1514 #ifdef CONFIG_NET
1515 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1516 			     struct bpf_sockopt_buf *buf)
1517 {
1518 	if (unlikely(max_optlen < 0))
1519 		return -EINVAL;
1520 
1521 	if (unlikely(max_optlen > PAGE_SIZE)) {
1522 		/* We don't expose optvals that are greater than PAGE_SIZE
1523 		 * to the BPF program.
1524 		 */
1525 		max_optlen = PAGE_SIZE;
1526 	}
1527 
1528 	if (max_optlen <= sizeof(buf->data)) {
1529 		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1530 		 * bytes avoid the cost of kzalloc.
1531 		 */
1532 		ctx->optval = buf->data;
1533 		ctx->optval_end = ctx->optval + max_optlen;
1534 		return max_optlen;
1535 	}
1536 
1537 	ctx->optval = kzalloc(max_optlen, GFP_USER);
1538 	if (!ctx->optval)
1539 		return -ENOMEM;
1540 
1541 	ctx->optval_end = ctx->optval + max_optlen;
1542 
1543 	return max_optlen;
1544 }
1545 
1546 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1547 			     struct bpf_sockopt_buf *buf)
1548 {
1549 	if (ctx->optval == buf->data)
1550 		return;
1551 	kfree(ctx->optval);
1552 }
1553 
1554 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1555 				  struct bpf_sockopt_buf *buf)
1556 {
1557 	return ctx->optval != buf->data;
1558 }
1559 
1560 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
1561 				       int *optname, char __user *optval,
1562 				       int *optlen, char **kernel_optval)
1563 {
1564 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1565 	struct bpf_sockopt_buf buf = {};
1566 	struct bpf_sockopt_kern ctx = {
1567 		.sk = sk,
1568 		.level = *level,
1569 		.optname = *optname,
1570 	};
1571 	int ret, max_optlen;
1572 
1573 	/* Allocate a bit more than the initial user buffer for
1574 	 * BPF program. The canonical use case is overriding
1575 	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1576 	 */
1577 	max_optlen = max_t(int, 16, *optlen);
1578 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1579 	if (max_optlen < 0)
1580 		return max_optlen;
1581 
1582 	ctx.optlen = *optlen;
1583 
1584 	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
1585 		ret = -EFAULT;
1586 		goto out;
1587 	}
1588 
1589 	lock_sock(sk);
1590 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
1591 				    &ctx, bpf_prog_run, 0, NULL);
1592 	release_sock(sk);
1593 
1594 	if (ret)
1595 		goto out;
1596 
1597 	if (ctx.optlen == -1) {
1598 		/* optlen set to -1, bypass kernel */
1599 		ret = 1;
1600 	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1601 		/* optlen is out of bounds */
1602 		ret = -EFAULT;
1603 	} else {
1604 		/* optlen within bounds, run kernel handler */
1605 		ret = 0;
1606 
1607 		/* export any potential modifications */
1608 		*level = ctx.level;
1609 		*optname = ctx.optname;
1610 
1611 		/* optlen == 0 from BPF indicates that we should
1612 		 * use original userspace data.
1613 		 */
1614 		if (ctx.optlen != 0) {
1615 			*optlen = ctx.optlen;
1616 			/* We've used bpf_sockopt_kern->buf as an intermediary
1617 			 * storage, but the BPF program indicates that we need
1618 			 * to pass this data to the kernel setsockopt handler.
1619 			 * No way to export on-stack buf, have to allocate a
1620 			 * new buffer.
1621 			 */
1622 			if (!sockopt_buf_allocated(&ctx, &buf)) {
1623 				void *p = kmalloc(ctx.optlen, GFP_USER);
1624 
1625 				if (!p) {
1626 					ret = -ENOMEM;
1627 					goto out;
1628 				}
1629 				memcpy(p, ctx.optval, ctx.optlen);
1630 				*kernel_optval = p;
1631 			} else {
1632 				*kernel_optval = ctx.optval;
1633 			}
1634 			/* export and don't free sockopt buf */
1635 			return 0;
1636 		}
1637 	}
1638 
1639 out:
1640 	sockopt_free_buf(&ctx, &buf);
1641 	return ret;
1642 }
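
/* Summary of the ctx.optlen contract implemented above, as seen from the BPF
 * program:
 *   optlen == -1             - bypass the kernel setsockopt handler entirely
 *   optlen == 0              - run the kernel handler on the original user data
 *   0 < optlen <= max_optlen - run the kernel handler on the (possibly
 *                              rewritten) value in ctx->optval, exported to it
 *                              through *kernel_optval
 *   anything else            - -EFAULT back to the caller
 */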
1643 
1644 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1645 				       int optname, char __user *optval,
1646 				       int __user *optlen, int max_optlen,
1647 				       int retval)
1648 {
1649 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1650 	struct bpf_sockopt_buf buf = {};
1651 	struct bpf_sockopt_kern ctx = {
1652 		.sk = sk,
1653 		.level = level,
1654 		.optname = optname,
1655 		.current_task = current,
1656 	};
1657 	int ret;
1658 
1659 	ctx.optlen = max_optlen;
1660 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1661 	if (max_optlen < 0)
1662 		return max_optlen;
1663 
1664 	if (!retval) {
1665 		/* If kernel getsockopt finished successfully,
1666 		 * copy whatever was returned to the user back
1667 		 * into our temporary buffer. Set optlen to the
1668 		 * one that kernel returned as well to let
1669 		 * BPF programs inspect the value.
1670 		 */
1671 
1672 		if (get_user(ctx.optlen, optlen)) {
1673 			ret = -EFAULT;
1674 			goto out;
1675 		}
1676 
1677 		if (ctx.optlen < 0) {
1678 			ret = -EFAULT;
1679 			goto out;
1680 		}
1681 
1682 		if (copy_from_user(ctx.optval, optval,
1683 				   min(ctx.optlen, max_optlen)) != 0) {
1684 			ret = -EFAULT;
1685 			goto out;
1686 		}
1687 	}
1688 
1689 	lock_sock(sk);
1690 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1691 				    &ctx, bpf_prog_run, retval, NULL);
1692 	release_sock(sk);
1693 
1694 	if (ret < 0)
1695 		goto out;
1696 
1697 	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
1698 		ret = -EFAULT;
1699 		goto out;
1700 	}
1701 
1702 	if (ctx.optlen != 0) {
1703 		if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1704 		    put_user(ctx.optlen, optlen)) {
1705 			ret = -EFAULT;
1706 			goto out;
1707 		}
1708 	}
1709 
1710 out:
1711 	sockopt_free_buf(&ctx, &buf);
1712 	return ret;
1713 }
1714 
1715 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1716 					    int optname, void *optval,
1717 					    int *optlen, int retval)
1718 {
1719 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1720 	struct bpf_sockopt_kern ctx = {
1721 		.sk = sk,
1722 		.level = level,
1723 		.optname = optname,
1724 		.optlen = *optlen,
1725 		.optval = optval,
1726 		.optval_end = optval + *optlen,
1727 		.current_task = current,
1728 	};
1729 	int ret;
1730 
1731 	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
1732 	 * user data back into the BPF buffer when retval != 0. This is
1733 	 * done as an optimization to avoid extra copy, assuming
1734 	 * kernel won't populate the data in case of an error.
1735 	 * Here we always pass the data and memset() should
1736 	 * be called if that data shouldn't be "exported".
1737 	 */
1738 
1739 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1740 				    &ctx, bpf_prog_run, retval, NULL);
1741 	if (ret < 0)
1742 		return ret;
1743 
1744 	if (ctx.optlen > *optlen)
1745 		return -EFAULT;
1746 
1747 	/* BPF programs can shrink the buffer, export the modifications.
1748 	 */
1749 	if (ctx.optlen != 0)
1750 		*optlen = ctx.optlen;
1751 
1752 	return ret;
1753 }
1754 #endif
1755 
1756 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1757 			      size_t *lenp)
1758 {
1759 	ssize_t tmp_ret = 0, ret;
1760 
1761 	if (dir->header.parent) {
1762 		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1763 		if (tmp_ret < 0)
1764 			return tmp_ret;
1765 	}
1766 
1767 	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1768 	if (ret < 0)
1769 		return ret;
1770 	*bufp += ret;
1771 	*lenp -= ret;
1772 	ret += tmp_ret;
1773 
1774 	/* Avoid leading slash. */
1775 	if (!ret)
1776 		return ret;
1777 
1778 	tmp_ret = strscpy(*bufp, "/", *lenp);
1779 	if (tmp_ret < 0)
1780 		return tmp_ret;
1781 	*bufp += tmp_ret;
1782 	*lenp -= tmp_ret;
1783 
1784 	return ret + tmp_ret;
1785 }
1786 
1787 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1788 	   size_t, buf_len, u64, flags)
1789 {
1790 	ssize_t tmp_ret = 0, ret;
1791 
1792 	if (!buf)
1793 		return -EINVAL;
1794 
1795 	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1796 		if (!ctx->head)
1797 			return -EINVAL;
1798 		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1799 		if (tmp_ret < 0)
1800 			return tmp_ret;
1801 	}
1802 
1803 	ret = strscpy(buf, ctx->table->procname, buf_len);
1804 
1805 	return ret < 0 ? ret : tmp_ret + ret;
1806 }
1807 
1808 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1809 	.func		= bpf_sysctl_get_name,
1810 	.gpl_only	= false,
1811 	.ret_type	= RET_INTEGER,
1812 	.arg1_type	= ARG_PTR_TO_CTX,
1813 	.arg2_type	= ARG_PTR_TO_MEM,
1814 	.arg3_type	= ARG_CONST_SIZE,
1815 	.arg4_type	= ARG_ANYTHING,
1816 };
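
/* A minimal BPF-side sketch using the helper above (hypothetical program;
 * "tcp_mem" is just an example of a base sysctl name to match against):
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_guard(struct bpf_sysctl *ctx)
 *	{
 *		char name[64];
 *
 *		if (bpf_sysctl_get_name(ctx, name, sizeof(name),
 *					BPF_F_SYSCTL_BASE_NAME) < 0)
 *			return 1;	// allow if the name can't be read
 *		// e.g. compare name against "tcp_mem" and return 0 to reject
 *		return 1;		// 1 == allow, 0 == reject with -EPERM
 *	}
 */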
1817 
1818 static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
1819 			     size_t src_len)
1820 {
1821 	if (!dst)
1822 		return -EINVAL;
1823 
1824 	if (!dst_len)
1825 		return -E2BIG;
1826 
1827 	if (!src || !src_len) {
1828 		memset(dst, 0, dst_len);
1829 		return -EINVAL;
1830 	}
1831 
1832 	memcpy(dst, src, min(dst_len, src_len));
1833 
1834 	if (dst_len > src_len) {
1835 		memset(dst + src_len, '\0', dst_len - src_len);
1836 		return src_len;
1837 	}
1838 
1839 	dst[dst_len - 1] = '\0';
1840 
1841 	return -E2BIG;
1842 }
1843 
1844 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1845 	   char *, buf, size_t, buf_len)
1846 {
1847 	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1848 }
1849 
1850 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1851 	.func		= bpf_sysctl_get_current_value,
1852 	.gpl_only	= false,
1853 	.ret_type	= RET_INTEGER,
1854 	.arg1_type	= ARG_PTR_TO_CTX,
1855 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1856 	.arg3_type	= ARG_CONST_SIZE,
1857 };
1858 
1859 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1860 	   size_t, buf_len)
1861 {
1862 	if (!ctx->write) {
1863 		if (buf && buf_len)
1864 			memset(buf, '\0', buf_len);
1865 		return -EINVAL;
1866 	}
1867 	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1868 }
1869 
1870 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1871 	.func		= bpf_sysctl_get_new_value,
1872 	.gpl_only	= false,
1873 	.ret_type	= RET_INTEGER,
1874 	.arg1_type	= ARG_PTR_TO_CTX,
1875 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1876 	.arg3_type	= ARG_CONST_SIZE,
1877 };
1878 
1879 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1880 	   const char *, buf, size_t, buf_len)
1881 {
1882 	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1883 		return -EINVAL;
1884 
1885 	if (buf_len > PAGE_SIZE - 1)
1886 		return -E2BIG;
1887 
1888 	memcpy(ctx->new_val, buf, buf_len);
1889 	ctx->new_len = buf_len;
1890 	ctx->new_updated = 1;
1891 
1892 	return 0;
1893 }
1894 
1895 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
1896 	.func		= bpf_sysctl_set_new_value,
1897 	.gpl_only	= false,
1898 	.ret_type	= RET_INTEGER,
1899 	.arg1_type	= ARG_PTR_TO_CTX,
1900 	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
1901 	.arg3_type	= ARG_CONST_SIZE,
1902 };
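/*
 * A minimal sketch of overriding a write with the helper above (SEC() and
 * helper declarations assumed from libbpf's bpf_helpers.h; the override
 * string is made up):
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_force_value(struct bpf_sysctl *ctx)
 *	{
 *		char override[] = "1\n";
 *
 *		if (!ctx->write)
 *			return 1;
 *		// Replaces what user space wrote; must fit in PAGE_SIZE - 1.
 *		if (bpf_sysctl_set_new_value(ctx, override, sizeof(override) - 1))
 *			return 0;	// could not override: reject the write
 *		return 1;
 *	}
 */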
1903 
1904 static const struct bpf_func_proto *
1905 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1906 {
1907 	switch (func_id) {
1908 	case BPF_FUNC_strtol:
1909 		return &bpf_strtol_proto;
1910 	case BPF_FUNC_strtoul:
1911 		return &bpf_strtoul_proto;
1912 	case BPF_FUNC_sysctl_get_name:
1913 		return &bpf_sysctl_get_name_proto;
1914 	case BPF_FUNC_sysctl_get_current_value:
1915 		return &bpf_sysctl_get_current_value_proto;
1916 	case BPF_FUNC_sysctl_get_new_value:
1917 		return &bpf_sysctl_get_new_value_proto;
1918 	case BPF_FUNC_sysctl_set_new_value:
1919 		return &bpf_sysctl_set_new_value_proto;
1920 	case BPF_FUNC_ktime_get_coarse_ns:
1921 		return &bpf_ktime_get_coarse_ns_proto;
1922 	default:
1923 		return cgroup_base_func_proto(func_id, prog);
1924 	}
1925 }
1926 
1927 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
1928 				   const struct bpf_prog *prog,
1929 				   struct bpf_insn_access_aux *info)
1930 {
1931 	const int size_default = sizeof(__u32);
1932 
1933 	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
1934 		return false;
1935 
1936 	switch (off) {
1937 	case bpf_ctx_range(struct bpf_sysctl, write):
1938 		if (type != BPF_READ)
1939 			return false;
1940 		bpf_ctx_record_field_size(info, size_default);
1941 		return bpf_ctx_narrow_access_ok(off, size, size_default);
1942 	case bpf_ctx_range(struct bpf_sysctl, file_pos):
1943 		if (type == BPF_READ) {
1944 			bpf_ctx_record_field_size(info, size_default);
1945 			return bpf_ctx_narrow_access_ok(off, size, size_default);
1946 		} else {
1947 			return size == size_default;
1948 		}
1949 	default:
1950 		return false;
1951 	}
1952 }
1953 
1954 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
1955 				     const struct bpf_insn *si,
1956 				     struct bpf_insn *insn_buf,
1957 				     struct bpf_prog *prog, u32 *target_size)
1958 {
1959 	struct bpf_insn *insn = insn_buf;
1960 	u32 read_size;
1961 
1962 	switch (si->off) {
1963 	case offsetof(struct bpf_sysctl, write):
1964 		*insn++ = BPF_LDX_MEM(
1965 			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
1966 			bpf_target_off(struct bpf_sysctl_kern, write,
1967 				       sizeof_field(struct bpf_sysctl_kern,
1968 						    write),
1969 				       target_size));
1970 		break;
1971 	case offsetof(struct bpf_sysctl, file_pos):
1972 		/* ppos is a pointer, so it has to be accessed via indirect
1973 		 * loads and stores. Also, for stores, an additional temporary
1974 		 * register is used, since neither src_reg nor dst_reg may be
1975 		 * clobbered.
1976 		 */
1977 		if (type == BPF_WRITE) {
1978 			int treg = BPF_REG_9;
1979 
1980 			if (si->src_reg == treg || si->dst_reg == treg)
1981 				--treg;
1982 			if (si->src_reg == treg || si->dst_reg == treg)
1983 				--treg;
1984 			*insn++ = BPF_STX_MEM(
1985 				BPF_DW, si->dst_reg, treg,
1986 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1987 			*insn++ = BPF_LDX_MEM(
1988 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1989 				treg, si->dst_reg,
1990 				offsetof(struct bpf_sysctl_kern, ppos));
1991 			*insn++ = BPF_STX_MEM(
1992 				BPF_SIZEOF(u32), treg, si->src_reg,
1993 				bpf_ctx_narrow_access_offset(
1994 					0, sizeof(u32), sizeof(loff_t)));
1995 			*insn++ = BPF_LDX_MEM(
1996 				BPF_DW, treg, si->dst_reg,
1997 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1998 		} else {
1999 			*insn++ = BPF_LDX_MEM(
2000 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
2001 				si->dst_reg, si->src_reg,
2002 				offsetof(struct bpf_sysctl_kern, ppos));
2003 			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
2004 			*insn++ = BPF_LDX_MEM(
2005 				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
2006 				bpf_ctx_narrow_access_offset(
2007 					0, read_size, sizeof(loff_t)));
2008 		}
2009 		*target_size = sizeof(u32);
2010 		break;
2011 	}
2012 
2013 	return insn - insn_buf;
2014 }
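/*
 * In effect, the file_pos rewrite above generates (ctx being the in-kernel
 * struct bpf_sysctl_kern):
 *
 *	read:	dst_reg = *(uXX *)((void *)ctx->ppos + narrow_off);
 *	write:	*(u32 *)((void *)ctx->ppos + narrow_off) = src_reg;
 *
 * narrow_off comes from bpf_ctx_narrow_access_offset() so that narrow
 * accesses hit the low-order bytes of the loff_t on big-endian hosts as
 * well.  The write path additionally spills a scratch register (treg)
 * into tmp_reg because it needs a free register to hold the ppos pointer.
 */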
2015 
2016 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
2017 	.get_func_proto		= sysctl_func_proto,
2018 	.is_valid_access	= sysctl_is_valid_access,
2019 	.convert_ctx_access	= sysctl_convert_ctx_access,
2020 };
2021 
2022 const struct bpf_prog_ops cg_sysctl_prog_ops = {
2023 };
2024 
2025 #ifdef CONFIG_NET
2026 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
2027 {
2028 	const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
2029 
2030 	return net->net_cookie;
2031 }
2032 
2033 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
2034 	.func		= bpf_get_netns_cookie_sockopt,
2035 	.gpl_only	= false,
2036 	.ret_type	= RET_INTEGER,
2037 	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
2038 };
2039 #endif
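/*
 * From a sockopt program the helper above is simply (declaration assumed
 * from bpf_helpers.h):
 *
 *	__u64 netns_cookie = bpf_get_netns_cookie(ctx);
 *
 * Passing a NULL ctx is allowed (ARG_PTR_TO_CTX_OR_NULL) and yields the
 * init_net cookie.
 */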
2040 
2041 static const struct bpf_func_proto *
2042 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2043 {
2044 	switch (func_id) {
2045 #ifdef CONFIG_NET
2046 	case BPF_FUNC_get_netns_cookie:
2047 		return &bpf_get_netns_cookie_sockopt_proto;
2048 	case BPF_FUNC_sk_storage_get:
2049 		return &bpf_sk_storage_get_proto;
2050 	case BPF_FUNC_sk_storage_delete:
2051 		return &bpf_sk_storage_delete_proto;
2052 	case BPF_FUNC_setsockopt:
2053 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2054 			return &bpf_sk_setsockopt_proto;
2055 		return NULL;
2056 	case BPF_FUNC_getsockopt:
2057 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2058 			return &bpf_sk_getsockopt_proto;
2059 		return NULL;
2060 #endif
2061 #ifdef CONFIG_INET
2062 	case BPF_FUNC_tcp_sock:
2063 		return &bpf_tcp_sock_proto;
2064 #endif
2065 	default:
2066 		return cgroup_base_func_proto(func_id, prog);
2067 	}
2068 }
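/*
 * A minimal sketch of using bpf_setsockopt() from the only attach type it
 * is offered to here, BPF_CGROUP_SETSOCKOPT (SEC(), helper declarations
 * and the SOL_SOCKET/SO_SNDBUF constants assumed from libbpf and system
 * headers; the clamp value is made up):
 *
 *	SEC("cgroup/setsockopt")
 *	int clamp_sndbuf(struct bpf_sockopt *ctx)
 *	{
 *		int val = 4096;
 *
 *		if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF)
 *			bpf_setsockopt(ctx->sk, SOL_SOCKET, SO_SNDBUF,
 *				       &val, sizeof(val));
 *		return 1;	// let the kernel handle the original request
 *	}
 */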
2069 
2070 static bool cg_sockopt_is_valid_access(int off, int size,
2071 				       enum bpf_access_type type,
2072 				       const struct bpf_prog *prog,
2073 				       struct bpf_insn_access_aux *info)
2074 {
2075 	const int size_default = sizeof(__u32);
2076 
2077 	if (off < 0 || off >= sizeof(struct bpf_sockopt))
2078 		return false;
2079 
2080 	if (off % size != 0)
2081 		return false;
2082 
2083 	if (type == BPF_WRITE) {
2084 		switch (off) {
2085 		case offsetof(struct bpf_sockopt, retval):
2086 			if (size != size_default)
2087 				return false;
2088 			return prog->expected_attach_type ==
2089 				BPF_CGROUP_GETSOCKOPT;
2090 		case offsetof(struct bpf_sockopt, optname):
2091 			fallthrough;
2092 		case offsetof(struct bpf_sockopt, level):
2093 			if (size != size_default)
2094 				return false;
2095 			return prog->expected_attach_type ==
2096 				BPF_CGROUP_SETSOCKOPT;
2097 		case offsetof(struct bpf_sockopt, optlen):
2098 			return size == size_default;
2099 		default:
2100 			return false;
2101 		}
2102 	}
2103 
2104 	switch (off) {
2105 	case offsetof(struct bpf_sockopt, sk):
2106 		if (size != sizeof(__u64))
2107 			return false;
2108 		info->reg_type = PTR_TO_SOCKET;
2109 		break;
2110 	case offsetof(struct bpf_sockopt, optval):
2111 		if (size != sizeof(__u64))
2112 			return false;
2113 		info->reg_type = PTR_TO_PACKET;
2114 		break;
2115 	case offsetof(struct bpf_sockopt, optval_end):
2116 		if (size != sizeof(__u64))
2117 			return false;
2118 		info->reg_type = PTR_TO_PACKET_END;
2119 		break;
2120 	case offsetof(struct bpf_sockopt, retval):
2121 		if (size != size_default)
2122 			return false;
2123 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2124 	default:
2125 		if (size != size_default)
2126 			return false;
2127 		break;
2128 	}
2129 	return true;
2130 }
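/*
 * Because optval/optval_end are flagged PTR_TO_PACKET/PTR_TO_PACKET_END
 * above, a program has to bounds-check them like packet pointers before
 * dereferencing.  A minimal sketch (SEC() assumed from libbpf; the check
 * on the first byte is made up):
 *
 *	SEC("cgroup/getsockopt")
 *	int peek_optval(struct bpf_sockopt *ctx)
 *	{
 *		__u8 *optval = ctx->optval;
 *		__u8 *optval_end = ctx->optval_end;
 *
 *		if (optval + 1 > optval_end)
 *			return 1;	// nothing to inspect, keep kernel result
 *		if (optval[0] == 0xff)
 *			return 0;	// reject this getsockopt() with -EPERM
 *		return 1;
 *	}
 */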
2131 
2132 #define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
2133 	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
2134 	  si->dst_reg, si->src_reg,					\
2135 	  offsetof(struct bpf_sockopt_kern, F))
2136 
2137 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
2138 					 const struct bpf_insn *si,
2139 					 struct bpf_insn *insn_buf,
2140 					 struct bpf_prog *prog,
2141 					 u32 *target_size)
2142 {
2143 	struct bpf_insn *insn = insn_buf;
2144 
2145 	switch (si->off) {
2146 	case offsetof(struct bpf_sockopt, sk):
2147 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
2148 		break;
2149 	case offsetof(struct bpf_sockopt, level):
2150 		if (type == BPF_WRITE)
2151 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
2152 		else
2153 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
2154 		break;
2155 	case offsetof(struct bpf_sockopt, optname):
2156 		if (type == BPF_WRITE)
2157 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
2158 		else
2159 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
2160 		break;
2161 	case offsetof(struct bpf_sockopt, optlen):
2162 		if (type == BPF_WRITE)
2163 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
2164 		else
2165 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
2166 		break;
2167 	case offsetof(struct bpf_sockopt, retval):
2168 		BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
2169 
2170 		if (type == BPF_WRITE) {
2171 			int treg = BPF_REG_9;
2172 
2173 			if (si->src_reg == treg || si->dst_reg == treg)
2174 				--treg;
2175 			if (si->src_reg == treg || si->dst_reg == treg)
2176 				--treg;
2177 			*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
2178 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2179 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2180 					      treg, si->dst_reg,
2181 					      offsetof(struct bpf_sockopt_kern, current_task));
2182 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2183 					      treg, treg,
2184 					      offsetof(struct task_struct, bpf_ctx));
2185 			*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2186 					      treg, si->src_reg,
2187 					      offsetof(struct bpf_cg_run_ctx, retval));
2188 			*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
2189 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2190 		} else {
2191 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2192 					      si->dst_reg, si->src_reg,
2193 					      offsetof(struct bpf_sockopt_kern, current_task));
2194 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2195 					      si->dst_reg, si->dst_reg,
2196 					      offsetof(struct task_struct, bpf_ctx));
2197 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2198 					      si->dst_reg, si->dst_reg,
2199 					      offsetof(struct bpf_cg_run_ctx, retval));
2200 		}
2201 		break;
2202 	case offsetof(struct bpf_sockopt, optval):
2203 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
2204 		break;
2205 	case offsetof(struct bpf_sockopt, optval_end):
2206 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
2207 		break;
2208 	}
2209 
2210 	return insn - insn_buf;
2211 }
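/*
 * The retval rewrite above relies on the BUILD_BUG_ON(): run_ctx sits at
 * offset 0 in struct bpf_cg_run_ctx, so the struct bpf_run_ctx pointer in
 * current->bpf_ctx doubles as a struct bpf_cg_run_ctx pointer.  The
 * generated code is roughly:
 *
 *	read:	dst_reg = ((struct bpf_cg_run_ctx *)
 *			   ctx->current_task->bpf_ctx)->retval;
 *	write:	((struct bpf_cg_run_ctx *)
 *		 ctx->current_task->bpf_ctx)->retval = src_reg;
 *
 * with the write path again borrowing a scratch register through tmp_reg
 * so that neither src_reg nor dst_reg gets clobbered.
 */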
2212 
2213 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
2214 				   bool direct_write,
2215 				   const struct bpf_prog *prog)
2216 {
2217 	/* Nothing to do for the sockopt argument: the data is kzalloc()'ed.
2218 	 */
2219 	return 0;
2220 }
2221 
2222 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2223 	.get_func_proto		= cg_sockopt_func_proto,
2224 	.is_valid_access	= cg_sockopt_is_valid_access,
2225 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
2226 	.gen_prologue		= cg_sockopt_get_prologue,
2227 };
2228 
2229 const struct bpf_prog_ops cg_sockopt_prog_ops = {
2230 };
2231