xref: /openbmc/linux/kernel/bpf/cgroup.c (revision 055eb955)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Functions to manage eBPF programs attached to cgroups
4  *
5  * Copyright (c) 2016 Daniel Mack
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <net/sock.h>
18 #include <net/bpf_sk_storage.h>
19 
20 #include "../cgroup/cgroup-internal.h"
21 
22 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
23 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 
25 /* __always_inline is necessary to prevent indirect call through run_prog
26  * function pointer.
27  */
28 static __always_inline int
29 bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
30 			    enum cgroup_bpf_attach_type atype,
31 			    const void *ctx, bpf_prog_run_fn run_prog,
32 			    int retval, u32 *ret_flags)
33 {
34 	const struct bpf_prog_array_item *item;
35 	const struct bpf_prog *prog;
36 	const struct bpf_prog_array *array;
37 	struct bpf_run_ctx *old_run_ctx;
38 	struct bpf_cg_run_ctx run_ctx;
39 	u32 func_ret;
40 
41 	run_ctx.retval = retval;
42 	migrate_disable();
43 	rcu_read_lock();
44 	array = rcu_dereference(cgrp->effective[atype]);
45 	item = &array->items[0];
46 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
47 	while ((prog = READ_ONCE(item->prog))) {
48 		run_ctx.prog_item = item;
49 		func_ret = run_prog(prog, ctx);
50 		if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
51 			run_ctx.retval = -EPERM;
52 		*(ret_flags) |= (func_ret >> 1);
53 		item++;
54 	}
55 	bpf_reset_run_ctx(old_run_ctx);
56 	rcu_read_unlock();
57 	migrate_enable();
58 	return run_ctx.retval;
59 }
60 
61 static __always_inline int
62 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
63 		      enum cgroup_bpf_attach_type atype,
64 		      const void *ctx, bpf_prog_run_fn run_prog,
65 		      int retval)
66 {
67 	const struct bpf_prog_array_item *item;
68 	const struct bpf_prog *prog;
69 	const struct bpf_prog_array *array;
70 	struct bpf_run_ctx *old_run_ctx;
71 	struct bpf_cg_run_ctx run_ctx;
72 
73 	run_ctx.retval = retval;
74 	migrate_disable();
75 	rcu_read_lock();
76 	array = rcu_dereference(cgrp->effective[atype]);
77 	item = &array->items[0];
78 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
79 	while ((prog = READ_ONCE(item->prog))) {
80 		run_ctx.prog_item = item;
81 		if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
82 			run_ctx.retval = -EPERM;
83 		item++;
84 	}
85 	bpf_reset_run_ctx(old_run_ctx);
86 	rcu_read_unlock();
87 	migrate_enable();
88 	return run_ctx.retval;
89 }
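/* Editorial note (not part of the upstream file): both helpers above treat
 * bit 0 of a program's return value as the allow/deny verdict. The first
 * deny turns run_ctx.retval into -EPERM unless a program already stored an
 * errno-style value there via the bpf_set_retval() helper defined later in
 * this file. The _flags variant additionally ORs the remaining return-value
 * bits into *ret_flags, which is how the egress path below picks up
 * BPF_RET_SET_CN.
 */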
90 
91 void cgroup_bpf_offline(struct cgroup *cgrp)
92 {
93 	cgroup_get(cgrp);
94 	percpu_ref_kill(&cgrp->bpf.refcnt);
95 }
96 
97 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
98 {
99 	enum bpf_cgroup_storage_type stype;
100 
101 	for_each_cgroup_storage_type(stype)
102 		bpf_cgroup_storage_free(storages[stype]);
103 }
104 
105 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
106 				     struct bpf_cgroup_storage *new_storages[],
107 				     enum bpf_attach_type type,
108 				     struct bpf_prog *prog,
109 				     struct cgroup *cgrp)
110 {
111 	enum bpf_cgroup_storage_type stype;
112 	struct bpf_cgroup_storage_key key;
113 	struct bpf_map *map;
114 
115 	key.cgroup_inode_id = cgroup_id(cgrp);
116 	key.attach_type = type;
117 
118 	for_each_cgroup_storage_type(stype) {
119 		map = prog->aux->cgroup_storage[stype];
120 		if (!map)
121 			continue;
122 
123 		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
124 		if (storages[stype])
125 			continue;
126 
127 		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
128 		if (IS_ERR(storages[stype])) {
129 			bpf_cgroup_storages_free(new_storages);
130 			return -ENOMEM;
131 		}
132 
133 		new_storages[stype] = storages[stype];
134 	}
135 
136 	return 0;
137 }
138 
139 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
140 				       struct bpf_cgroup_storage *src[])
141 {
142 	enum bpf_cgroup_storage_type stype;
143 
144 	for_each_cgroup_storage_type(stype)
145 		dst[stype] = src[stype];
146 }
147 
148 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
149 				     struct cgroup *cgrp,
150 				     enum bpf_attach_type attach_type)
151 {
152 	enum bpf_cgroup_storage_type stype;
153 
154 	for_each_cgroup_storage_type(stype)
155 		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
156 }
157 
158 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
159  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
160  * doesn't free link memory, which will eventually be done by bpf_link's
161  * release() callback, when its last FD is closed.
162  */
163 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
164 {
165 	cgroup_put(link->cgroup);
166 	link->cgroup = NULL;
167 }
168 
169 /**
170  * cgroup_bpf_release() - put references of all bpf programs and
171  *                        release all cgroup bpf data
172  * @work: work structure embedded into the cgroup to modify
173  */
174 static void cgroup_bpf_release(struct work_struct *work)
175 {
176 	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
177 					       bpf.release_work);
178 	struct bpf_prog_array *old_array;
179 	struct list_head *storages = &cgrp->bpf.storages;
180 	struct bpf_cgroup_storage *storage, *stmp;
181 
182 	unsigned int atype;
183 
184 	mutex_lock(&cgroup_mutex);
185 
186 	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
187 		struct list_head *progs = &cgrp->bpf.progs[atype];
188 		struct bpf_prog_list *pl, *pltmp;
189 
190 		list_for_each_entry_safe(pl, pltmp, progs, node) {
191 			list_del(&pl->node);
192 			if (pl->prog)
193 				bpf_prog_put(pl->prog);
194 			if (pl->link)
195 				bpf_cgroup_link_auto_detach(pl->link);
196 			kfree(pl);
197 			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
198 		}
199 		old_array = rcu_dereference_protected(
200 				cgrp->bpf.effective[atype],
201 				lockdep_is_held(&cgroup_mutex));
202 		bpf_prog_array_free(old_array);
203 	}
204 
205 	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
206 		bpf_cgroup_storage_unlink(storage);
207 		bpf_cgroup_storage_free(storage);
208 	}
209 
210 	mutex_unlock(&cgroup_mutex);
211 
212 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
213 		cgroup_bpf_put(p);
214 
215 	percpu_ref_exit(&cgrp->bpf.refcnt);
216 	cgroup_put(cgrp);
217 }
218 
219 /**
220  * cgroup_bpf_release_fn() - callback used to schedule releasing
221  *                           of bpf cgroup data
222  * @ref: percpu ref counter structure
223  */
224 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
225 {
226 	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
227 
228 	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
229 	queue_work(system_wq, &cgrp->bpf.release_work);
230 }
231 
232 /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
233  * link or direct prog.
234  */
235 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
236 {
237 	if (pl->prog)
238 		return pl->prog;
239 	if (pl->link)
240 		return pl->link->link.prog;
241 	return NULL;
242 }
243 
244 /* count number of elements in the list.
245  * it's slow but the list cannot be long
246  */
247 static u32 prog_list_length(struct list_head *head)
248 {
249 	struct bpf_prog_list *pl;
250 	u32 cnt = 0;
251 
252 	list_for_each_entry(pl, head, node) {
253 		if (!prog_list_prog(pl))
254 			continue;
255 		cnt++;
256 	}
257 	return cnt;
258 }
259 
260 /* if parent has non-overridable prog attached,
261  * disallow attaching new programs to the descendant cgroup.
262  * if parent has overridable or multi-prog, allow attaching
263  */
264 static bool hierarchy_allows_attach(struct cgroup *cgrp,
265 				    enum cgroup_bpf_attach_type atype)
266 {
267 	struct cgroup *p;
268 
269 	p = cgroup_parent(cgrp);
270 	if (!p)
271 		return true;
272 	do {
273 		u32 flags = p->bpf.flags[atype];
274 		u32 cnt;
275 
276 		if (flags & BPF_F_ALLOW_MULTI)
277 			return true;
278 		cnt = prog_list_length(&p->bpf.progs[atype]);
279 		WARN_ON_ONCE(cnt > 1);
280 		if (cnt == 1)
281 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
282 		p = cgroup_parent(p);
283 	} while (p);
284 	return true;
285 }
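/* Illustrative example (editorial addition, not in the upstream file):
 * given a hierarchy root -> A, the rules above work out as follows:
 *
 *   root has one prog attached with no flags (non-overridable):
 *     attaching to A fails with -EPERM;
 *   root has one prog attached with BPF_F_ALLOW_OVERRIDE:
 *     attaching to A succeeds, and A's prog overrides root's for A's subtree;
 *   root attached with BPF_F_ALLOW_MULTI:
 *     attaching to A succeeds, and both root's and A's progs stay effective.
 */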
286 
287 /* compute a chain of effective programs for a given cgroup:
288  * start from the list of programs in this cgroup and add
289  * all parent programs.
290  * Note that a parent's F_ALLOW_OVERRIDE-type program yields to
291  * programs in this cgroup
292  */
293 static int compute_effective_progs(struct cgroup *cgrp,
294 				   enum cgroup_bpf_attach_type atype,
295 				   struct bpf_prog_array **array)
296 {
297 	struct bpf_prog_array_item *item;
298 	struct bpf_prog_array *progs;
299 	struct bpf_prog_list *pl;
300 	struct cgroup *p = cgrp;
301 	int cnt = 0;
302 
303 	/* count number of effective programs by walking parents */
304 	do {
305 		if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
306 			cnt += prog_list_length(&p->bpf.progs[atype]);
307 		p = cgroup_parent(p);
308 	} while (p);
309 
310 	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
311 	if (!progs)
312 		return -ENOMEM;
313 
314 	/* populate the array with effective progs */
315 	cnt = 0;
316 	p = cgrp;
317 	do {
318 		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
319 			continue;
320 
321 		list_for_each_entry(pl, &p->bpf.progs[atype], node) {
322 			if (!prog_list_prog(pl))
323 				continue;
324 
325 			item = &progs->items[cnt];
326 			item->prog = prog_list_prog(pl);
327 			bpf_cgroup_storages_assign(item->cgroup_storage,
328 						   pl->storage);
329 			cnt++;
330 		}
331 	} while ((p = cgroup_parent(p)));
332 
333 	*array = progs;
334 	return 0;
335 }
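/* Illustrative example (editorial addition): for a hierarchy root -> A -> B
 * where every level uses BPF_F_ALLOW_MULTI, the loop above fills B's
 * effective array as
 *
 *   items[] = { B's progs (in attach order), A's progs, root's progs }
 *
 * so bpf_prog_run_array_cg() executes the programs of the cgroup closest to
 * the socket/task first and root's programs last. A level without
 * BPF_F_ALLOW_MULTI only contributes while cnt == 0, i.e. it is overridden
 * by any descendant that has programs of its own.
 */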
336 
337 static void activate_effective_progs(struct cgroup *cgrp,
338 				     enum cgroup_bpf_attach_type atype,
339 				     struct bpf_prog_array *old_array)
340 {
341 	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
342 					lockdep_is_held(&cgroup_mutex));
343 	/* free prog array after grace period, since __cgroup_bpf_run_*()
344 	 * might be still walking the array
345 	 */
346 	bpf_prog_array_free(old_array);
347 }
348 
349 /**
350  * cgroup_bpf_inherit() - inherit effective programs from parent
351  * @cgrp: the cgroup to modify
352  */
353 int cgroup_bpf_inherit(struct cgroup *cgrp)
354 {
355 /* has to use macro instead of const int, since compiler thinks
356  * that array below is variable length
357  */
358 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
359 	struct bpf_prog_array *arrays[NR] = {};
360 	struct cgroup *p;
361 	int ret, i;
362 
363 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
364 			      GFP_KERNEL);
365 	if (ret)
366 		return ret;
367 
368 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
369 		cgroup_bpf_get(p);
370 
371 	for (i = 0; i < NR; i++)
372 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
373 
374 	INIT_LIST_HEAD(&cgrp->bpf.storages);
375 
376 	for (i = 0; i < NR; i++)
377 		if (compute_effective_progs(cgrp, i, &arrays[i]))
378 			goto cleanup;
379 
380 	for (i = 0; i < NR; i++)
381 		activate_effective_progs(cgrp, i, arrays[i]);
382 
383 	return 0;
384 cleanup:
385 	for (i = 0; i < NR; i++)
386 		bpf_prog_array_free(arrays[i]);
387 
388 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
389 		cgroup_bpf_put(p);
390 
391 	percpu_ref_exit(&cgrp->bpf.refcnt);
392 
393 	return -ENOMEM;
394 }
395 
396 static int update_effective_progs(struct cgroup *cgrp,
397 				  enum cgroup_bpf_attach_type atype)
398 {
399 	struct cgroup_subsys_state *css;
400 	int err;
401 
402 	/* allocate and recompute effective prog arrays */
403 	css_for_each_descendant_pre(css, &cgrp->self) {
404 		struct cgroup *desc = container_of(css, struct cgroup, self);
405 
406 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
407 			continue;
408 
409 		err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
410 		if (err)
411 			goto cleanup;
412 	}
413 
414 	/* all allocations were successful. Activate all prog arrays */
415 	css_for_each_descendant_pre(css, &cgrp->self) {
416 		struct cgroup *desc = container_of(css, struct cgroup, self);
417 
418 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
419 			if (unlikely(desc->bpf.inactive)) {
420 				bpf_prog_array_free(desc->bpf.inactive);
421 				desc->bpf.inactive = NULL;
422 			}
423 			continue;
424 		}
425 
426 		activate_effective_progs(desc, atype, desc->bpf.inactive);
427 		desc->bpf.inactive = NULL;
428 	}
429 
430 	return 0;
431 
432 cleanup:
433 	/* oom while computing effective. Free all computed effective arrays
434 	 * since they were not activated
435 	 */
436 	css_for_each_descendant_pre(css, &cgrp->self) {
437 		struct cgroup *desc = container_of(css, struct cgroup, self);
438 
439 		bpf_prog_array_free(desc->bpf.inactive);
440 		desc->bpf.inactive = NULL;
441 	}
442 
443 	return err;
444 }
445 
446 #define BPF_CGROUP_MAX_PROGS 64
447 
448 static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
449 					       struct bpf_prog *prog,
450 					       struct bpf_cgroup_link *link,
451 					       struct bpf_prog *replace_prog,
452 					       bool allow_multi)
453 {
454 	struct bpf_prog_list *pl;
455 
456 	/* single-attach case */
457 	if (!allow_multi) {
458 		if (list_empty(progs))
459 			return NULL;
460 		return list_first_entry(progs, typeof(*pl), node);
461 	}
462 
463 	list_for_each_entry(pl, progs, node) {
464 		if (prog && pl->prog == prog && prog != replace_prog)
465 			/* disallow attaching the same prog twice */
466 			return ERR_PTR(-EINVAL);
467 		if (link && pl->link == link)
468 			/* disallow attaching the same link twice */
469 			return ERR_PTR(-EINVAL);
470 	}
471 
472 	/* direct prog multi-attach w/ replacement case */
473 	if (replace_prog) {
474 		list_for_each_entry(pl, progs, node) {
475 			if (pl->prog == replace_prog)
476 				/* a match found */
477 				return pl;
478 		}
479 		/* prog to replace not found for cgroup */
480 		return ERR_PTR(-ENOENT);
481 	}
482 
483 	return NULL;
484 }
485 
486 /**
487  * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
488  *                         propagate the change to descendants
489  * @cgrp: The cgroup which descendants to traverse
490  * @prog: A program to attach
491  * @link: A link to attach
492  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
493  * @type: Type of attach operation
494  * @flags: Option flags
495  *
496  * Exactly one of @prog or @link can be non-null.
497  * Must be called with cgroup_mutex held.
498  */
499 static int __cgroup_bpf_attach(struct cgroup *cgrp,
500 			       struct bpf_prog *prog, struct bpf_prog *replace_prog,
501 			       struct bpf_cgroup_link *link,
502 			       enum bpf_attach_type type, u32 flags)
503 {
504 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
505 	struct bpf_prog *old_prog = NULL;
506 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
507 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
508 	enum cgroup_bpf_attach_type atype;
509 	struct bpf_prog_list *pl;
510 	struct list_head *progs;
511 	int err;
512 
513 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
514 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
515 		/* invalid combination */
516 		return -EINVAL;
517 	if (link && (prog || replace_prog))
518 		/* only either link or prog/replace_prog can be specified */
519 		return -EINVAL;
520 	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
521 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
522 		return -EINVAL;
523 
524 	atype = to_cgroup_bpf_attach_type(type);
525 	if (atype < 0)
526 		return -EINVAL;
527 
528 	progs = &cgrp->bpf.progs[atype];
529 
530 	if (!hierarchy_allows_attach(cgrp, atype))
531 		return -EPERM;
532 
533 	if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
534 		/* Disallow attaching non-overridable on top
535 		 * of existing overridable in this cgroup.
536 		 * Disallow attaching multi-prog if overridable or none
537 		 */
538 		return -EPERM;
539 
540 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
541 		return -E2BIG;
542 
543 	pl = find_attach_entry(progs, prog, link, replace_prog,
544 			       flags & BPF_F_ALLOW_MULTI);
545 	if (IS_ERR(pl))
546 		return PTR_ERR(pl);
547 
548 	if (bpf_cgroup_storages_alloc(storage, new_storage, type,
549 				      prog ? : link->link.prog, cgrp))
550 		return -ENOMEM;
551 
552 	if (pl) {
553 		old_prog = pl->prog;
554 	} else {
555 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
556 		if (!pl) {
557 			bpf_cgroup_storages_free(new_storage);
558 			return -ENOMEM;
559 		}
560 		list_add_tail(&pl->node, progs);
561 	}
562 
563 	pl->prog = prog;
564 	pl->link = link;
565 	bpf_cgroup_storages_assign(pl->storage, storage);
566 	cgrp->bpf.flags[atype] = saved_flags;
567 
568 	err = update_effective_progs(cgrp, atype);
569 	if (err)
570 		goto cleanup;
571 
572 	if (old_prog)
573 		bpf_prog_put(old_prog);
574 	else
575 		static_branch_inc(&cgroup_bpf_enabled_key[atype]);
576 	bpf_cgroup_storages_link(new_storage, cgrp, type);
577 	return 0;
578 
579 cleanup:
580 	if (old_prog) {
581 		pl->prog = old_prog;
582 		pl->link = NULL;
583 	}
584 	bpf_cgroup_storages_free(new_storage);
585 	if (!old_prog) {
586 		list_del(&pl->node);
587 		kfree(pl);
588 	}
589 	return err;
590 }
591 
592 static int cgroup_bpf_attach(struct cgroup *cgrp,
593 			     struct bpf_prog *prog, struct bpf_prog *replace_prog,
594 			     struct bpf_cgroup_link *link,
595 			     enum bpf_attach_type type,
596 			     u32 flags)
597 {
598 	int ret;
599 
600 	mutex_lock(&cgroup_mutex);
601 	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
602 	mutex_unlock(&cgroup_mutex);
603 	return ret;
604 }
605 
606 /* Swap updated BPF program for given link in effective program arrays across
607  * all descendant cgroups. This function is guaranteed to succeed.
608  */
609 static void replace_effective_prog(struct cgroup *cgrp,
610 				   enum cgroup_bpf_attach_type atype,
611 				   struct bpf_cgroup_link *link)
612 {
613 	struct bpf_prog_array_item *item;
614 	struct cgroup_subsys_state *css;
615 	struct bpf_prog_array *progs;
616 	struct bpf_prog_list *pl;
617 	struct list_head *head;
618 	struct cgroup *cg;
619 	int pos;
620 
621 	css_for_each_descendant_pre(css, &cgrp->self) {
622 		struct cgroup *desc = container_of(css, struct cgroup, self);
623 
624 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
625 			continue;
626 
627 		/* find position of link in effective progs array */
628 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
629 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
630 				continue;
631 
632 			head = &cg->bpf.progs[atype];
633 			list_for_each_entry(pl, head, node) {
634 				if (!prog_list_prog(pl))
635 					continue;
636 				if (pl->link == link)
637 					goto found;
638 				pos++;
639 			}
640 		}
641 found:
642 		BUG_ON(!cg);
643 		progs = rcu_dereference_protected(
644 				desc->bpf.effective[atype],
645 				lockdep_is_held(&cgroup_mutex));
646 		item = &progs->items[pos];
647 		WRITE_ONCE(item->prog, link->link.prog);
648 	}
649 }
650 
651 /**
652  * __cgroup_bpf_replace() - Replace link's program and propagate the change
653  *                          to descendants
654  * @cgrp: The cgroup which descendants to traverse
655  * @link: A link for which to replace BPF program
656  * @new_prog: Updated BPF program to swap in for @link
657  *
658  * Must be called with cgroup_mutex held.
659  */
660 static int __cgroup_bpf_replace(struct cgroup *cgrp,
661 				struct bpf_cgroup_link *link,
662 				struct bpf_prog *new_prog)
663 {
664 	enum cgroup_bpf_attach_type atype;
665 	struct bpf_prog *old_prog;
666 	struct bpf_prog_list *pl;
667 	struct list_head *progs;
668 	bool found = false;
669 
670 	atype = to_cgroup_bpf_attach_type(link->type);
671 	if (atype < 0)
672 		return -EINVAL;
673 
674 	progs = &cgrp->bpf.progs[atype];
675 
676 	if (link->link.prog->type != new_prog->type)
677 		return -EINVAL;
678 
679 	list_for_each_entry(pl, progs, node) {
680 		if (pl->link == link) {
681 			found = true;
682 			break;
683 		}
684 	}
685 	if (!found)
686 		return -ENOENT;
687 
688 	old_prog = xchg(&link->link.prog, new_prog);
689 	replace_effective_prog(cgrp, atype, link);
690 	bpf_prog_put(old_prog);
691 	return 0;
692 }
693 
694 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
695 			      struct bpf_prog *old_prog)
696 {
697 	struct bpf_cgroup_link *cg_link;
698 	int ret;
699 
700 	cg_link = container_of(link, struct bpf_cgroup_link, link);
701 
702 	mutex_lock(&cgroup_mutex);
703 	/* link might have been auto-released by dying cgroup, so fail */
704 	if (!cg_link->cgroup) {
705 		ret = -ENOLINK;
706 		goto out_unlock;
707 	}
708 	if (old_prog && link->prog != old_prog) {
709 		ret = -EPERM;
710 		goto out_unlock;
711 	}
712 	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
713 out_unlock:
714 	mutex_unlock(&cgroup_mutex);
715 	return ret;
716 }
717 
718 static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
719 					       struct bpf_prog *prog,
720 					       struct bpf_cgroup_link *link,
721 					       bool allow_multi)
722 {
723 	struct bpf_prog_list *pl;
724 
725 	if (!allow_multi) {
726 		if (list_empty(progs))
727 			/* report error when trying to detach and nothing is attached */
728 			return ERR_PTR(-ENOENT);
729 
730 		/* to maintain backward compatibility NONE and OVERRIDE cgroups
731 		 * allow detaching with invalid FD (prog==NULL) in legacy mode
732 		 */
733 		return list_first_entry(progs, typeof(*pl), node);
734 	}
735 
736 	if (!prog && !link)
737 		/* to detach MULTI prog the user has to specify valid FD
738 		 * of the program or link to be detached
739 		 */
740 		return ERR_PTR(-EINVAL);
741 
742 	/* find the prog or link and detach it */
743 	list_for_each_entry(pl, progs, node) {
744 		if (pl->prog == prog && pl->link == link)
745 			return pl;
746 	}
747 	return ERR_PTR(-ENOENT);
748 }
749 
750 /**
751  * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
752  *                         propagate the change to descendants
753  * @cgrp: The cgroup which descendants to traverse
754  * @prog: A program to detach or NULL
755  * @link: A link to detach or NULL
756  * @type: Type of detach operation
757  *
758  * At most one of @prog or @link can be non-NULL.
759  * Must be called with cgroup_mutex held.
760  */
761 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
762 			       struct bpf_cgroup_link *link, enum bpf_attach_type type)
763 {
764 	enum cgroup_bpf_attach_type atype;
765 	struct bpf_prog *old_prog;
766 	struct bpf_prog_list *pl;
767 	struct list_head *progs;
768 	u32 flags;
769 	int err;
770 
771 	atype = to_cgroup_bpf_attach_type(type);
772 	if (atype < 0)
773 		return -EINVAL;
774 
775 	progs = &cgrp->bpf.progs[atype];
776 	flags = cgrp->bpf.flags[atype];
777 
778 	if (prog && link)
779 		/* only one of prog or link can be specified */
780 		return -EINVAL;
781 
782 	pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
783 	if (IS_ERR(pl))
784 		return PTR_ERR(pl);
785 
786 	/* mark it deleted, so it's ignored while recomputing effective */
787 	old_prog = pl->prog;
788 	pl->prog = NULL;
789 	pl->link = NULL;
790 
791 	err = update_effective_progs(cgrp, atype);
792 	if (err)
793 		goto cleanup;
794 
795 	/* now can actually delete it from this cgroup list */
796 	list_del(&pl->node);
797 	kfree(pl);
798 	if (list_empty(progs))
799 		/* last program was detached, reset flags to zero */
800 		cgrp->bpf.flags[atype] = 0;
801 	if (old_prog)
802 		bpf_prog_put(old_prog);
803 	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
804 	return 0;
805 
806 cleanup:
807 	/* restore back prog or link */
808 	pl->prog = old_prog;
809 	pl->link = link;
810 	return err;
811 }
812 
813 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
814 			     enum bpf_attach_type type)
815 {
816 	int ret;
817 
818 	mutex_lock(&cgroup_mutex);
819 	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
820 	mutex_unlock(&cgroup_mutex);
821 	return ret;
822 }
823 
824 /* Must be called with cgroup_mutex held to avoid races. */
825 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
826 			      union bpf_attr __user *uattr)
827 {
828 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
829 	enum bpf_attach_type type = attr->query.attach_type;
830 	enum cgroup_bpf_attach_type atype;
831 	struct bpf_prog_array *effective;
832 	struct list_head *progs;
833 	struct bpf_prog *prog;
834 	int cnt, ret = 0, i;
835 	u32 flags;
836 
837 	atype = to_cgroup_bpf_attach_type(type);
838 	if (atype < 0)
839 		return -EINVAL;
840 
841 	progs = &cgrp->bpf.progs[atype];
842 	flags = cgrp->bpf.flags[atype];
843 
844 	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
845 					      lockdep_is_held(&cgroup_mutex));
846 
847 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
848 		cnt = bpf_prog_array_length(effective);
849 	else
850 		cnt = prog_list_length(progs);
851 
852 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
853 		return -EFAULT;
854 	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
855 		return -EFAULT;
856 	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
857 		/* return early if user requested only program count + flags */
858 		return 0;
859 	if (attr->query.prog_cnt < cnt) {
860 		cnt = attr->query.prog_cnt;
861 		ret = -ENOSPC;
862 	}
863 
864 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
865 		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
866 	} else {
867 		struct bpf_prog_list *pl;
868 		u32 id;
869 
870 		i = 0;
871 		list_for_each_entry(pl, progs, node) {
872 			prog = prog_list_prog(pl);
873 			id = prog->aux->id;
874 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
875 				return -EFAULT;
876 			if (++i == cnt)
877 				break;
878 		}
879 	}
880 	return ret;
881 }
882 
883 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
884 			    union bpf_attr __user *uattr)
885 {
886 	int ret;
887 
888 	mutex_lock(&cgroup_mutex);
889 	ret = __cgroup_bpf_query(cgrp, attr, uattr);
890 	mutex_unlock(&cgroup_mutex);
891 	return ret;
892 }
893 
894 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
895 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
896 {
897 	struct bpf_prog *replace_prog = NULL;
898 	struct cgroup *cgrp;
899 	int ret;
900 
901 	cgrp = cgroup_get_from_fd(attr->target_fd);
902 	if (IS_ERR(cgrp))
903 		return PTR_ERR(cgrp);
904 
905 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
906 	    (attr->attach_flags & BPF_F_REPLACE)) {
907 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
908 		if (IS_ERR(replace_prog)) {
909 			cgroup_put(cgrp);
910 			return PTR_ERR(replace_prog);
911 		}
912 	}
913 
914 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
915 				attr->attach_type, attr->attach_flags);
916 
917 	if (replace_prog)
918 		bpf_prog_put(replace_prog);
919 	cgroup_put(cgrp);
920 	return ret;
921 }
922 
923 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
924 {
925 	struct bpf_prog *prog;
926 	struct cgroup *cgrp;
927 	int ret;
928 
929 	cgrp = cgroup_get_from_fd(attr->target_fd);
930 	if (IS_ERR(cgrp))
931 		return PTR_ERR(cgrp);
932 
933 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
934 	if (IS_ERR(prog))
935 		prog = NULL;
936 
937 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
938 	if (prog)
939 		bpf_prog_put(prog);
940 
941 	cgroup_put(cgrp);
942 	return ret;
943 }
944 
945 static void bpf_cgroup_link_release(struct bpf_link *link)
946 {
947 	struct bpf_cgroup_link *cg_link =
948 		container_of(link, struct bpf_cgroup_link, link);
949 	struct cgroup *cg;
950 
951 	/* link might have been auto-detached by dying cgroup already,
952 	 * in that case our work is done here
953 	 */
954 	if (!cg_link->cgroup)
955 		return;
956 
957 	mutex_lock(&cgroup_mutex);
958 
959 	/* re-check cgroup under lock again */
960 	if (!cg_link->cgroup) {
961 		mutex_unlock(&cgroup_mutex);
962 		return;
963 	}
964 
965 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
966 				    cg_link->type));
967 
968 	cg = cg_link->cgroup;
969 	cg_link->cgroup = NULL;
970 
971 	mutex_unlock(&cgroup_mutex);
972 
973 	cgroup_put(cg);
974 }
975 
976 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
977 {
978 	struct bpf_cgroup_link *cg_link =
979 		container_of(link, struct bpf_cgroup_link, link);
980 
981 	kfree(cg_link);
982 }
983 
984 static int bpf_cgroup_link_detach(struct bpf_link *link)
985 {
986 	bpf_cgroup_link_release(link);
987 
988 	return 0;
989 }
990 
991 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
992 					struct seq_file *seq)
993 {
994 	struct bpf_cgroup_link *cg_link =
995 		container_of(link, struct bpf_cgroup_link, link);
996 	u64 cg_id = 0;
997 
998 	mutex_lock(&cgroup_mutex);
999 	if (cg_link->cgroup)
1000 		cg_id = cgroup_id(cg_link->cgroup);
1001 	mutex_unlock(&cgroup_mutex);
1002 
1003 	seq_printf(seq,
1004 		   "cgroup_id:\t%llu\n"
1005 		   "attach_type:\t%d\n",
1006 		   cg_id,
1007 		   cg_link->type);
1008 }
1009 
1010 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1011 					  struct bpf_link_info *info)
1012 {
1013 	struct bpf_cgroup_link *cg_link =
1014 		container_of(link, struct bpf_cgroup_link, link);
1015 	u64 cg_id = 0;
1016 
1017 	mutex_lock(&cgroup_mutex);
1018 	if (cg_link->cgroup)
1019 		cg_id = cgroup_id(cg_link->cgroup);
1020 	mutex_unlock(&cgroup_mutex);
1021 
1022 	info->cgroup.cgroup_id = cg_id;
1023 	info->cgroup.attach_type = cg_link->type;
1024 	return 0;
1025 }
1026 
1027 static const struct bpf_link_ops bpf_cgroup_link_lops = {
1028 	.release = bpf_cgroup_link_release,
1029 	.dealloc = bpf_cgroup_link_dealloc,
1030 	.detach = bpf_cgroup_link_detach,
1031 	.update_prog = cgroup_bpf_replace,
1032 	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1033 	.fill_link_info = bpf_cgroup_link_fill_link_info,
1034 };
1035 
1036 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1037 {
1038 	struct bpf_link_primer link_primer;
1039 	struct bpf_cgroup_link *link;
1040 	struct cgroup *cgrp;
1041 	int err;
1042 
1043 	if (attr->link_create.flags)
1044 		return -EINVAL;
1045 
1046 	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1047 	if (IS_ERR(cgrp))
1048 		return PTR_ERR(cgrp);
1049 
1050 	link = kzalloc(sizeof(*link), GFP_USER);
1051 	if (!link) {
1052 		err = -ENOMEM;
1053 		goto out_put_cgroup;
1054 	}
1055 	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1056 		      prog);
1057 	link->cgroup = cgrp;
1058 	link->type = attr->link_create.attach_type;
1059 
1060 	err = bpf_link_prime(&link->link, &link_primer);
1061 	if (err) {
1062 		kfree(link);
1063 		goto out_put_cgroup;
1064 	}
1065 
1066 	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1067 				link->type, BPF_F_ALLOW_MULTI);
1068 	if (err) {
1069 		bpf_link_cleanup(&link_primer);
1070 		goto out_put_cgroup;
1071 	}
1072 
1073 	return bpf_link_settle(&link_primer);
1074 
1075 out_put_cgroup:
1076 	cgroup_put(cgrp);
1077 	return err;
1078 }
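/* Minimal user-space sketch (editorial addition; assumes libbpf >= 1.0 and
 * an object file "egress.bpf.o" containing a "cgroup_skb/egress" program
 * named "count_egress" -- names are hypothetical).
 * bpf_program__attach_cgroup() issues BPF_LINK_CREATE, which lands in
 * cgroup_bpf_link_attach() above:
 *
 *	#include <fcntl.h>
 *	#include <bpf/libbpf.h>
 *
 *	int attach_egress(const char *cgroup_path)
 *	{
 *		struct bpf_object *obj = bpf_object__open_file("egress.bpf.o", NULL);
 *		struct bpf_program *prog;
 *		struct bpf_link *link;
 *		int cg_fd;
 *
 *		if (!obj || bpf_object__load(obj))
 *			return -1;
 *		prog = bpf_object__find_program_by_name(obj, "count_egress");
 *		cg_fd = open(cgroup_path, O_RDONLY);
 *		if (!prog || cg_fd < 0)
 *			return -1;
 *		link = bpf_program__attach_cgroup(prog, cg_fd);
 *		return link ? 0 : -1;
 *	}
 */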
1079 
1080 int cgroup_bpf_prog_query(const union bpf_attr *attr,
1081 			  union bpf_attr __user *uattr)
1082 {
1083 	struct cgroup *cgrp;
1084 	int ret;
1085 
1086 	cgrp = cgroup_get_from_fd(attr->query.target_fd);
1087 	if (IS_ERR(cgrp))
1088 		return PTR_ERR(cgrp);
1089 
1090 	ret = cgroup_bpf_query(cgrp, attr, uattr);
1091 
1092 	cgroup_put(cgrp);
1093 	return ret;
1094 }
1095 
1096 /**
1097  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1098  * @sk: The socket sending or receiving traffic
1099  * @skb: The skb that is being sent or received
1100  * @atype: The type of program to be executed
1101  *
1102  * If no socket is passed, or the socket is not of type INET or INET6,
1103  * this function does nothing and returns 0.
1104  *
1105  * The program type passed in via @atype must be suitable for network
1106  * filtering. No further check is performed to assert that.
1107  *
1108  * For egress packets, this function can return:
1109  *   NET_XMIT_SUCCESS    (0)	- continue with packet output
1110  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
1111  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
1112  *				  to call cwr
1113  *   -err			- drop packet
1114  *
1115  * For ingress packets, this function will return -EPERM if any
1116  * attached program was found and if it returned != 1 during execution.
1117  * Otherwise 0 is returned.
1118  */
1119 int __cgroup_bpf_run_filter_skb(struct sock *sk,
1120 				struct sk_buff *skb,
1121 				enum cgroup_bpf_attach_type atype)
1122 {
1123 	unsigned int offset = skb->data - skb_network_header(skb);
1124 	struct sock *save_sk;
1125 	void *saved_data_end;
1126 	struct cgroup *cgrp;
1127 	int ret;
1128 
1129 	if (!sk || !sk_fullsock(sk))
1130 		return 0;
1131 
1132 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1133 		return 0;
1134 
1135 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1136 	save_sk = skb->sk;
1137 	skb->sk = sk;
1138 	__skb_push(skb, offset);
1139 
1140 	/* compute pointers for the bpf prog */
1141 	bpf_compute_and_save_data_end(skb, &saved_data_end);
1142 
1143 	if (atype == CGROUP_INET_EGRESS) {
1144 		u32 flags = 0;
1145 		bool cn;
1146 
1147 		ret = bpf_prog_run_array_cg_flags(
1148 			&cgrp->bpf, atype,
1149 			skb, __bpf_prog_run_save_cb, 0, &flags);
1150 
1151 		/* Return values of CGROUP EGRESS BPF programs are:
1152 		 *   0: drop packet
1153 		 *   1: keep packet
1154 		 *   2: drop packet and cn
1155 		 *   3: keep packet and cn
1156 		 *
1157 		 * The returned value is then converted to one of the NET_XMIT
1158 		 * or an error code that is then interpreted as drop packet
1159 		 * (and no cn):
1160 		 *   0: NET_XMIT_SUCCESS  skb should be transmitted
1161 		 *   1: NET_XMIT_DROP     skb should be dropped and cn
1162 		 *   2: NET_XMIT_CN       skb should be transmitted and cn
1163 		 *   3: -err              skb should be dropped
1164 		 */
1165 
1166 		cn = flags & BPF_RET_SET_CN;
1167 		if (ret && !IS_ERR_VALUE((long)ret))
1168 			ret = -EFAULT;
1169 		if (!ret)
1170 			ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1171 		else
1172 			ret = (cn ? NET_XMIT_DROP : ret);
1173 	} else {
1174 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1175 					    skb, __bpf_prog_run_save_cb, 0);
1176 		if (ret && !IS_ERR_VALUE((long)ret))
1177 			ret = -EFAULT;
1178 	}
1179 	bpf_restore_data_end(skb, saved_data_end);
1180 	__skb_pull(skb, offset);
1181 	skb->sk = save_sk;
1182 
1183 	return ret;
1184 }
1185 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
1186 
1187 /**
1188  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1189  * @sk: sock structure to manipulate
1190  * @atype: The type of program to be executed
1191  *
1192  * The socket passed is expected to be of type INET or INET6.
1193  *
1194  * The program type passed in via @atype must be suitable for sock
1195  * filtering. No further check is performed to assert that.
1196  *
1197  * This function will return %-EPERM if an attached program was found
1198  * and it returned != 1 during execution. In all other cases, 0 is returned.
1199  */
1200 int __cgroup_bpf_run_filter_sk(struct sock *sk,
1201 			       enum cgroup_bpf_attach_type atype)
1202 {
1203 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1204 
1205 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
1206 }
1207 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1208 
1209 /**
1210  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1211  *                                       the sockaddr provided by user
1212  * @sk: sock struct that will use sockaddr
1213  * @uaddr: sockaddr struct provided by user
1214  * @atype: The type of program to be executed
1215  * @t_ctx: Pointer to attach type specific context
1216  * @flags: Pointer to u32 which contains higher bits of BPF program
1217  *         return value (OR'ed together).
1218  *
1219  * The socket is expected to be of type INET or INET6.
1220  *
1221  * This function will return %-EPERM if an attached program is found and
1222  * returned value != 1 during execution. In all other cases, 0 is returned.
1223  */
1224 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1225 				      struct sockaddr *uaddr,
1226 				      enum cgroup_bpf_attach_type atype,
1227 				      void *t_ctx,
1228 				      u32 *flags)
1229 {
1230 	struct bpf_sock_addr_kern ctx = {
1231 		.sk = sk,
1232 		.uaddr = uaddr,
1233 		.t_ctx = t_ctx,
1234 	};
1235 	struct sockaddr_storage unspec;
1236 	struct cgroup *cgrp;
1237 
1238 	/* Check socket family since not all sockets represent a network
1239 	 * endpoint (e.g. AF_UNIX).
1240 	 */
1241 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1242 		return 0;
1243 
1244 	if (!ctx.uaddr) {
1245 		memset(&unspec, 0, sizeof(unspec));
1246 		ctx.uaddr = (struct sockaddr *)&unspec;
1247 	}
1248 
1249 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1250 	return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype,
1251 					   &ctx, bpf_prog_run, 0, flags);
1252 }
1253 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
1254 
1255 /**
1256  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1257  * @sk: socket to get cgroup from
1258  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1259  * sk with connection information (IP addresses, etc.). May not contain
1260  * cgroup info if it is a req sock.
1261  * @atype: The type of program to be executed
1262  *
1263  * The socket passed is expected to be of type INET or INET6.
1264  *
1265  * The program type passed in via @atype must be suitable for sock_ops
1266  * filtering. No further check is performed to assert that.
1267  *
1268  * This function will return %-EPERM if an attached program was found
1269  * and it returned != 1 during execution. In all other cases, 0 is returned.
1270  */
1271 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1272 				     struct bpf_sock_ops_kern *sock_ops,
1273 				     enum cgroup_bpf_attach_type atype)
1274 {
1275 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1276 
1277 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1278 				     0);
1279 }
1280 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1281 
1282 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1283 				      short access, enum cgroup_bpf_attach_type atype)
1284 {
1285 	struct cgroup *cgrp;
1286 	struct bpf_cgroup_dev_ctx ctx = {
1287 		.access_type = (access << 16) | dev_type,
1288 		.major = major,
1289 		.minor = minor,
1290 	};
1291 	int ret;
1292 
1293 	rcu_read_lock();
1294 	cgrp = task_dfl_cgroup(current);
1295 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
1296 	rcu_read_unlock();
1297 
1298 	return ret;
1299 }
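/* Illustrative BPF-side sketch (editorial addition; assumes the usual
 * vmlinux.h/bpf_helpers.h SEC() setup, program name is hypothetical).
 * access_type built above packs the access mask in the upper 16 bits and
 * the device type in the lower 16 bits; returning 0 makes
 * bpf_prog_run_array_cg() report -EPERM (or a value set via
 * bpf_set_retval()) to the caller:
 *
 *	SEC("cgroup/dev")
 *	int allow_null_only(struct bpf_cgroup_dev_ctx *ctx)
 *	{
 *		__u32 type = ctx->access_type & 0xFFFF;
 *
 *		if (type == BPF_DEVCG_DEV_CHAR && ctx->major == 1 && ctx->minor == 3)
 *			return 1;
 *		return 0;
 *	}
 */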
1300 
1301 BPF_CALL_0(bpf_get_retval)
1302 {
1303 	struct bpf_cg_run_ctx *ctx =
1304 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1305 
1306 	return ctx->retval;
1307 }
1308 
1309 static const struct bpf_func_proto bpf_get_retval_proto = {
1310 	.func		= bpf_get_retval,
1311 	.gpl_only	= false,
1312 	.ret_type	= RET_INTEGER,
1313 };
1314 
1315 BPF_CALL_1(bpf_set_retval, int, retval)
1316 {
1317 	struct bpf_cg_run_ctx *ctx =
1318 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1319 
1320 	ctx->retval = retval;
1321 	return 0;
1322 }
1323 
1324 static const struct bpf_func_proto bpf_set_retval_proto = {
1325 	.func		= bpf_set_retval,
1326 	.gpl_only	= false,
1327 	.ret_type	= RET_INTEGER,
1328 	.arg1_type	= ARG_ANYTHING,
1329 };
1330 
1331 static const struct bpf_func_proto *
1332 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1333 {
1334 	switch (func_id) {
1335 	case BPF_FUNC_get_current_uid_gid:
1336 		return &bpf_get_current_uid_gid_proto;
1337 	case BPF_FUNC_get_local_storage:
1338 		return &bpf_get_local_storage_proto;
1339 	case BPF_FUNC_get_current_cgroup_id:
1340 		return &bpf_get_current_cgroup_id_proto;
1341 	case BPF_FUNC_perf_event_output:
1342 		return &bpf_event_output_data_proto;
1343 	case BPF_FUNC_get_retval:
1344 		return &bpf_get_retval_proto;
1345 	case BPF_FUNC_set_retval:
1346 		return &bpf_set_retval_proto;
1347 	default:
1348 		return bpf_base_func_proto(func_id);
1349 	}
1350 }
1351 
1352 static const struct bpf_func_proto *
1353 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1354 {
1355 	return cgroup_base_func_proto(func_id, prog);
1356 }
1357 
1358 static bool cgroup_dev_is_valid_access(int off, int size,
1359 				       enum bpf_access_type type,
1360 				       const struct bpf_prog *prog,
1361 				       struct bpf_insn_access_aux *info)
1362 {
1363 	const int size_default = sizeof(__u32);
1364 
1365 	if (type == BPF_WRITE)
1366 		return false;
1367 
1368 	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1369 		return false;
1370 	/* The verifier guarantees that size > 0. */
1371 	if (off % size != 0)
1372 		return false;
1373 
1374 	switch (off) {
1375 	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1376 		bpf_ctx_record_field_size(info, size_default);
1377 		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1378 			return false;
1379 		break;
1380 	default:
1381 		if (size != size_default)
1382 			return false;
1383 	}
1384 
1385 	return true;
1386 }
1387 
1388 const struct bpf_prog_ops cg_dev_prog_ops = {
1389 };
1390 
1391 const struct bpf_verifier_ops cg_dev_verifier_ops = {
1392 	.get_func_proto		= cgroup_dev_func_proto,
1393 	.is_valid_access	= cgroup_dev_is_valid_access,
1394 };
1395 
1396 /**
1397  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1398  *
1399  * @head: sysctl table header
1400  * @table: sysctl table
1401  * @write: sysctl is being read (= 0) or written (= 1)
1402  * @buf: pointer to buffer (in and out)
1403  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1404  *	result is size of @new_buf if program set new value, initial value
1405  *	otherwise
1406  * @ppos: value-result argument: value is position at which read from or write
1407  *	to sysctl is happening, result is new position if program overrode it,
1408  *	initial value otherwise
1409  * @atype: type of program to be executed
1410  *
1411  * Program is run when sysctl is being accessed, either read or written, and
1412  * can allow or deny such access.
1413  *
1414  * This function will return %-EPERM if an attached program is found and
1415  * returned value != 1 during execution. In all other cases 0 is returned.
1416  */
1417 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1418 				   struct ctl_table *table, int write,
1419 				   char **buf, size_t *pcount, loff_t *ppos,
1420 				   enum cgroup_bpf_attach_type atype)
1421 {
1422 	struct bpf_sysctl_kern ctx = {
1423 		.head = head,
1424 		.table = table,
1425 		.write = write,
1426 		.ppos = ppos,
1427 		.cur_val = NULL,
1428 		.cur_len = PAGE_SIZE,
1429 		.new_val = NULL,
1430 		.new_len = 0,
1431 		.new_updated = 0,
1432 	};
1433 	struct cgroup *cgrp;
1434 	loff_t pos = 0;
1435 	int ret;
1436 
1437 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1438 	if (!ctx.cur_val ||
1439 	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1440 		/* Let BPF program decide how to proceed. */
1441 		ctx.cur_len = 0;
1442 	}
1443 
1444 	if (write && *buf && *pcount) {
1445 		/* BPF program should be able to override new value with a
1446 		 * buffer bigger than provided by user.
1447 		 */
1448 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1449 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1450 		if (ctx.new_val) {
1451 			memcpy(ctx.new_val, *buf, ctx.new_len);
1452 		} else {
1453 			/* Let BPF program decide how to proceed. */
1454 			ctx.new_len = 0;
1455 		}
1456 	}
1457 
1458 	rcu_read_lock();
1459 	cgrp = task_dfl_cgroup(current);
1460 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
1461 	rcu_read_unlock();
1462 
1463 	kfree(ctx.cur_val);
1464 
1465 	if (ret == 1 && ctx.new_updated) {
1466 		kfree(*buf);
1467 		*buf = ctx.new_val;
1468 		*pcount = ctx.new_len;
1469 	} else {
1470 		kfree(ctx.new_val);
1471 	}
1472 
1473 	return ret;
1474 }
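/* Illustrative BPF-side sketch (editorial addition, hypothetical program
 * name): a CGROUP_SYSCTL program sees struct bpf_sysctl, whose write flag
 * is validated by sysctl_is_valid_access() below, and can inspect the
 * sysctl via bpf_sysctl_get_name()/bpf_sysctl_get_current_value().
 * Returning 0 makes the function above report -EPERM:
 *
 *	SEC("cgroup/sysctl")
 *	int sysctl_read_only(struct bpf_sysctl *ctx)
 *	{
 *		return ctx->write ? 0 : 1;
 *	}
 */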
1475 
1476 #ifdef CONFIG_NET
1477 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1478 			     struct bpf_sockopt_buf *buf)
1479 {
1480 	if (unlikely(max_optlen < 0))
1481 		return -EINVAL;
1482 
1483 	if (unlikely(max_optlen > PAGE_SIZE)) {
1484 		/* We don't expose optvals that are greater than PAGE_SIZE
1485 		 * to the BPF program.
1486 		 */
1487 		max_optlen = PAGE_SIZE;
1488 	}
1489 
1490 	if (max_optlen <= sizeof(buf->data)) {
1491 		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1492 		 * bytes avoid the cost of kzalloc.
1493 		 */
1494 		ctx->optval = buf->data;
1495 		ctx->optval_end = ctx->optval + max_optlen;
1496 		return max_optlen;
1497 	}
1498 
1499 	ctx->optval = kzalloc(max_optlen, GFP_USER);
1500 	if (!ctx->optval)
1501 		return -ENOMEM;
1502 
1503 	ctx->optval_end = ctx->optval + max_optlen;
1504 
1505 	return max_optlen;
1506 }
1507 
1508 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1509 			     struct bpf_sockopt_buf *buf)
1510 {
1511 	if (ctx->optval == buf->data)
1512 		return;
1513 	kfree(ctx->optval);
1514 }
1515 
1516 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1517 				  struct bpf_sockopt_buf *buf)
1518 {
1519 	return ctx->optval != buf->data;
1520 }
1521 
1522 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
1523 				       int *optname, char __user *optval,
1524 				       int *optlen, char **kernel_optval)
1525 {
1526 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1527 	struct bpf_sockopt_buf buf = {};
1528 	struct bpf_sockopt_kern ctx = {
1529 		.sk = sk,
1530 		.level = *level,
1531 		.optname = *optname,
1532 	};
1533 	int ret, max_optlen;
1534 
1535 	/* Allocate a bit more than the initial user buffer for
1536 	 * BPF program. The canonical use case is overriding
1537 	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1538 	 */
1539 	max_optlen = max_t(int, 16, *optlen);
1540 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1541 	if (max_optlen < 0)
1542 		return max_optlen;
1543 
1544 	ctx.optlen = *optlen;
1545 
1546 	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
1547 		ret = -EFAULT;
1548 		goto out;
1549 	}
1550 
1551 	lock_sock(sk);
1552 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
1553 				    &ctx, bpf_prog_run, 0);
1554 	release_sock(sk);
1555 
1556 	if (ret)
1557 		goto out;
1558 
1559 	if (ctx.optlen == -1) {
1560 		/* optlen set to -1, bypass kernel */
1561 		ret = 1;
1562 	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1563 		/* optlen is out of bounds */
1564 		ret = -EFAULT;
1565 	} else {
1566 		/* optlen within bounds, run kernel handler */
1567 		ret = 0;
1568 
1569 		/* export any potential modifications */
1570 		*level = ctx.level;
1571 		*optname = ctx.optname;
1572 
1573 		/* optlen == 0 from BPF indicates that we should
1574 		 * use original userspace data.
1575 		 */
1576 		if (ctx.optlen != 0) {
1577 			*optlen = ctx.optlen;
1578 			/* We've used bpf_sockopt_kern->buf as an intermediary
1579 			 * storage, but the BPF program indicates that we need
1580 			 * to pass this data to the kernel setsockopt handler.
1581 			 * No way to export on-stack buf, have to allocate a
1582 			 * new buffer.
1583 			 */
1584 			if (!sockopt_buf_allocated(&ctx, &buf)) {
1585 				void *p = kmalloc(ctx.optlen, GFP_USER);
1586 
1587 				if (!p) {
1588 					ret = -ENOMEM;
1589 					goto out;
1590 				}
1591 				memcpy(p, ctx.optval, ctx.optlen);
1592 				*kernel_optval = p;
1593 			} else {
1594 				*kernel_optval = ctx.optval;
1595 			}
1596 			/* export and don't free sockopt buf */
1597 			return 0;
1598 		}
1599 	}
1600 
1601 out:
1602 	sockopt_free_buf(&ctx, &buf);
1603 	return ret;
1604 }
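/* Illustrative BPF-side sketch (editorial addition, hypothetical program
 * name): a CGROUP_SETSOCKOPT program that rejects one option and leaves the
 * rest untouched. Returning 0 surfaces -EPERM to the caller; setting
 * ctx->optlen = 0 tells the code above to keep the original user buffer,
 * which matters when the user's optlen exceeds the PAGE_SIZE copy made here:
 *
 *	SEC("cgroup/setsockopt")
 *	int block_so_mark(struct bpf_sockopt *ctx)
 *	{
 *		if (ctx->level == SOL_SOCKET && ctx->optname == SO_MARK)
 *			return 0;
 *		ctx->optlen = 0;
 *		return 1;
 *	}
 */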
1605 
1606 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1607 				       int optname, char __user *optval,
1608 				       int __user *optlen, int max_optlen,
1609 				       int retval)
1610 {
1611 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1612 	struct bpf_sockopt_buf buf = {};
1613 	struct bpf_sockopt_kern ctx = {
1614 		.sk = sk,
1615 		.level = level,
1616 		.optname = optname,
1617 		.current_task = current,
1618 	};
1619 	int ret;
1620 
1621 	ctx.optlen = max_optlen;
1622 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1623 	if (max_optlen < 0)
1624 		return max_optlen;
1625 
1626 	if (!retval) {
1627 		/* If kernel getsockopt finished successfully,
1628 		 * copy whatever was returned to the user back
1629 		 * into our temporary buffer. Set optlen to the
1630 		 * one that kernel returned as well to let
1631 		 * BPF programs inspect the value.
1632 		 */
1633 
1634 		if (get_user(ctx.optlen, optlen)) {
1635 			ret = -EFAULT;
1636 			goto out;
1637 		}
1638 
1639 		if (ctx.optlen < 0) {
1640 			ret = -EFAULT;
1641 			goto out;
1642 		}
1643 
1644 		if (copy_from_user(ctx.optval, optval,
1645 				   min(ctx.optlen, max_optlen)) != 0) {
1646 			ret = -EFAULT;
1647 			goto out;
1648 		}
1649 	}
1650 
1651 	lock_sock(sk);
1652 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1653 				    &ctx, bpf_prog_run, retval);
1654 	release_sock(sk);
1655 
1656 	if (ret < 0)
1657 		goto out;
1658 
1659 	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
1660 		ret = -EFAULT;
1661 		goto out;
1662 	}
1663 
1664 	if (ctx.optlen != 0) {
1665 		if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1666 		    put_user(ctx.optlen, optlen)) {
1667 			ret = -EFAULT;
1668 			goto out;
1669 		}
1670 	}
1671 
1672 out:
1673 	sockopt_free_buf(&ctx, &buf);
1674 	return ret;
1675 }
1676 
1677 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1678 					    int optname, void *optval,
1679 					    int *optlen, int retval)
1680 {
1681 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1682 	struct bpf_sockopt_kern ctx = {
1683 		.sk = sk,
1684 		.level = level,
1685 		.optname = optname,
1686 		.optlen = *optlen,
1687 		.optval = optval,
1688 		.optval_end = optval + *optlen,
1689 		.current_task = current,
1690 	};
1691 	int ret;
1692 
1693 	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
1694 	 * user data back into BPF buffer when retval != 0. This is
1695 	 * done as an optimization to avoid extra copy, assuming
1696 	 * kernel won't populate the data in case of an error.
1697 	 * Here we always pass the data and memset() should
1698 	 * be called if that data shouldn't be "exported".
1699 	 */
1700 
1701 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1702 				    &ctx, bpf_prog_run, retval);
1703 	if (ret < 0)
1704 		return ret;
1705 
1706 	if (ctx.optlen > *optlen)
1707 		return -EFAULT;
1708 
1709 	/* BPF programs can shrink the buffer, export the modifications.
1710 	 */
1711 	if (ctx.optlen != 0)
1712 		*optlen = ctx.optlen;
1713 
1714 	return ret;
1715 }
1716 #endif
1717 
1718 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1719 			      size_t *lenp)
1720 {
1721 	ssize_t tmp_ret = 0, ret;
1722 
1723 	if (dir->header.parent) {
1724 		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1725 		if (tmp_ret < 0)
1726 			return tmp_ret;
1727 	}
1728 
1729 	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1730 	if (ret < 0)
1731 		return ret;
1732 	*bufp += ret;
1733 	*lenp -= ret;
1734 	ret += tmp_ret;
1735 
1736 	/* Avoid leading slash. */
1737 	if (!ret)
1738 		return ret;
1739 
1740 	tmp_ret = strscpy(*bufp, "/", *lenp);
1741 	if (tmp_ret < 0)
1742 		return tmp_ret;
1743 	*bufp += tmp_ret;
1744 	*lenp -= tmp_ret;
1745 
1746 	return ret + tmp_ret;
1747 }
1748 
1749 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1750 	   size_t, buf_len, u64, flags)
1751 {
1752 	ssize_t tmp_ret = 0, ret;
1753 
1754 	if (!buf)
1755 		return -EINVAL;
1756 
1757 	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1758 		if (!ctx->head)
1759 			return -EINVAL;
1760 		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1761 		if (tmp_ret < 0)
1762 			return tmp_ret;
1763 	}
1764 
1765 	ret = strscpy(buf, ctx->table->procname, buf_len);
1766 
1767 	return ret < 0 ? ret : tmp_ret + ret;
1768 }
1769 
1770 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1771 	.func		= bpf_sysctl_get_name,
1772 	.gpl_only	= false,
1773 	.ret_type	= RET_INTEGER,
1774 	.arg1_type	= ARG_PTR_TO_CTX,
1775 	.arg2_type	= ARG_PTR_TO_MEM,
1776 	.arg3_type	= ARG_CONST_SIZE,
1777 	.arg4_type	= ARG_ANYTHING,
1778 };
1779 
1780 static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
1781 			     size_t src_len)
1782 {
1783 	if (!dst)
1784 		return -EINVAL;
1785 
1786 	if (!dst_len)
1787 		return -E2BIG;
1788 
1789 	if (!src || !src_len) {
1790 		memset(dst, 0, dst_len);
1791 		return -EINVAL;
1792 	}
1793 
1794 	memcpy(dst, src, min(dst_len, src_len));
1795 
1796 	if (dst_len > src_len) {
1797 		memset(dst + src_len, '\0', dst_len - src_len);
1798 		return src_len;
1799 	}
1800 
1801 	dst[dst_len - 1] = '\0';
1802 
1803 	return -E2BIG;
1804 }
1805 
1806 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1807 	   char *, buf, size_t, buf_len)
1808 {
1809 	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1810 }
1811 
1812 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1813 	.func		= bpf_sysctl_get_current_value,
1814 	.gpl_only	= false,
1815 	.ret_type	= RET_INTEGER,
1816 	.arg1_type	= ARG_PTR_TO_CTX,
1817 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1818 	.arg3_type	= ARG_CONST_SIZE,
1819 };
1820 
1821 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1822 	   size_t, buf_len)
1823 {
1824 	if (!ctx->write) {
1825 		if (buf && buf_len)
1826 			memset(buf, '\0', buf_len);
1827 		return -EINVAL;
1828 	}
1829 	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1830 }
1831 
1832 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1833 	.func		= bpf_sysctl_get_new_value,
1834 	.gpl_only	= false,
1835 	.ret_type	= RET_INTEGER,
1836 	.arg1_type	= ARG_PTR_TO_CTX,
1837 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1838 	.arg3_type	= ARG_CONST_SIZE,
1839 };
1840 
1841 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1842 	   const char *, buf, size_t, buf_len)
1843 {
1844 	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1845 		return -EINVAL;
1846 
1847 	if (buf_len > PAGE_SIZE - 1)
1848 		return -E2BIG;
1849 
1850 	memcpy(ctx->new_val, buf, buf_len);
1851 	ctx->new_len = buf_len;
1852 	ctx->new_updated = 1;
1853 
1854 	return 0;
1855 }
1856 
1857 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
1858 	.func		= bpf_sysctl_set_new_value,
1859 	.gpl_only	= false,
1860 	.ret_type	= RET_INTEGER,
1861 	.arg1_type	= ARG_PTR_TO_CTX,
1862 	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
1863 	.arg3_type	= ARG_CONST_SIZE,
1864 };
1865 
1866 static const struct bpf_func_proto *
1867 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1868 {
1869 	switch (func_id) {
1870 	case BPF_FUNC_strtol:
1871 		return &bpf_strtol_proto;
1872 	case BPF_FUNC_strtoul:
1873 		return &bpf_strtoul_proto;
1874 	case BPF_FUNC_sysctl_get_name:
1875 		return &bpf_sysctl_get_name_proto;
1876 	case BPF_FUNC_sysctl_get_current_value:
1877 		return &bpf_sysctl_get_current_value_proto;
1878 	case BPF_FUNC_sysctl_get_new_value:
1879 		return &bpf_sysctl_get_new_value_proto;
1880 	case BPF_FUNC_sysctl_set_new_value:
1881 		return &bpf_sysctl_set_new_value_proto;
1882 	case BPF_FUNC_ktime_get_coarse_ns:
1883 		return &bpf_ktime_get_coarse_ns_proto;
1884 	default:
1885 		return cgroup_base_func_proto(func_id, prog);
1886 	}
1887 }
1888 
1889 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
1890 				   const struct bpf_prog *prog,
1891 				   struct bpf_insn_access_aux *info)
1892 {
1893 	const int size_default = sizeof(__u32);
1894 
1895 	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
1896 		return false;
1897 
1898 	switch (off) {
1899 	case bpf_ctx_range(struct bpf_sysctl, write):
1900 		if (type != BPF_READ)
1901 			return false;
1902 		bpf_ctx_record_field_size(info, size_default);
1903 		return bpf_ctx_narrow_access_ok(off, size, size_default);
1904 	case bpf_ctx_range(struct bpf_sysctl, file_pos):
1905 		if (type == BPF_READ) {
1906 			bpf_ctx_record_field_size(info, size_default);
1907 			return bpf_ctx_narrow_access_ok(off, size, size_default);
1908 		} else {
1909 			return size == size_default;
1910 		}
1911 	default:
1912 		return false;
1913 	}
1914 }
1915 
1916 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
1917 				     const struct bpf_insn *si,
1918 				     struct bpf_insn *insn_buf,
1919 				     struct bpf_prog *prog, u32 *target_size)
1920 {
1921 	struct bpf_insn *insn = insn_buf;
1922 	u32 read_size;
1923 
1924 	switch (si->off) {
1925 	case offsetof(struct bpf_sysctl, write):
1926 		*insn++ = BPF_LDX_MEM(
1927 			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
1928 			bpf_target_off(struct bpf_sysctl_kern, write,
1929 				       sizeof_field(struct bpf_sysctl_kern,
1930 						    write),
1931 				       target_size));
1932 		break;
1933 	case offsetof(struct bpf_sysctl, file_pos):
1934 		/* ppos is a pointer, so it must be accessed via indirect
1935 		 * loads and stores. For stores, an additional temporary
1936 		 * register is also needed, since neither src_reg nor dst_reg
1937 		 * may be clobbered.
1938 		 */
1939 		if (type == BPF_WRITE) {
1940 			int treg = BPF_REG_9;
1941 
1942 			if (si->src_reg == treg || si->dst_reg == treg)
1943 				--treg;
1944 			if (si->src_reg == treg || si->dst_reg == treg)
1945 				--treg;
1946 			*insn++ = BPF_STX_MEM(
1947 				BPF_DW, si->dst_reg, treg,
1948 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1949 			*insn++ = BPF_LDX_MEM(
1950 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1951 				treg, si->dst_reg,
1952 				offsetof(struct bpf_sysctl_kern, ppos));
1953 			*insn++ = BPF_STX_MEM(
1954 				BPF_SIZEOF(u32), treg, si->src_reg,
1955 				bpf_ctx_narrow_access_offset(
1956 					0, sizeof(u32), sizeof(loff_t)));
1957 			*insn++ = BPF_LDX_MEM(
1958 				BPF_DW, treg, si->dst_reg,
1959 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1960 		} else {
1961 			*insn++ = BPF_LDX_MEM(
1962 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1963 				si->dst_reg, si->src_reg,
1964 				offsetof(struct bpf_sysctl_kern, ppos));
1965 			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
1966 			*insn++ = BPF_LDX_MEM(
1967 				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
1968 				bpf_ctx_narrow_access_offset(
1969 					0, read_size, sizeof(loff_t)));
1970 		}
1971 		*target_size = sizeof(u32);
1972 		break;
1973 	}
1974 
1975 	return insn - insn_buf;
1976 }
1977 
1978 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
1979 	.get_func_proto		= sysctl_func_proto,
1980 	.is_valid_access	= sysctl_is_valid_access,
1981 	.convert_ctx_access	= sysctl_convert_ctx_access,
1982 };
1983 
1984 const struct bpf_prog_ops cg_sysctl_prog_ops = {
1985 };
1986 
1987 #ifdef CONFIG_NET
1988 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
1989 {
1990 	const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
1991 
1992 	return net->net_cookie;
1993 }
1994 
1995 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
1996 	.func		= bpf_get_netns_cookie_sockopt,
1997 	.gpl_only	= false,
1998 	.ret_type	= RET_INTEGER,
1999 	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
2000 };
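
/* From the program side this is the bpf_get_netns_cookie(ctx) helper; as the
 * implementation above shows, passing NULL instead of the context returns the
 * cookie of init_net.
 */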
2001 #endif
2002 
2003 static const struct bpf_func_proto *
2004 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2005 {
2006 	switch (func_id) {
2007 #ifdef CONFIG_NET
2008 	case BPF_FUNC_get_netns_cookie:
2009 		return &bpf_get_netns_cookie_sockopt_proto;
2010 	case BPF_FUNC_sk_storage_get:
2011 		return &bpf_sk_storage_get_proto;
2012 	case BPF_FUNC_sk_storage_delete:
2013 		return &bpf_sk_storage_delete_proto;
2014 	case BPF_FUNC_setsockopt:
2015 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2016 			return &bpf_sk_setsockopt_proto;
2017 		return NULL;
2018 	case BPF_FUNC_getsockopt:
2019 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2020 			return &bpf_sk_getsockopt_proto;
2021 		return NULL;
2022 #endif
2023 #ifdef CONFIG_INET
2024 	case BPF_FUNC_tcp_sock:
2025 		return &bpf_tcp_sock_proto;
2026 #endif
2027 	default:
2028 		return cgroup_base_func_proto(func_id, prog);
2029 	}
2030 }
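
/* For BPF_CGROUP_SETSOCKOPT programs the BPF_FUNC_setsockopt case above also
 * makes bpf_setsockopt(ctx->sk, ...) callable on the socket itself.  A rough
 * sketch of such a program (the name and policy are illustrative only):
 *
 *	SEC("cgroup/setsockopt")
 *	int block_ip_tos(struct bpf_sockopt *ctx)
 *	{
 *		// Refuse attempts to change IP_TOS, allow everything else.
 *		if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
 *			return 0;	// caller gets -EPERM
 *		return 1;	// run the kernel setsockopt path as usual
 *	}
 */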
2031 
2032 static bool cg_sockopt_is_valid_access(int off, int size,
2033 				       enum bpf_access_type type,
2034 				       const struct bpf_prog *prog,
2035 				       struct bpf_insn_access_aux *info)
2036 {
2037 	const int size_default = sizeof(__u32);
2038 
2039 	if (off < 0 || off >= sizeof(struct bpf_sockopt))
2040 		return false;
2041 
2042 	if (off % size != 0)
2043 		return false;
2044 
2045 	if (type == BPF_WRITE) {
2046 		switch (off) {
2047 		case offsetof(struct bpf_sockopt, retval):
2048 			if (size != size_default)
2049 				return false;
2050 			return prog->expected_attach_type ==
2051 				BPF_CGROUP_GETSOCKOPT;
2052 		case offsetof(struct bpf_sockopt, optname):
2053 			fallthrough;
2054 		case offsetof(struct bpf_sockopt, level):
2055 			if (size != size_default)
2056 				return false;
2057 			return prog->expected_attach_type ==
2058 				BPF_CGROUP_SETSOCKOPT;
2059 		case offsetof(struct bpf_sockopt, optlen):
2060 			return size == size_default;
2061 		default:
2062 			return false;
2063 		}
2064 	}
2065 
2066 	switch (off) {
2067 	case offsetof(struct bpf_sockopt, sk):
2068 		if (size != sizeof(__u64))
2069 			return false;
2070 		info->reg_type = PTR_TO_SOCKET;
2071 		break;
2072 	case offsetof(struct bpf_sockopt, optval):
2073 		if (size != sizeof(__u64))
2074 			return false;
2075 		info->reg_type = PTR_TO_PACKET;
2076 		break;
2077 	case offsetof(struct bpf_sockopt, optval_end):
2078 		if (size != sizeof(__u64))
2079 			return false;
2080 		info->reg_type = PTR_TO_PACKET_END;
2081 		break;
2082 	case offsetof(struct bpf_sockopt, retval):
2083 		if (size != size_default)
2084 			return false;
2085 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2086 	default:
2087 		if (size != size_default)
2088 			return false;
2089 		break;
2090 	}
2091 	return true;
2092 }
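
/* Because optval/optval_end are PTR_TO_PACKET{,_END} above, a program has to
 * bounds-check them before dereferencing, and retval is writable only from
 * BPF_CGROUP_GETSOCKOPT programs.  A rough, illustrative sketch (the program
 * name and the option tweaked below are not taken from this file):
 *
 *	SEC("cgroup/getsockopt")
 *	int getsockopt_peek(struct bpf_sockopt *ctx)
 *	{
 *		__u8 *optval = ctx->optval;
 *		__u8 *optval_end = ctx->optval_end;
 *
 *		if (optval + 1 > optval_end)
 *			return 1;	// not enough data to look at
 *
 *		if (ctx->level == SOL_IP && ctx->optname == IP_TOS &&
 *		    ctx->retval == 0)
 *			optval[0] |= 0x4;	// adjust the returned value
 *
 *		return 1;
 *	}
 */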
2093 
2094 #define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
2095 	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
2096 	  si->dst_reg, si->src_reg,					\
2097 	  offsetof(struct bpf_sockopt_kern, F))
2098 
2099 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
2100 					 const struct bpf_insn *si,
2101 					 struct bpf_insn *insn_buf,
2102 					 struct bpf_prog *prog,
2103 					 u32 *target_size)
2104 {
2105 	struct bpf_insn *insn = insn_buf;
2106 
2107 	switch (si->off) {
2108 	case offsetof(struct bpf_sockopt, sk):
2109 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
2110 		break;
2111 	case offsetof(struct bpf_sockopt, level):
2112 		if (type == BPF_WRITE)
2113 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
2114 		else
2115 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
2116 		break;
2117 	case offsetof(struct bpf_sockopt, optname):
2118 		if (type == BPF_WRITE)
2119 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
2120 		else
2121 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
2122 		break;
2123 	case offsetof(struct bpf_sockopt, optlen):
2124 		if (type == BPF_WRITE)
2125 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
2126 		else
2127 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
2128 		break;
2129 	case offsetof(struct bpf_sockopt, retval):
2130 		BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
2131 
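		/* Unlike the other fields, retval is not stored in
		 * bpf_sockopt_kern but in the bpf_cg_run_ctx set up by the
		 * program runner, so it is reached through
		 * current_task->bpf_ctx.  The BUILD_BUG_ON above ensures
		 * that the task's bpf_ctx pointer (which points at the
		 * embedded run_ctx) can also serve as a pointer to the
		 * containing bpf_cg_run_ctx.
		 */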
2132 		if (type == BPF_WRITE) {
2133 			int treg = BPF_REG_9;
2134 
2135 			if (si->src_reg == treg || si->dst_reg == treg)
2136 				--treg;
2137 			if (si->src_reg == treg || si->dst_reg == treg)
2138 				--treg;
2139 			*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
2140 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2141 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2142 					      treg, si->dst_reg,
2143 					      offsetof(struct bpf_sockopt_kern, current_task));
2144 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2145 					      treg, treg,
2146 					      offsetof(struct task_struct, bpf_ctx));
2147 			*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2148 					      treg, si->src_reg,
2149 					      offsetof(struct bpf_cg_run_ctx, retval));
2150 			*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
2151 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2152 		} else {
2153 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2154 					      si->dst_reg, si->src_reg,
2155 					      offsetof(struct bpf_sockopt_kern, current_task));
2156 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2157 					      si->dst_reg, si->dst_reg,
2158 					      offsetof(struct task_struct, bpf_ctx));
2159 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2160 					      si->dst_reg, si->dst_reg,
2161 					      offsetof(struct bpf_cg_run_ctx, retval));
2162 		}
2163 		break;
2164 	case offsetof(struct bpf_sockopt, optval):
2165 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
2166 		break;
2167 	case offsetof(struct bpf_sockopt, optval_end):
2168 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
2169 		break;
2170 	}
2171 
2172 	return insn - insn_buf;
2173 }
2174 
2175 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
2176 				   bool direct_write,
2177 				   const struct bpf_prog *prog)
2178 {
2179 	/* Nothing to do for the sockopt argument: the data is kzalloc()'ed
2180 	 * kernel memory, so it is already safe to write to directly. */
2181 	return 0;
2182 }
2183 
2184 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2185 	.get_func_proto		= cg_sockopt_func_proto,
2186 	.is_valid_access	= cg_sockopt_is_valid_access,
2187 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
2188 	.gen_prologue		= cg_sockopt_get_prologue,
2189 };
2190 
2191 const struct bpf_prog_ops cg_sockopt_prog_ops = {
2192 };
2193