xref: /openbmc/linux/kernel/bpf/cgroup.c (revision 5edb7691)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Functions to manage eBPF programs attached to cgroups
4  *
5  * Copyright (c) 2016 Daniel Mack
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <net/sock.h>
18 #include <net/bpf_sk_storage.h>
19 
20 #include "../cgroup/cgroup-internal.h"
21 
/* Static key that starts out false. In this file it is incremented once per
 * program attached by __cgroup_bpf_attach() and decremented again when a
 * program is detached, replaced or released.
 */
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 
25 void cgroup_bpf_offline(struct cgroup *cgrp)
26 {
27 	cgroup_get(cgrp);
28 	percpu_ref_kill(&cgrp->bpf.refcnt);
29 }
30 
31 /**
32  * cgroup_bpf_release() - put references of all bpf programs and
33  *                        release all cgroup bpf data
34  * @work: work structure embedded into the cgroup to modify
35  */
36 static void cgroup_bpf_release(struct work_struct *work)
37 {
38 	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
39 					       bpf.release_work);
40 	enum bpf_cgroup_storage_type stype;
41 	struct bpf_prog_array *old_array;
42 	unsigned int type;
43 
44 	mutex_lock(&cgroup_mutex);
45 
46 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
47 		struct list_head *progs = &cgrp->bpf.progs[type];
48 		struct bpf_prog_list *pl, *tmp;
49 
50 		list_for_each_entry_safe(pl, tmp, progs, node) {
51 			list_del(&pl->node);
52 			bpf_prog_put(pl->prog);
53 			for_each_cgroup_storage_type(stype) {
54 				bpf_cgroup_storage_unlink(pl->storage[stype]);
55 				bpf_cgroup_storage_free(pl->storage[stype]);
56 			}
57 			kfree(pl);
58 			static_branch_dec(&cgroup_bpf_enabled_key);
59 		}
60 		old_array = rcu_dereference_protected(
61 				cgrp->bpf.effective[type],
62 				lockdep_is_held(&cgroup_mutex));
63 		bpf_prog_array_free(old_array);
64 	}
65 
66 	mutex_unlock(&cgroup_mutex);
67 
68 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
69 		cgroup_bpf_put(p);
70 
71 	percpu_ref_exit(&cgrp->bpf.refcnt);
72 	cgroup_put(cgrp);
73 }
74 
75 /**
76  * cgroup_bpf_release_fn() - callback used to schedule releasing
77  *                           of bpf cgroup data
78  * @ref: percpu ref counter structure
79  */
80 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
81 {
82 	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
83 
84 	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
85 	queue_work(system_wq, &cgrp->bpf.release_work);
86 }
87 
88 /* count number of elements in the list.
89  * it's slow but the list cannot be long
90  */
91 static u32 prog_list_length(struct list_head *head)
92 {
93 	struct bpf_prog_list *pl;
94 	u32 cnt = 0;
95 
96 	list_for_each_entry(pl, head, node) {
97 		if (!pl->prog)
98 			continue;
99 		cnt++;
100 	}
101 	return cnt;
102 }
103 
104 /* if parent has non-overridable prog attached,
105  * disallow attaching new programs to the descendent cgroup.
106  * if parent has overridable or multi-prog, allow attaching
107  */
108 static bool hierarchy_allows_attach(struct cgroup *cgrp,
109 				    enum bpf_attach_type type)
110 {
111 	struct cgroup *p;
112 
113 	p = cgroup_parent(cgrp);
114 	if (!p)
115 		return true;
116 	do {
117 		u32 flags = p->bpf.flags[type];
118 		u32 cnt;
119 
120 		if (flags & BPF_F_ALLOW_MULTI)
121 			return true;
122 		cnt = prog_list_length(&p->bpf.progs[type]);
123 		WARN_ON_ONCE(cnt > 1);
124 		if (cnt == 1)
125 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
126 		p = cgroup_parent(p);
127 	} while (p);
128 	return true;
129 }
130 
131 /* compute a chain of effective programs for a given cgroup:
132  * start from the list of programs in this cgroup and add
133  * all parent programs.
134  * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
135  * to programs in this cgroup
136  */
137 static int compute_effective_progs(struct cgroup *cgrp,
138 				   enum bpf_attach_type type,
139 				   struct bpf_prog_array **array)
140 {
141 	enum bpf_cgroup_storage_type stype;
142 	struct bpf_prog_array *progs;
143 	struct bpf_prog_list *pl;
144 	struct cgroup *p = cgrp;
145 	int cnt = 0;
146 
147 	/* count number of effective programs by walking parents */
148 	do {
149 		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
150 			cnt += prog_list_length(&p->bpf.progs[type]);
151 		p = cgroup_parent(p);
152 	} while (p);
153 
154 	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
155 	if (!progs)
156 		return -ENOMEM;
157 
158 	/* populate the array with effective progs */
159 	cnt = 0;
160 	p = cgrp;
161 	do {
162 		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
163 			continue;
164 
165 		list_for_each_entry(pl, &p->bpf.progs[type], node) {
166 			if (!pl->prog)
167 				continue;
168 
169 			progs->items[cnt].prog = pl->prog;
170 			for_each_cgroup_storage_type(stype)
171 				progs->items[cnt].cgroup_storage[stype] =
172 					pl->storage[stype];
173 			cnt++;
174 		}
175 	} while ((p = cgroup_parent(p)));
176 
177 	*array = progs;
178 	return 0;
179 }
180 
181 static void activate_effective_progs(struct cgroup *cgrp,
182 				     enum bpf_attach_type type,
183 				     struct bpf_prog_array *old_array)
184 {
185 	old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
186 					lockdep_is_held(&cgroup_mutex));
187 	/* free prog array after grace period, since __cgroup_bpf_run_*()
188 	 * might be still walking the array
189 	 */
190 	bpf_prog_array_free(old_array);
191 }
192 
193 /**
194  * cgroup_bpf_inherit() - inherit effective programs from parent
195  * @cgrp: the cgroup to modify
196  */
197 int cgroup_bpf_inherit(struct cgroup *cgrp)
198 {
199 /* has to use marco instead of const int, since compiler thinks
200  * that array below is variable length
201  */
202 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
203 	struct bpf_prog_array *arrays[NR] = {};
204 	struct cgroup *p;
205 	int ret, i;
206 
207 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
208 			      GFP_KERNEL);
209 	if (ret)
210 		return ret;
211 
212 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
213 		cgroup_bpf_get(p);
214 
215 	for (i = 0; i < NR; i++)
216 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
217 
218 	for (i = 0; i < NR; i++)
219 		if (compute_effective_progs(cgrp, i, &arrays[i]))
220 			goto cleanup;
221 
222 	for (i = 0; i < NR; i++)
223 		activate_effective_progs(cgrp, i, arrays[i]);
224 
225 	return 0;
226 cleanup:
227 	for (i = 0; i < NR; i++)
228 		bpf_prog_array_free(arrays[i]);
229 
230 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
231 		cgroup_bpf_put(p);
232 
233 	percpu_ref_exit(&cgrp->bpf.refcnt);
234 
235 	return -ENOMEM;
236 }
237 
238 static int update_effective_progs(struct cgroup *cgrp,
239 				  enum bpf_attach_type type)
240 {
241 	struct cgroup_subsys_state *css;
242 	int err;
243 
244 	/* allocate and recompute effective prog arrays */
245 	css_for_each_descendant_pre(css, &cgrp->self) {
246 		struct cgroup *desc = container_of(css, struct cgroup, self);
247 
248 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
249 			continue;
250 
251 		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
252 		if (err)
253 			goto cleanup;
254 	}
255 
256 	/* all allocations were successful. Activate all prog arrays */
257 	css_for_each_descendant_pre(css, &cgrp->self) {
258 		struct cgroup *desc = container_of(css, struct cgroup, self);
259 
260 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
261 			if (unlikely(desc->bpf.inactive)) {
262 				bpf_prog_array_free(desc->bpf.inactive);
263 				desc->bpf.inactive = NULL;
264 			}
265 			continue;
266 		}
267 
268 		activate_effective_progs(desc, type, desc->bpf.inactive);
269 		desc->bpf.inactive = NULL;
270 	}
271 
272 	return 0;
273 
274 cleanup:
275 	/* oom while computing effective. Free all computed effective arrays
276 	 * since they were not activated
277 	 */
278 	css_for_each_descendant_pre(css, &cgrp->self) {
279 		struct cgroup *desc = container_of(css, struct cgroup, self);
280 
281 		bpf_prog_array_free(desc->bpf.inactive);
282 		desc->bpf.inactive = NULL;
283 	}
284 
285 	return err;
286 }
287 
288 #define BPF_CGROUP_MAX_PROGS 64
289 
290 /**
291  * __cgroup_bpf_attach() - Attach the program to a cgroup, and
292  *                         propagate the change to descendants
293  * @cgrp: The cgroup which descendants to traverse
294  * @prog: A program to attach
295  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
296  * @type: Type of attach operation
297  * @flags: Option flags
298  *
299  * Must be called with cgroup_mutex held.
300  */
301 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
302 			struct bpf_prog *replace_prog,
303 			enum bpf_attach_type type, u32 flags)
304 {
305 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
306 	struct list_head *progs = &cgrp->bpf.progs[type];
307 	struct bpf_prog *old_prog = NULL;
308 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
309 	struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
310 	struct bpf_prog_list *pl, *replace_pl = NULL;
311 	enum bpf_cgroup_storage_type stype;
312 	int err;
313 
314 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
315 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
316 		/* invalid combination */
317 		return -EINVAL;
318 
319 	if (!hierarchy_allows_attach(cgrp, type))
320 		return -EPERM;
321 
322 	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
323 		/* Disallow attaching non-overridable on top
324 		 * of existing overridable in this cgroup.
325 		 * Disallow attaching multi-prog if overridable or none
326 		 */
327 		return -EPERM;
328 
329 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
330 		return -E2BIG;
331 
332 	if (flags & BPF_F_ALLOW_MULTI) {
333 		list_for_each_entry(pl, progs, node) {
334 			if (pl->prog == prog)
335 				/* disallow attaching the same prog twice */
336 				return -EINVAL;
337 			if (pl->prog == replace_prog)
338 				replace_pl = pl;
339 		}
340 		if ((flags & BPF_F_REPLACE) && !replace_pl)
341 			/* prog to replace not found for cgroup */
342 			return -ENOENT;
343 	} else if (!list_empty(progs)) {
344 		replace_pl = list_first_entry(progs, typeof(*pl), node);
345 	}
346 
347 	for_each_cgroup_storage_type(stype) {
348 		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
349 		if (IS_ERR(storage[stype])) {
350 			storage[stype] = NULL;
351 			for_each_cgroup_storage_type(stype)
352 				bpf_cgroup_storage_free(storage[stype]);
353 			return -ENOMEM;
354 		}
355 	}
356 
357 	if (replace_pl) {
358 		pl = replace_pl;
359 		old_prog = pl->prog;
360 		for_each_cgroup_storage_type(stype) {
361 			old_storage[stype] = pl->storage[stype];
362 			bpf_cgroup_storage_unlink(old_storage[stype]);
363 		}
364 	} else {
365 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
366 		if (!pl) {
367 			for_each_cgroup_storage_type(stype)
368 				bpf_cgroup_storage_free(storage[stype]);
369 			return -ENOMEM;
370 		}
371 		list_add_tail(&pl->node, progs);
372 	}
373 
374 	pl->prog = prog;
375 	for_each_cgroup_storage_type(stype)
376 		pl->storage[stype] = storage[stype];
377 
378 	cgrp->bpf.flags[type] = saved_flags;
379 
380 	err = update_effective_progs(cgrp, type);
381 	if (err)
382 		goto cleanup;
383 
384 	static_branch_inc(&cgroup_bpf_enabled_key);
385 	for_each_cgroup_storage_type(stype) {
386 		if (!old_storage[stype])
387 			continue;
388 		bpf_cgroup_storage_free(old_storage[stype]);
389 	}
390 	if (old_prog) {
391 		bpf_prog_put(old_prog);
392 		static_branch_dec(&cgroup_bpf_enabled_key);
393 	}
394 	for_each_cgroup_storage_type(stype)
395 		bpf_cgroup_storage_link(storage[stype], cgrp, type);
396 	return 0;
397 
398 cleanup:
399 	/* and cleanup the prog list */
400 	pl->prog = old_prog;
401 	for_each_cgroup_storage_type(stype) {
402 		bpf_cgroup_storage_free(pl->storage[stype]);
403 		pl->storage[stype] = old_storage[stype];
404 		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
405 	}
406 	if (!replace_pl) {
407 		list_del(&pl->node);
408 		kfree(pl);
409 	}
410 	return err;
411 }
412 
413 /**
414  * __cgroup_bpf_detach() - Detach the program from a cgroup, and
415  *                         propagate the change to descendants
416  * @cgrp: The cgroup which descendants to traverse
417  * @prog: A program to detach or NULL
418  * @type: Type of detach operation
419  *
420  * Must be called with cgroup_mutex held.
421  */
422 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
423 			enum bpf_attach_type type)
424 {
425 	struct list_head *progs = &cgrp->bpf.progs[type];
426 	enum bpf_cgroup_storage_type stype;
427 	u32 flags = cgrp->bpf.flags[type];
428 	struct bpf_prog *old_prog = NULL;
429 	struct bpf_prog_list *pl;
430 	int err;
431 
432 	if (flags & BPF_F_ALLOW_MULTI) {
433 		if (!prog)
434 			/* to detach MULTI prog the user has to specify valid FD
435 			 * of the program to be detached
436 			 */
437 			return -EINVAL;
438 	} else {
439 		if (list_empty(progs))
440 			/* report error when trying to detach and nothing is attached */
441 			return -ENOENT;
442 	}
443 
444 	if (flags & BPF_F_ALLOW_MULTI) {
445 		/* find the prog and detach it */
446 		list_for_each_entry(pl, progs, node) {
447 			if (pl->prog != prog)
448 				continue;
449 			old_prog = prog;
450 			/* mark it deleted, so it's ignored while
451 			 * recomputing effective
452 			 */
453 			pl->prog = NULL;
454 			break;
455 		}
456 		if (!old_prog)
457 			return -ENOENT;
458 	} else {
459 		/* to maintain backward compatibility NONE and OVERRIDE cgroups
460 		 * allow detaching with invalid FD (prog==NULL)
461 		 */
462 		pl = list_first_entry(progs, typeof(*pl), node);
463 		old_prog = pl->prog;
464 		pl->prog = NULL;
465 	}
466 
467 	err = update_effective_progs(cgrp, type);
468 	if (err)
469 		goto cleanup;
470 
471 	/* now can actually delete it from this cgroup list */
472 	list_del(&pl->node);
473 	for_each_cgroup_storage_type(stype) {
474 		bpf_cgroup_storage_unlink(pl->storage[stype]);
475 		bpf_cgroup_storage_free(pl->storage[stype]);
476 	}
477 	kfree(pl);
478 	if (list_empty(progs))
479 		/* last program was detached, reset flags to zero */
480 		cgrp->bpf.flags[type] = 0;
481 
482 	bpf_prog_put(old_prog);
483 	static_branch_dec(&cgroup_bpf_enabled_key);
484 	return 0;
485 
486 cleanup:
487 	/* and restore back old_prog */
488 	pl->prog = old_prog;
489 	return err;
490 }
491 
492 /* Must be called with cgroup_mutex held to avoid races. */
493 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
494 		       union bpf_attr __user *uattr)
495 {
496 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
497 	enum bpf_attach_type type = attr->query.attach_type;
498 	struct list_head *progs = &cgrp->bpf.progs[type];
499 	u32 flags = cgrp->bpf.flags[type];
500 	struct bpf_prog_array *effective;
501 	int cnt, ret = 0, i;
502 
503 	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
504 					      lockdep_is_held(&cgroup_mutex));
505 
506 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
507 		cnt = bpf_prog_array_length(effective);
508 	else
509 		cnt = prog_list_length(progs);
510 
511 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
512 		return -EFAULT;
513 	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
514 		return -EFAULT;
515 	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
516 		/* return early if user requested only program count + flags */
517 		return 0;
518 	if (attr->query.prog_cnt < cnt) {
519 		cnt = attr->query.prog_cnt;
520 		ret = -ENOSPC;
521 	}
522 
523 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
524 		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
525 	} else {
526 		struct bpf_prog_list *pl;
527 		u32 id;
528 
529 		i = 0;
530 		list_for_each_entry(pl, progs, node) {
531 			id = pl->prog->aux->id;
532 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
533 				return -EFAULT;
534 			if (++i == cnt)
535 				break;
536 		}
537 	}
538 	return ret;
539 }
540 
541 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
542 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
543 {
544 	struct bpf_prog *replace_prog = NULL;
545 	struct cgroup *cgrp;
546 	int ret;
547 
548 	cgrp = cgroup_get_from_fd(attr->target_fd);
549 	if (IS_ERR(cgrp))
550 		return PTR_ERR(cgrp);
551 
552 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
553 	    (attr->attach_flags & BPF_F_REPLACE)) {
554 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
555 		if (IS_ERR(replace_prog)) {
556 			cgroup_put(cgrp);
557 			return PTR_ERR(replace_prog);
558 		}
559 	}
560 
561 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
562 				attr->attach_flags);
563 
564 	if (replace_prog)
565 		bpf_prog_put(replace_prog);
566 	cgroup_put(cgrp);
567 	return ret;
568 }
569 
570 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
571 {
572 	struct bpf_prog *prog;
573 	struct cgroup *cgrp;
574 	int ret;
575 
576 	cgrp = cgroup_get_from_fd(attr->target_fd);
577 	if (IS_ERR(cgrp))
578 		return PTR_ERR(cgrp);
579 
580 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
581 	if (IS_ERR(prog))
582 		prog = NULL;
583 
584 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
585 	if (prog)
586 		bpf_prog_put(prog);
587 
588 	cgroup_put(cgrp);
589 	return ret;
590 }
591 
592 int cgroup_bpf_prog_query(const union bpf_attr *attr,
593 			  union bpf_attr __user *uattr)
594 {
595 	struct cgroup *cgrp;
596 	int ret;
597 
598 	cgrp = cgroup_get_from_fd(attr->query.target_fd);
599 	if (IS_ERR(cgrp))
600 		return PTR_ERR(cgrp);
601 
602 	ret = cgroup_bpf_query(cgrp, attr, uattr);
603 
604 	cgroup_put(cgrp);
605 	return ret;
606 }
607 
608 /**
609  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
610  * @sk: The socket sending or receiving traffic
611  * @skb: The skb that is being sent or received
612  * @type: The type of program to be exectuted
613  *
614  * If no socket is passed, or the socket is not of type INET or INET6,
615  * this function does nothing and returns 0.
616  *
617  * The program type passed in via @type must be suitable for network
618  * filtering. No further check is performed to assert that.
619  *
620  * For egress packets, this function can return:
621  *   NET_XMIT_SUCCESS    (0)	- continue with packet output
622  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
623  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
624  *				  to call cwr
625  *   -EPERM			- drop packet
626  *
627  * For ingress packets, this function will return -EPERM if any
628  * attached program was found and if it returned != 1 during execution.
629  * Otherwise 0 is returned.
630  */
631 int __cgroup_bpf_run_filter_skb(struct sock *sk,
632 				struct sk_buff *skb,
633 				enum bpf_attach_type type)
634 {
635 	unsigned int offset = skb->data - skb_network_header(skb);
636 	struct sock *save_sk;
637 	void *saved_data_end;
638 	struct cgroup *cgrp;
639 	int ret;
640 
641 	if (!sk || !sk_fullsock(sk))
642 		return 0;
643 
644 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
645 		return 0;
646 
647 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
648 	save_sk = skb->sk;
649 	skb->sk = sk;
650 	__skb_push(skb, offset);
651 
652 	/* compute pointers for the bpf prog */
653 	bpf_compute_and_save_data_end(skb, &saved_data_end);
654 
655 	if (type == BPF_CGROUP_INET_EGRESS) {
656 		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
657 			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
658 	} else {
659 		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
660 					  __bpf_prog_run_save_cb);
661 		ret = (ret == 1 ? 0 : -EPERM);
662 	}
663 	bpf_restore_data_end(skb, saved_data_end);
664 	__skb_pull(skb, offset);
665 	skb->sk = save_sk;
666 
667 	return ret;
668 }
669 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
670 
671 /**
672  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
673  * @sk: sock structure to manipulate
674  * @type: The type of program to be exectuted
675  *
676  * socket is passed is expected to be of type INET or INET6.
677  *
678  * The program type passed in via @type must be suitable for sock
679  * filtering. No further check is performed to assert that.
680  *
681  * This function will return %-EPERM if any if an attached program was found
682  * and if it returned != 1 during execution. In all other cases, 0 is returned.
683  */
684 int __cgroup_bpf_run_filter_sk(struct sock *sk,
685 			       enum bpf_attach_type type)
686 {
687 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
688 	int ret;
689 
690 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
691 	return ret == 1 ? 0 : -EPERM;
692 }
693 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
694 
695 /**
696  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
697  *                                       provided by user sockaddr
698  * @sk: sock struct that will use sockaddr
699  * @uaddr: sockaddr struct provided by user
700  * @type: The type of program to be exectuted
701  * @t_ctx: Pointer to attach type specific context
702  *
703  * socket is expected to be of type INET or INET6.
704  *
705  * This function will return %-EPERM if an attached program is found and
706  * returned value != 1 during execution. In all other cases, 0 is returned.
707  */
708 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
709 				      struct sockaddr *uaddr,
710 				      enum bpf_attach_type type,
711 				      void *t_ctx)
712 {
713 	struct bpf_sock_addr_kern ctx = {
714 		.sk = sk,
715 		.uaddr = uaddr,
716 		.t_ctx = t_ctx,
717 	};
718 	struct sockaddr_storage unspec;
719 	struct cgroup *cgrp;
720 	int ret;
721 
722 	/* Check socket family since not all sockets represent network
723 	 * endpoint (e.g. AF_UNIX).
724 	 */
725 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
726 		return 0;
727 
728 	if (!ctx.uaddr) {
729 		memset(&unspec, 0, sizeof(unspec));
730 		ctx.uaddr = (struct sockaddr *)&unspec;
731 	}
732 
733 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
734 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
735 
736 	return ret == 1 ? 0 : -EPERM;
737 }
738 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
739 
740 /**
741  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
742  * @sk: socket to get cgroup from
743  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
744  * sk with connection information (IP addresses, etc.) May not contain
745  * cgroup info if it is a req sock.
746  * @type: The type of program to be exectuted
747  *
748  * socket passed is expected to be of type INET or INET6.
749  *
750  * The program type passed in via @type must be suitable for sock_ops
751  * filtering. No further check is performed to assert that.
752  *
753  * This function will return %-EPERM if any if an attached program was found
754  * and if it returned != 1 during execution. In all other cases, 0 is returned.
755  */
756 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
757 				     struct bpf_sock_ops_kern *sock_ops,
758 				     enum bpf_attach_type type)
759 {
760 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
761 	int ret;
762 
763 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
764 				 BPF_PROG_RUN);
765 	return ret == 1 ? 0 : -EPERM;
766 }
767 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
768 
769 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
770 				      short access, enum bpf_attach_type type)
771 {
772 	struct cgroup *cgrp;
773 	struct bpf_cgroup_dev_ctx ctx = {
774 		.access_type = (access << 16) | dev_type,
775 		.major = major,
776 		.minor = minor,
777 	};
778 	int allow = 1;
779 
780 	rcu_read_lock();
781 	cgrp = task_dfl_cgroup(current);
782 	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
783 				   BPF_PROG_RUN);
784 	rcu_read_unlock();
785 
786 	return !allow;
787 }
788 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
789 
790 static const struct bpf_func_proto *
791 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
792 {
793 	switch (func_id) {
794 	case BPF_FUNC_map_lookup_elem:
795 		return &bpf_map_lookup_elem_proto;
796 	case BPF_FUNC_map_update_elem:
797 		return &bpf_map_update_elem_proto;
798 	case BPF_FUNC_map_delete_elem:
799 		return &bpf_map_delete_elem_proto;
800 	case BPF_FUNC_map_push_elem:
801 		return &bpf_map_push_elem_proto;
802 	case BPF_FUNC_map_pop_elem:
803 		return &bpf_map_pop_elem_proto;
804 	case BPF_FUNC_map_peek_elem:
805 		return &bpf_map_peek_elem_proto;
806 	case BPF_FUNC_get_current_uid_gid:
807 		return &bpf_get_current_uid_gid_proto;
808 	case BPF_FUNC_get_local_storage:
809 		return &bpf_get_local_storage_proto;
810 	case BPF_FUNC_get_current_cgroup_id:
811 		return &bpf_get_current_cgroup_id_proto;
812 	case BPF_FUNC_trace_printk:
813 		if (capable(CAP_SYS_ADMIN))
814 			return bpf_get_trace_printk_proto();
815 		/* fall through */
816 	default:
817 		return NULL;
818 	}
819 }
820 
821 static const struct bpf_func_proto *
822 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
823 {
824 	return cgroup_base_func_proto(func_id, prog);
825 }
826 
827 static bool cgroup_dev_is_valid_access(int off, int size,
828 				       enum bpf_access_type type,
829 				       const struct bpf_prog *prog,
830 				       struct bpf_insn_access_aux *info)
831 {
832 	const int size_default = sizeof(__u32);
833 
834 	if (type == BPF_WRITE)
835 		return false;
836 
837 	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
838 		return false;
839 	/* The verifier guarantees that size > 0. */
840 	if (off % size != 0)
841 		return false;
842 
843 	switch (off) {
844 	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
845 		bpf_ctx_record_field_size(info, size_default);
846 		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
847 			return false;
848 		break;
849 	default:
850 		if (size != size_default)
851 			return false;
852 	}
853 
854 	return true;
855 }
856 
857 const struct bpf_prog_ops cg_dev_prog_ops = {
858 };
859 
860 const struct bpf_verifier_ops cg_dev_verifier_ops = {
861 	.get_func_proto		= cgroup_dev_func_proto,
862 	.is_valid_access	= cgroup_dev_is_valid_access,
863 };
864 
865 /**
866  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
867  *
868  * @head: sysctl table header
869  * @table: sysctl table
870  * @write: sysctl is being read (= 0) or written (= 1)
871  * @buf: pointer to buffer passed by user space
872  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
873  *	result is size of @new_buf if program set new value, initial value
874  *	otherwise
875  * @ppos: value-result argument: value is position at which read from or write
876  *	to sysctl is happening, result is new position if program overrode it,
877  *	initial value otherwise
878  * @new_buf: pointer to pointer to new buffer that will be allocated if program
879  *	overrides new value provided by user space on sysctl write
880  *	NOTE: it's caller responsibility to free *new_buf if it was set
881  * @type: type of program to be executed
882  *
883  * Program is run when sysctl is being accessed, either read or written, and
884  * can allow or deny such access.
885  *
886  * This function will return %-EPERM if an attached program is found and
887  * returned value != 1 during execution. In all other cases 0 is returned.
888  */
889 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
890 				   struct ctl_table *table, int write,
891 				   void __user *buf, size_t *pcount,
892 				   loff_t *ppos, void **new_buf,
893 				   enum bpf_attach_type type)
894 {
895 	struct bpf_sysctl_kern ctx = {
896 		.head = head,
897 		.table = table,
898 		.write = write,
899 		.ppos = ppos,
900 		.cur_val = NULL,
901 		.cur_len = PAGE_SIZE,
902 		.new_val = NULL,
903 		.new_len = 0,
904 		.new_updated = 0,
905 	};
906 	struct cgroup *cgrp;
907 	int ret;
908 
909 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
910 	if (ctx.cur_val) {
911 		mm_segment_t old_fs;
912 		loff_t pos = 0;
913 
914 		old_fs = get_fs();
915 		set_fs(KERNEL_DS);
916 		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
917 					&ctx.cur_len, &pos)) {
918 			/* Let BPF program decide how to proceed. */
919 			ctx.cur_len = 0;
920 		}
921 		set_fs(old_fs);
922 	} else {
923 		/* Let BPF program decide how to proceed. */
924 		ctx.cur_len = 0;
925 	}
926 
927 	if (write && buf && *pcount) {
928 		/* BPF program should be able to override new value with a
929 		 * buffer bigger than provided by user.
930 		 */
931 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
932 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
933 		if (!ctx.new_val ||
934 		    copy_from_user(ctx.new_val, buf, ctx.new_len))
935 			/* Let BPF program decide how to proceed. */
936 			ctx.new_len = 0;
937 	}
938 
939 	rcu_read_lock();
940 	cgrp = task_dfl_cgroup(current);
941 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
942 	rcu_read_unlock();
943 
944 	kfree(ctx.cur_val);
945 
946 	if (ret == 1 && ctx.new_updated) {
947 		*new_buf = ctx.new_val;
948 		*pcount = ctx.new_len;
949 	} else {
950 		kfree(ctx.new_val);
951 	}
952 
953 	return ret == 1 ? 0 : -EPERM;
954 }
955 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
956 
957 #ifdef CONFIG_NET
958 static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
959 					     enum bpf_attach_type attach_type)
960 {
961 	struct bpf_prog_array *prog_array;
962 	bool empty;
963 
964 	rcu_read_lock();
965 	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
966 	empty = bpf_prog_array_is_empty(prog_array);
967 	rcu_read_unlock();
968 
969 	return empty;
970 }
971 
972 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
973 {
974 	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
975 		return -EINVAL;
976 
977 	ctx->optval = kzalloc(max_optlen, GFP_USER);
978 	if (!ctx->optval)
979 		return -ENOMEM;
980 
981 	ctx->optval_end = ctx->optval + max_optlen;
982 
983 	return 0;
984 }
985 
986 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
987 {
988 	kfree(ctx->optval);
989 }
990 
991 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
992 				       int *optname, char __user *optval,
993 				       int *optlen, char **kernel_optval)
994 {
995 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
996 	struct bpf_sockopt_kern ctx = {
997 		.sk = sk,
998 		.level = *level,
999 		.optname = *optname,
1000 	};
1001 	int ret, max_optlen;
1002 
1003 	/* Opportunistic check to see whether we have any BPF program
1004 	 * attached to the hook so we don't waste time allocating
1005 	 * memory and locking the socket.
1006 	 */
1007 	if (!cgroup_bpf_enabled ||
1008 	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
1009 		return 0;
1010 
1011 	/* Allocate a bit more than the initial user buffer for
1012 	 * BPF program. The canonical use case is overriding
1013 	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1014 	 */
1015 	max_optlen = max_t(int, 16, *optlen);
1016 
1017 	ret = sockopt_alloc_buf(&ctx, max_optlen);
1018 	if (ret)
1019 		return ret;
1020 
1021 	ctx.optlen = *optlen;
1022 
1023 	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
1024 		ret = -EFAULT;
1025 		goto out;
1026 	}
1027 
1028 	lock_sock(sk);
1029 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
1030 				 &ctx, BPF_PROG_RUN);
1031 	release_sock(sk);
1032 
1033 	if (!ret) {
1034 		ret = -EPERM;
1035 		goto out;
1036 	}
1037 
1038 	if (ctx.optlen == -1) {
1039 		/* optlen set to -1, bypass kernel */
1040 		ret = 1;
1041 	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1042 		/* optlen is out of bounds */
1043 		ret = -EFAULT;
1044 	} else {
1045 		/* optlen within bounds, run kernel handler */
1046 		ret = 0;
1047 
1048 		/* export any potential modifications */
1049 		*level = ctx.level;
1050 		*optname = ctx.optname;
1051 		*optlen = ctx.optlen;
1052 		*kernel_optval = ctx.optval;
1053 	}
1054 
1055 out:
1056 	if (ret)
1057 		sockopt_free_buf(&ctx);
1058 	return ret;
1059 }
1060 EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
1061 
1062 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1063 				       int optname, char __user *optval,
1064 				       int __user *optlen, int max_optlen,
1065 				       int retval)
1066 {
1067 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1068 	struct bpf_sockopt_kern ctx = {
1069 		.sk = sk,
1070 		.level = level,
1071 		.optname = optname,
1072 		.retval = retval,
1073 	};
1074 	int ret;
1075 
1076 	/* Opportunistic check to see whether we have any BPF program
1077 	 * attached to the hook so we don't waste time allocating
1078 	 * memory and locking the socket.
1079 	 */
1080 	if (!cgroup_bpf_enabled ||
1081 	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
1082 		return retval;
1083 
1084 	ret = sockopt_alloc_buf(&ctx, max_optlen);
1085 	if (ret)
1086 		return ret;
1087 
1088 	ctx.optlen = max_optlen;
1089 
1090 	if (!retval) {
1091 		/* If kernel getsockopt finished successfully,
1092 		 * copy whatever was returned to the user back
1093 		 * into our temporary buffer. Set optlen to the
1094 		 * one that kernel returned as well to let
1095 		 * BPF programs inspect the value.
1096 		 */
1097 
1098 		if (get_user(ctx.optlen, optlen)) {
1099 			ret = -EFAULT;
1100 			goto out;
1101 		}
1102 
1103 		if (ctx.optlen > max_optlen)
1104 			ctx.optlen = max_optlen;
1105 
1106 		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
1107 			ret = -EFAULT;
1108 			goto out;
1109 		}
1110 	}
1111 
1112 	lock_sock(sk);
1113 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
1114 				 &ctx, BPF_PROG_RUN);
1115 	release_sock(sk);
1116 
1117 	if (!ret) {
1118 		ret = -EPERM;
1119 		goto out;
1120 	}
1121 
1122 	if (ctx.optlen > max_optlen) {
1123 		ret = -EFAULT;
1124 		goto out;
1125 	}
1126 
1127 	/* BPF programs only allowed to set retval to 0, not some
1128 	 * arbitrary value.
1129 	 */
1130 	if (ctx.retval != 0 && ctx.retval != retval) {
1131 		ret = -EFAULT;
1132 		goto out;
1133 	}
1134 
1135 	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1136 	    put_user(ctx.optlen, optlen)) {
1137 		ret = -EFAULT;
1138 		goto out;
1139 	}
1140 
1141 	ret = ctx.retval;
1142 
1143 out:
1144 	sockopt_free_buf(&ctx);
1145 	return ret;
1146 }
1147 EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
1148 #endif
1149 
1150 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1151 			      size_t *lenp)
1152 {
1153 	ssize_t tmp_ret = 0, ret;
1154 
1155 	if (dir->header.parent) {
1156 		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1157 		if (tmp_ret < 0)
1158 			return tmp_ret;
1159 	}
1160 
1161 	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1162 	if (ret < 0)
1163 		return ret;
1164 	*bufp += ret;
1165 	*lenp -= ret;
1166 	ret += tmp_ret;
1167 
1168 	/* Avoid leading slash. */
1169 	if (!ret)
1170 		return ret;
1171 
1172 	tmp_ret = strscpy(*bufp, "/", *lenp);
1173 	if (tmp_ret < 0)
1174 		return tmp_ret;
1175 	*bufp += tmp_ret;
1176 	*lenp -= tmp_ret;
1177 
1178 	return ret + tmp_ret;
1179 }
1180 
1181 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1182 	   size_t, buf_len, u64, flags)
1183 {
1184 	ssize_t tmp_ret = 0, ret;
1185 
1186 	if (!buf)
1187 		return -EINVAL;
1188 
1189 	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1190 		if (!ctx->head)
1191 			return -EINVAL;
1192 		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1193 		if (tmp_ret < 0)
1194 			return tmp_ret;
1195 	}
1196 
1197 	ret = strscpy(buf, ctx->table->procname, buf_len);
1198 
1199 	return ret < 0 ? ret : tmp_ret + ret;
1200 }
1201 
1202 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1203 	.func		= bpf_sysctl_get_name,
1204 	.gpl_only	= false,
1205 	.ret_type	= RET_INTEGER,
1206 	.arg1_type	= ARG_PTR_TO_CTX,
1207 	.arg2_type	= ARG_PTR_TO_MEM,
1208 	.arg3_type	= ARG_CONST_SIZE,
1209 	.arg4_type	= ARG_ANYTHING,
1210 };
1211 
/* Copy a sysctl value of @src_len bytes from @src into the @dst_len
 * byte buffer @dst.
 *
 * Returns:
 *   @src_len - value fits with room to spare; the remainder of @dst is
 *              zero-filled
 *   -E2BIG   - @dst_len is 0, or the value does not fit with a trailing
 *              NUL; in the latter case @dst holds a truncated copy whose
 *              last byte is forced to '\0'
 *   -EINVAL  - @dst is NULL, or there is no source value (@src NULL or
 *              @src_len 0), in which case @dst is zero-filled
 */
static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	if (!dst)
		return -EINVAL;

	if (dst_len == 0)
		return -E2BIG;

	if (!src || src_len == 0) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	if (src_len < dst_len) {
		/* Whole value fits: copy it and clear the tail. */
		memcpy(dst, src, src_len);
		memset(dst + src_len, '\0', dst_len - src_len);
		return src_len;
	}

	/* Value is too long: keep what fits and terminate it. */
	memcpy(dst, src, dst_len);
	dst[dst_len - 1] = '\0';

	return -E2BIG;
}
1237 
1238 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1239 	   char *, buf, size_t, buf_len)
1240 {
1241 	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1242 }
1243 
1244 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1245 	.func		= bpf_sysctl_get_current_value,
1246 	.gpl_only	= false,
1247 	.ret_type	= RET_INTEGER,
1248 	.arg1_type	= ARG_PTR_TO_CTX,
1249 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1250 	.arg3_type	= ARG_CONST_SIZE,
1251 };
1252 
1253 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1254 	   size_t, buf_len)
1255 {
1256 	if (!ctx->write) {
1257 		if (buf && buf_len)
1258 			memset(buf, '\0', buf_len);
1259 		return -EINVAL;
1260 	}
1261 	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1262 }
1263 
1264 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1265 	.func		= bpf_sysctl_get_new_value,
1266 	.gpl_only	= false,
1267 	.ret_type	= RET_INTEGER,
1268 	.arg1_type	= ARG_PTR_TO_CTX,
1269 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1270 	.arg3_type	= ARG_CONST_SIZE,
1271 };
1272 
1273 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1274 	   const char *, buf, size_t, buf_len)
1275 {
1276 	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1277 		return -EINVAL;
1278 
1279 	if (buf_len > PAGE_SIZE - 1)
1280 		return -E2BIG;
1281 
1282 	memcpy(ctx->new_val, buf, buf_len);
1283 	ctx->new_len = buf_len;
1284 	ctx->new_updated = 1;
1285 
1286 	return 0;
1287 }
1288 
1289 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
1290 	.func		= bpf_sysctl_set_new_value,
1291 	.gpl_only	= false,
1292 	.ret_type	= RET_INTEGER,
1293 	.arg1_type	= ARG_PTR_TO_CTX,
1294 	.arg2_type	= ARG_PTR_TO_MEM,
1295 	.arg3_type	= ARG_CONST_SIZE,
1296 };
1297 
1298 static const struct bpf_func_proto *
1299 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1300 {
1301 	switch (func_id) {
1302 	case BPF_FUNC_strtol:
1303 		return &bpf_strtol_proto;
1304 	case BPF_FUNC_strtoul:
1305 		return &bpf_strtoul_proto;
1306 	case BPF_FUNC_sysctl_get_name:
1307 		return &bpf_sysctl_get_name_proto;
1308 	case BPF_FUNC_sysctl_get_current_value:
1309 		return &bpf_sysctl_get_current_value_proto;
1310 	case BPF_FUNC_sysctl_get_new_value:
1311 		return &bpf_sysctl_get_new_value_proto;
1312 	case BPF_FUNC_sysctl_set_new_value:
1313 		return &bpf_sysctl_set_new_value_proto;
1314 	default:
1315 		return cgroup_base_func_proto(func_id, prog);
1316 	}
1317 }
1318 
1319 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
1320 				   const struct bpf_prog *prog,
1321 				   struct bpf_insn_access_aux *info)
1322 {
1323 	const int size_default = sizeof(__u32);
1324 
1325 	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
1326 		return false;
1327 
1328 	switch (off) {
1329 	case bpf_ctx_range(struct bpf_sysctl, write):
1330 		if (type != BPF_READ)
1331 			return false;
1332 		bpf_ctx_record_field_size(info, size_default);
1333 		return bpf_ctx_narrow_access_ok(off, size, size_default);
1334 	case bpf_ctx_range(struct bpf_sysctl, file_pos):
1335 		if (type == BPF_READ) {
1336 			bpf_ctx_record_field_size(info, size_default);
1337 			return bpf_ctx_narrow_access_ok(off, size, size_default);
1338 		} else {
1339 			return size == size_default;
1340 		}
1341 	default:
1342 		return false;
1343 	}
1344 }
1345 
1346 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
1347 				     const struct bpf_insn *si,
1348 				     struct bpf_insn *insn_buf,
1349 				     struct bpf_prog *prog, u32 *target_size)
1350 {
1351 	struct bpf_insn *insn = insn_buf;
1352 	u32 read_size;
1353 
1354 	switch (si->off) {
1355 	case offsetof(struct bpf_sysctl, write):
1356 		*insn++ = BPF_LDX_MEM(
1357 			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
1358 			bpf_target_off(struct bpf_sysctl_kern, write,
1359 				       sizeof_field(struct bpf_sysctl_kern,
1360 						    write),
1361 				       target_size));
1362 		break;
1363 	case offsetof(struct bpf_sysctl, file_pos):
1364 		/* ppos is a pointer so it should be accessed via indirect
1365 		 * loads and stores. Also for stores additional temporary
1366 		 * register is used since neither src_reg nor dst_reg can be
1367 		 * overridden.
1368 		 */
1369 		if (type == BPF_WRITE) {
1370 			int treg = BPF_REG_9;
1371 
1372 			if (si->src_reg == treg || si->dst_reg == treg)
1373 				--treg;
1374 			if (si->src_reg == treg || si->dst_reg == treg)
1375 				--treg;
1376 			*insn++ = BPF_STX_MEM(
1377 				BPF_DW, si->dst_reg, treg,
1378 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1379 			*insn++ = BPF_LDX_MEM(
1380 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1381 				treg, si->dst_reg,
1382 				offsetof(struct bpf_sysctl_kern, ppos));
1383 			*insn++ = BPF_STX_MEM(
1384 				BPF_SIZEOF(u32), treg, si->src_reg,
1385 				bpf_ctx_narrow_access_offset(
1386 					0, sizeof(u32), sizeof(loff_t)));
1387 			*insn++ = BPF_LDX_MEM(
1388 				BPF_DW, treg, si->dst_reg,
1389 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1390 		} else {
1391 			*insn++ = BPF_LDX_MEM(
1392 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1393 				si->dst_reg, si->src_reg,
1394 				offsetof(struct bpf_sysctl_kern, ppos));
1395 			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
1396 			*insn++ = BPF_LDX_MEM(
1397 				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
1398 				bpf_ctx_narrow_access_offset(
1399 					0, read_size, sizeof(loff_t)));
1400 		}
1401 		*target_size = sizeof(u32);
1402 		break;
1403 	}
1404 
1405 	return insn - insn_buf;
1406 }
1407 
1408 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
1409 	.get_func_proto		= sysctl_func_proto,
1410 	.is_valid_access	= sysctl_is_valid_access,
1411 	.convert_ctx_access	= sysctl_convert_ctx_access,
1412 };
1413 
1414 const struct bpf_prog_ops cg_sysctl_prog_ops = {
1415 };
1416 
1417 static const struct bpf_func_proto *
1418 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1419 {
1420 	switch (func_id) {
1421 #ifdef CONFIG_NET
1422 	case BPF_FUNC_sk_storage_get:
1423 		return &bpf_sk_storage_get_proto;
1424 	case BPF_FUNC_sk_storage_delete:
1425 		return &bpf_sk_storage_delete_proto;
1426 #endif
1427 #ifdef CONFIG_INET
1428 	case BPF_FUNC_tcp_sock:
1429 		return &bpf_tcp_sock_proto;
1430 #endif
1431 	default:
1432 		return cgroup_base_func_proto(func_id, prog);
1433 	}
1434 }
1435 
1436 static bool cg_sockopt_is_valid_access(int off, int size,
1437 				       enum bpf_access_type type,
1438 				       const struct bpf_prog *prog,
1439 				       struct bpf_insn_access_aux *info)
1440 {
1441 	const int size_default = sizeof(__u32);
1442 
1443 	if (off < 0 || off >= sizeof(struct bpf_sockopt))
1444 		return false;
1445 
1446 	if (off % size != 0)
1447 		return false;
1448 
1449 	if (type == BPF_WRITE) {
1450 		switch (off) {
1451 		case offsetof(struct bpf_sockopt, retval):
1452 			if (size != size_default)
1453 				return false;
1454 			return prog->expected_attach_type ==
1455 				BPF_CGROUP_GETSOCKOPT;
1456 		case offsetof(struct bpf_sockopt, optname):
1457 			/* fallthrough */
1458 		case offsetof(struct bpf_sockopt, level):
1459 			if (size != size_default)
1460 				return false;
1461 			return prog->expected_attach_type ==
1462 				BPF_CGROUP_SETSOCKOPT;
1463 		case offsetof(struct bpf_sockopt, optlen):
1464 			return size == size_default;
1465 		default:
1466 			return false;
1467 		}
1468 	}
1469 
1470 	switch (off) {
1471 	case offsetof(struct bpf_sockopt, sk):
1472 		if (size != sizeof(__u64))
1473 			return false;
1474 		info->reg_type = PTR_TO_SOCKET;
1475 		break;
1476 	case offsetof(struct bpf_sockopt, optval):
1477 		if (size != sizeof(__u64))
1478 			return false;
1479 		info->reg_type = PTR_TO_PACKET;
1480 		break;
1481 	case offsetof(struct bpf_sockopt, optval_end):
1482 		if (size != sizeof(__u64))
1483 			return false;
1484 		info->reg_type = PTR_TO_PACKET_END;
1485 		break;
1486 	case offsetof(struct bpf_sockopt, retval):
1487 		if (size != size_default)
1488 			return false;
1489 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
1490 	default:
1491 		if (size != size_default)
1492 			return false;
1493 		break;
1494 	}
1495 	return true;
1496 }
1497 
/* Build one instruction with the constructor macro T (BPF_LDX_MEM or
 * BPF_STX_MEM in this file) that accesses field F of struct
 * bpf_sockopt_kern, sized to that field. Expects a 'si' instruction
 * pointer in the enclosing scope for the destination/source registers.
 */
#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), si->dst_reg,	\
	  si->src_reg, offsetof(struct bpf_sockopt_kern, F))
1502 
1503 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
1504 					 const struct bpf_insn *si,
1505 					 struct bpf_insn *insn_buf,
1506 					 struct bpf_prog *prog,
1507 					 u32 *target_size)
1508 {
1509 	struct bpf_insn *insn = insn_buf;
1510 
1511 	switch (si->off) {
1512 	case offsetof(struct bpf_sockopt, sk):
1513 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
1514 		break;
1515 	case offsetof(struct bpf_sockopt, level):
1516 		if (type == BPF_WRITE)
1517 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
1518 		else
1519 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
1520 		break;
1521 	case offsetof(struct bpf_sockopt, optname):
1522 		if (type == BPF_WRITE)
1523 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
1524 		else
1525 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
1526 		break;
1527 	case offsetof(struct bpf_sockopt, optlen):
1528 		if (type == BPF_WRITE)
1529 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
1530 		else
1531 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
1532 		break;
1533 	case offsetof(struct bpf_sockopt, retval):
1534 		if (type == BPF_WRITE)
1535 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
1536 		else
1537 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
1538 		break;
1539 	case offsetof(struct bpf_sockopt, optval):
1540 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
1541 		break;
1542 	case offsetof(struct bpf_sockopt, optval_end):
1543 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
1544 		break;
1545 	}
1546 
1547 	return insn - insn_buf;
1548 }
1549 
/* Prologue hook for cgroup sockopt programs (installed as .gen_prologue
 * in cg_sockopt_verifier_ops). Writes nothing to @insn_buf and always
 * returns 0.
 */
static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
				   bool direct_write,
				   const struct bpf_prog *prog)
{
	/* The sockopt argument needs no preparation: its data comes
	 * from kzalloc().
	 */
	return 0;
}
1558 
1559 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
1560 	.get_func_proto		= cg_sockopt_func_proto,
1561 	.is_valid_access	= cg_sockopt_is_valid_access,
1562 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
1563 	.gen_prologue		= cg_sockopt_get_prologue,
1564 };
1565 
1566 const struct bpf_prog_ops cg_sockopt_prog_ops = {
1567 };
1568