xref: /openbmc/linux/kernel/bpf/cgroup.c (revision 7dd68b32)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Functions to manage eBPF programs attached to cgroups
4  *
5  * Copyright (c) 2016 Daniel Mack
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <net/sock.h>
18 #include <net/bpf_sk_storage.h>
19 
20 #include "../cgroup/cgroup-internal.h"
21 
22 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
23 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 
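/* Take the cgroup's bpf data offline: kill the percpu refcount so that the
 * final put schedules cgroup_bpf_release() via cgroup_bpf_release_fn().
 * The cgroup reference taken here is dropped in cgroup_bpf_release().
 */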
25 void cgroup_bpf_offline(struct cgroup *cgrp)
26 {
27 	cgroup_get(cgrp);
28 	percpu_ref_kill(&cgrp->bpf.refcnt);
29 }
30 
31 /**
32  * cgroup_bpf_release() - put references of all bpf programs and
33  *                        release all cgroup bpf data
34  * @work: work structure embedded into the cgroup to modify
35  */
36 static void cgroup_bpf_release(struct work_struct *work)
37 {
38 	struct cgroup *cgrp = container_of(work, struct cgroup,
39 					   bpf.release_work);
40 	enum bpf_cgroup_storage_type stype;
41 	struct bpf_prog_array *old_array;
42 	unsigned int type;
43 
44 	mutex_lock(&cgroup_mutex);
45 
46 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
47 		struct list_head *progs = &cgrp->bpf.progs[type];
48 		struct bpf_prog_list *pl, *tmp;
49 
50 		list_for_each_entry_safe(pl, tmp, progs, node) {
51 			list_del(&pl->node);
52 			bpf_prog_put(pl->prog);
53 			for_each_cgroup_storage_type(stype) {
54 				bpf_cgroup_storage_unlink(pl->storage[stype]);
55 				bpf_cgroup_storage_free(pl->storage[stype]);
56 			}
57 			kfree(pl);
58 			static_branch_dec(&cgroup_bpf_enabled_key);
59 		}
60 		old_array = rcu_dereference_protected(
61 				cgrp->bpf.effective[type],
62 				lockdep_is_held(&cgroup_mutex));
63 		bpf_prog_array_free(old_array);
64 	}
65 
66 	mutex_unlock(&cgroup_mutex);
67 
68 	percpu_ref_exit(&cgrp->bpf.refcnt);
69 	cgroup_put(cgrp);
70 }
71 
72 /**
73  * cgroup_bpf_release_fn() - callback used to schedule releasing
74  *                           of bpf cgroup data
75  * @ref: percpu ref counter structure
76  */
77 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
78 {
79 	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
80 
81 	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
82 	queue_work(system_wq, &cgrp->bpf.release_work);
83 }
84 
85 /* count number of elements in the list.
86  * it's slow but the list cannot be long
87  */
88 static u32 prog_list_length(struct list_head *head)
89 {
90 	struct bpf_prog_list *pl;
91 	u32 cnt = 0;
92 
93 	list_for_each_entry(pl, head, node) {
94 		if (!pl->prog)
95 			continue;
96 		cnt++;
97 	}
98 	return cnt;
99 }
100 
101 /* if parent has non-overridable prog attached,
102  * disallow attaching new programs to the descendant cgroup.
103  * if parent has overridable or multi-prog, allow attaching
104  */
105 static bool hierarchy_allows_attach(struct cgroup *cgrp,
106 				    enum bpf_attach_type type)
107 {
108 	struct cgroup *p;
109 
110 	p = cgroup_parent(cgrp);
111 	if (!p)
112 		return true;
113 	do {
114 		u32 flags = p->bpf.flags[type];
115 		u32 cnt;
116 
117 		if (flags & BPF_F_ALLOW_MULTI)
118 			return true;
119 		cnt = prog_list_length(&p->bpf.progs[type]);
120 		WARN_ON_ONCE(cnt > 1);
121 		if (cnt == 1)
122 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
123 		p = cgroup_parent(p);
124 	} while (p);
125 	return true;
126 }
127 
128 /* compute a chain of effective programs for a given cgroup:
129  * start from the list of programs in this cgroup and add
130  * all parent programs.
131  * Note that parent's F_ALLOW_OVERRIDE-type program yields
132  * to programs in this cgroup
133  */
134 static int compute_effective_progs(struct cgroup *cgrp,
135 				   enum bpf_attach_type type,
136 				   struct bpf_prog_array **array)
137 {
138 	enum bpf_cgroup_storage_type stype;
139 	struct bpf_prog_array *progs;
140 	struct bpf_prog_list *pl;
141 	struct cgroup *p = cgrp;
142 	int cnt = 0;
143 
144 	/* count number of effective programs by walking parents */
145 	do {
146 		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
147 			cnt += prog_list_length(&p->bpf.progs[type]);
148 		p = cgroup_parent(p);
149 	} while (p);
150 
151 	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
152 	if (!progs)
153 		return -ENOMEM;
154 
155 	/* populate the array with effective progs */
156 	cnt = 0;
157 	p = cgrp;
158 	do {
159 		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
160 			continue;
161 
162 		list_for_each_entry(pl, &p->bpf.progs[type], node) {
163 			if (!pl->prog)
164 				continue;
165 
166 			progs->items[cnt].prog = pl->prog;
167 			for_each_cgroup_storage_type(stype)
168 				progs->items[cnt].cgroup_storage[stype] =
169 					pl->storage[stype];
170 			cnt++;
171 		}
172 	} while ((p = cgroup_parent(p)));
173 
174 	*array = progs;
175 	return 0;
176 }
177 
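/* Publish a newly computed effective prog array for the given attach type.
 * The old array is swapped out under cgroup_mutex and freed once readers
 * are done with it.
 */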
178 static void activate_effective_progs(struct cgroup *cgrp,
179 				     enum bpf_attach_type type,
180 				     struct bpf_prog_array *old_array)
181 {
182 	old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
183 					lockdep_is_held(&cgroup_mutex));
184 	/* free prog array after grace period, since __cgroup_bpf_run_*()
185 	 * might still be walking the array
186 	 */
187 	bpf_prog_array_free(old_array);
188 }
189 
190 /**
191  * cgroup_bpf_inherit() - inherit effective programs from parent
192  * @cgrp: the cgroup to modify
193  */
194 int cgroup_bpf_inherit(struct cgroup *cgrp)
195 {
196 /* has to use a macro instead of const int, since the compiler thinks
197  * that the array below is variable length
198  */
199 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
200 	struct bpf_prog_array *arrays[NR] = {};
201 	int ret, i;
202 
203 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
204 			      GFP_KERNEL);
205 	if (ret)
206 		return ret;
207 
208 	for (i = 0; i < NR; i++)
209 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
210 
211 	for (i = 0; i < NR; i++)
212 		if (compute_effective_progs(cgrp, i, &arrays[i]))
213 			goto cleanup;
214 
215 	for (i = 0; i < NR; i++)
216 		activate_effective_progs(cgrp, i, arrays[i]);
217 
218 	return 0;
219 cleanup:
220 	for (i = 0; i < NR; i++)
221 		bpf_prog_array_free(arrays[i]);
222 
223 	percpu_ref_exit(&cgrp->bpf.refcnt);
224 
225 	return -ENOMEM;
226 }
227 
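/* Recompute and activate effective prog arrays for @cgrp and all of its
 * descendants for the given attach type. Called with cgroup_mutex held.
 * On allocation failure nothing is activated and the error is returned.
 */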
228 static int update_effective_progs(struct cgroup *cgrp,
229 				  enum bpf_attach_type type)
230 {
231 	struct cgroup_subsys_state *css;
232 	int err;
233 
234 	/* allocate and recompute effective prog arrays */
235 	css_for_each_descendant_pre(css, &cgrp->self) {
236 		struct cgroup *desc = container_of(css, struct cgroup, self);
237 
238 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
239 			continue;
240 
241 		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
242 		if (err)
243 			goto cleanup;
244 	}
245 
246 	/* all allocations were successful. Activate all prog arrays */
247 	css_for_each_descendant_pre(css, &cgrp->self) {
248 		struct cgroup *desc = container_of(css, struct cgroup, self);
249 
250 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
251 			if (unlikely(desc->bpf.inactive)) {
252 				bpf_prog_array_free(desc->bpf.inactive);
253 				desc->bpf.inactive = NULL;
254 			}
255 			continue;
256 		}
257 
258 		activate_effective_progs(desc, type, desc->bpf.inactive);
259 		desc->bpf.inactive = NULL;
260 	}
261 
262 	return 0;
263 
264 cleanup:
265 	/* oom while computing effective. Free all computed effective arrays
266 	 * since they were not activated
267 	 */
268 	css_for_each_descendant_pre(css, &cgrp->self) {
269 		struct cgroup *desc = container_of(css, struct cgroup, self);
270 
271 		bpf_prog_array_free(desc->bpf.inactive);
272 		desc->bpf.inactive = NULL;
273 	}
274 
275 	return err;
276 }
277 
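/* Maximum number of programs that can be attached to a cgroup for a single
 * attach type.
 */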
278 #define BPF_CGROUP_MAX_PROGS 64
279 
280 /**
281  * __cgroup_bpf_attach() - Attach the program to a cgroup, and
282  *                         propagate the change to descendants
283  * @cgrp: The cgroup whose descendants to traverse
284  * @prog: A program to attach
285  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
286  * @type: Type of attach operation
287  * @flags: Option flags
288  *
289  * Must be called with cgroup_mutex held.
290  */
291 int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
292 			struct bpf_prog *replace_prog,
293 			enum bpf_attach_type type, u32 flags)
294 {
295 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
296 	struct list_head *progs = &cgrp->bpf.progs[type];
297 	struct bpf_prog *old_prog = NULL;
298 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
299 		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
300 	struct bpf_prog_list *pl, *replace_pl = NULL;
301 	enum bpf_cgroup_storage_type stype;
302 	int err;
303 
304 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
305 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
306 		/* invalid combination */
307 		return -EINVAL;
308 
309 	if (!hierarchy_allows_attach(cgrp, type))
310 		return -EPERM;
311 
312 	if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
313 		/* Disallow attaching non-overridable on top
314 		 * of existing overridable in this cgroup.
315 		 * Disallow attaching multi-prog if overridable or none
316 		 */
317 		return -EPERM;
318 
319 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
320 		return -E2BIG;
321 
322 	if (flags & BPF_F_ALLOW_MULTI) {
323 		list_for_each_entry(pl, progs, node) {
324 			if (pl->prog == prog)
325 				/* disallow attaching the same prog twice */
326 				return -EINVAL;
327 			if (pl->prog == replace_prog)
328 				replace_pl = pl;
329 		}
330 		if ((flags & BPF_F_REPLACE) && !replace_pl)
331 			/* prog to replace not found for cgroup */
332 			return -ENOENT;
333 	} else if (!list_empty(progs)) {
334 		replace_pl = list_first_entry(progs, typeof(*pl), node);
335 	}
336 
337 	for_each_cgroup_storage_type(stype) {
338 		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
339 		if (IS_ERR(storage[stype])) {
340 			storage[stype] = NULL;
341 			for_each_cgroup_storage_type(stype)
342 				bpf_cgroup_storage_free(storage[stype]);
343 			return -ENOMEM;
344 		}
345 	}
346 
347 	if (replace_pl) {
348 		pl = replace_pl;
349 		old_prog = pl->prog;
350 		for_each_cgroup_storage_type(stype) {
351 			old_storage[stype] = pl->storage[stype];
352 			bpf_cgroup_storage_unlink(old_storage[stype]);
353 		}
354 	} else {
355 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
356 		if (!pl) {
357 			for_each_cgroup_storage_type(stype)
358 				bpf_cgroup_storage_free(storage[stype]);
359 			return -ENOMEM;
360 		}
361 		list_add_tail(&pl->node, progs);
362 	}
363 
364 	pl->prog = prog;
365 	for_each_cgroup_storage_type(stype)
366 		pl->storage[stype] = storage[stype];
367 
368 	cgrp->bpf.flags[type] = saved_flags;
369 
370 	err = update_effective_progs(cgrp, type);
371 	if (err)
372 		goto cleanup;
373 
374 	static_branch_inc(&cgroup_bpf_enabled_key);
375 	for_each_cgroup_storage_type(stype) {
376 		if (!old_storage[stype])
377 			continue;
378 		bpf_cgroup_storage_free(old_storage[stype]);
379 	}
380 	if (old_prog) {
381 		bpf_prog_put(old_prog);
382 		static_branch_dec(&cgroup_bpf_enabled_key);
383 	}
384 	for_each_cgroup_storage_type(stype)
385 		bpf_cgroup_storage_link(storage[stype], cgrp, type);
386 	return 0;
387 
388 cleanup:
389 	/* and cleanup the prog list */
390 	pl->prog = old_prog;
391 	for_each_cgroup_storage_type(stype) {
392 		bpf_cgroup_storage_free(pl->storage[stype]);
393 		pl->storage[stype] = old_storage[stype];
394 		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
395 	}
396 	if (!replace_pl) {
397 		list_del(&pl->node);
398 		kfree(pl);
399 	}
400 	return err;
401 }
402 
403 /**
404  * __cgroup_bpf_detach() - Detach the program from a cgroup, and
405  *                         propagate the change to descendants
406  * @cgrp: The cgroup whose descendants to traverse
407  * @prog: A program to detach or NULL
408  * @type: Type of detach operation
409  *
410  * Must be called with cgroup_mutex held.
411  */
412 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
413 			enum bpf_attach_type type)
414 {
415 	struct list_head *progs = &cgrp->bpf.progs[type];
416 	enum bpf_cgroup_storage_type stype;
417 	u32 flags = cgrp->bpf.flags[type];
418 	struct bpf_prog *old_prog = NULL;
419 	struct bpf_prog_list *pl;
420 	int err;
421 
422 	if (flags & BPF_F_ALLOW_MULTI) {
423 		if (!prog)
424 			/* to detach MULTI prog the user has to specify a valid FD
425 			 * of the program to be detached
426 			 */
427 			return -EINVAL;
428 	} else {
429 		if (list_empty(progs))
430 			/* report error when trying to detach and nothing is attached */
431 			return -ENOENT;
432 	}
433 
434 	if (flags & BPF_F_ALLOW_MULTI) {
435 		/* find the prog and detach it */
436 		list_for_each_entry(pl, progs, node) {
437 			if (pl->prog != prog)
438 				continue;
439 			old_prog = prog;
440 			/* mark it deleted, so it's ignored while
441 			 * recomputing effective
442 			 */
443 			pl->prog = NULL;
444 			break;
445 		}
446 		if (!old_prog)
447 			return -ENOENT;
448 	} else {
449 		/* to maintain backward compatibility NONE and OVERRIDE cgroups
450 		 * allow detaching with invalid FD (prog==NULL)
451 		 */
452 		pl = list_first_entry(progs, typeof(*pl), node);
453 		old_prog = pl->prog;
454 		pl->prog = NULL;
455 	}
456 
457 	err = update_effective_progs(cgrp, type);
458 	if (err)
459 		goto cleanup;
460 
461 	/* now can actually delete it from this cgroup list */
462 	list_del(&pl->node);
463 	for_each_cgroup_storage_type(stype) {
464 		bpf_cgroup_storage_unlink(pl->storage[stype]);
465 		bpf_cgroup_storage_free(pl->storage[stype]);
466 	}
467 	kfree(pl);
468 	if (list_empty(progs))
469 		/* last program was detached, reset flags to zero */
470 		cgrp->bpf.flags[type] = 0;
471 
472 	bpf_prog_put(old_prog);
473 	static_branch_dec(&cgroup_bpf_enabled_key);
474 	return 0;
475 
476 cleanup:
477 	/* and restore back old_prog */
478 	pl->prog = old_prog;
479 	return err;
480 }
481 
482 /* Must be called with cgroup_mutex held to avoid races. */
483 int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
484 		       union bpf_attr __user *uattr)
485 {
486 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
487 	enum bpf_attach_type type = attr->query.attach_type;
488 	struct list_head *progs = &cgrp->bpf.progs[type];
489 	u32 flags = cgrp->bpf.flags[type];
490 	struct bpf_prog_array *effective;
491 	int cnt, ret = 0, i;
492 
493 	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
494 					      lockdep_is_held(&cgroup_mutex));
495 
496 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
497 		cnt = bpf_prog_array_length(effective);
498 	else
499 		cnt = prog_list_length(progs);
500 
501 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
502 		return -EFAULT;
503 	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
504 		return -EFAULT;
505 	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
506 		/* return early if user requested only program count + flags */
507 		return 0;
508 	if (attr->query.prog_cnt < cnt) {
509 		cnt = attr->query.prog_cnt;
510 		ret = -ENOSPC;
511 	}
512 
513 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
514 		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
515 	} else {
516 		struct bpf_prog_list *pl;
517 		u32 id;
518 
519 		i = 0;
520 		list_for_each_entry(pl, progs, node) {
521 			id = pl->prog->aux->id;
522 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
523 				return -EFAULT;
524 			if (++i == cnt)
525 				break;
526 		}
527 	}
528 	return ret;
529 }
530 
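/* Resolve the target cgroup (and, if BPF_F_ALLOW_MULTI and BPF_F_REPLACE are
 * both set, the program to be replaced) from the syscall attributes and
 * attach @prog via cgroup_bpf_attach().
 */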
531 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
532 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
533 {
534 	struct bpf_prog *replace_prog = NULL;
535 	struct cgroup *cgrp;
536 	int ret;
537 
538 	cgrp = cgroup_get_from_fd(attr->target_fd);
539 	if (IS_ERR(cgrp))
540 		return PTR_ERR(cgrp);
541 
542 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
543 	    (attr->attach_flags & BPF_F_REPLACE)) {
544 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
545 		if (IS_ERR(replace_prog)) {
546 			cgroup_put(cgrp);
547 			return PTR_ERR(replace_prog);
548 		}
549 	}
550 
551 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
552 				attr->attach_flags);
553 
554 	if (replace_prog)
555 		bpf_prog_put(replace_prog);
556 	cgroup_put(cgrp);
557 	return ret;
558 }
559 
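/* Resolve the target cgroup and, if possible, the program from the syscall
 * attributes and detach via cgroup_bpf_detach(). An invalid program FD is
 * tolerated; __cgroup_bpf_detach() decides whether a NULL program is allowed.
 */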
560 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
561 {
562 	struct bpf_prog *prog;
563 	struct cgroup *cgrp;
564 	int ret;
565 
566 	cgrp = cgroup_get_from_fd(attr->target_fd);
567 	if (IS_ERR(cgrp))
568 		return PTR_ERR(cgrp);
569 
570 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
571 	if (IS_ERR(prog))
572 		prog = NULL;
573 
574 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
575 	if (prog)
576 		bpf_prog_put(prog);
577 
578 	cgroup_put(cgrp);
579 	return ret;
580 }
581 
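/* Resolve the target cgroup from the syscall attributes and run the query
 * via cgroup_bpf_query().
 */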
582 int cgroup_bpf_prog_query(const union bpf_attr *attr,
583 			  union bpf_attr __user *uattr)
584 {
585 	struct cgroup *cgrp;
586 	int ret;
587 
588 	cgrp = cgroup_get_from_fd(attr->query.target_fd);
589 	if (IS_ERR(cgrp))
590 		return PTR_ERR(cgrp);
591 
592 	ret = cgroup_bpf_query(cgrp, attr, uattr);
593 
594 	cgroup_put(cgrp);
595 	return ret;
596 }
597 
598 /**
599  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
600  * @sk: The socket sending or receiving traffic
601  * @skb: The skb that is being sent or received
602  * @type: The type of program to be executed
603  *
604  * If no socket is passed, or the socket is not of type INET or INET6,
605  * this function does nothing and returns 0.
606  *
607  * The program type passed in via @type must be suitable for network
608  * filtering. No further check is performed to assert that.
609  *
610  * For egress packets, this function can return:
611  *   NET_XMIT_SUCCESS    (0)	- continue with packet output
612  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
613  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
614  *				  to call cwr
615  *   -EPERM			- drop packet
616  *
617  * For ingress packets, this function will return -EPERM if any
617  * attached program was found and it returned != 1 during execution.
619  * Otherwise 0 is returned.
620  */
621 int __cgroup_bpf_run_filter_skb(struct sock *sk,
622 				struct sk_buff *skb,
623 				enum bpf_attach_type type)
624 {
625 	unsigned int offset = skb->data - skb_network_header(skb);
626 	struct sock *save_sk;
627 	void *saved_data_end;
628 	struct cgroup *cgrp;
629 	int ret;
630 
631 	if (!sk || !sk_fullsock(sk))
632 		return 0;
633 
634 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
635 		return 0;
636 
637 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
638 	save_sk = skb->sk;
639 	skb->sk = sk;
640 	__skb_push(skb, offset);
641 
642 	/* compute pointers for the bpf prog */
643 	bpf_compute_and_save_data_end(skb, &saved_data_end);
644 
645 	if (type == BPF_CGROUP_INET_EGRESS) {
646 		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
647 			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
648 	} else {
649 		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
650 					  __bpf_prog_run_save_cb);
651 		ret = (ret == 1 ? 0 : -EPERM);
652 	}
653 	bpf_restore_data_end(skb, saved_data_end);
654 	__skb_pull(skb, offset);
655 	skb->sk = save_sk;
656 
657 	return ret;
658 }
659 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
660 
661 /**
662  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
663  * @sk: sock structure to manipulate
664  * @type: The type of program to be executed
665  *
666  * The socket passed is expected to be of type INET or INET6.
667  *
668  * The program type passed in via @type must be suitable for sock
669  * filtering. No further check is performed to assert that.
670  *
671  * This function will return %-EPERM if an attached program was found
672  * and it returned != 1 during execution. In all other cases, 0 is returned.
673  */
674 int __cgroup_bpf_run_filter_sk(struct sock *sk,
675 			       enum bpf_attach_type type)
676 {
677 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
678 	int ret;
679 
680 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
681 	return ret == 1 ? 0 : -EPERM;
682 }
683 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
684 
685 /**
686  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
687  *                                       a sockaddr provided by user space
688  * @sk: sock struct that will use sockaddr
689  * @uaddr: sockaddr struct provided by user
690  * @type: The type of program to be executed
691  * @t_ctx: Pointer to attach type specific context
692  *
693  * The socket is expected to be of type INET or INET6.
694  *
695  * This function will return %-EPERM if an attached program is found and
696  * returned value != 1 during execution. In all other cases, 0 is returned.
697  */
698 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
699 				      struct sockaddr *uaddr,
700 				      enum bpf_attach_type type,
701 				      void *t_ctx)
702 {
703 	struct bpf_sock_addr_kern ctx = {
704 		.sk = sk,
705 		.uaddr = uaddr,
706 		.t_ctx = t_ctx,
707 	};
708 	struct sockaddr_storage unspec;
709 	struct cgroup *cgrp;
710 	int ret;
711 
712 	/* Check socket family since not all sockets represent a network
713 	 * endpoint (e.g. AF_UNIX).
714 	 */
715 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
716 		return 0;
717 
718 	if (!ctx.uaddr) {
719 		memset(&unspec, 0, sizeof(unspec));
720 		ctx.uaddr = (struct sockaddr *)&unspec;
721 	}
722 
723 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
724 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
725 
726 	return ret == 1 ? 0 : -EPERM;
727 }
728 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
729 
730 /**
731  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
732  * @sk: socket to get cgroup from
733  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
734  * sk with connection information (IP addresses, etc.) May not contain
735  * cgroup info if it is a req sock.
736  * @type: The type of program to be executed
737  *
738  * The socket passed is expected to be of type INET or INET6.
739  *
740  * The program type passed in via @type must be suitable for sock_ops
741  * filtering. No further check is performed to assert that.
742  *
743  * This function will return %-EPERM if an attached program was found
744  * and it returned != 1 during execution. In all other cases, 0 is returned.
745  */
746 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
747 				     struct bpf_sock_ops_kern *sock_ops,
748 				     enum bpf_attach_type type)
749 {
750 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
751 	int ret;
752 
753 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
754 				 BPF_PROG_RUN);
755 	return ret == 1 ? 0 : -EPERM;
756 }
757 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
758 
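/* Run the BPF_CGROUP_DEVICE programs of the current task's cgroup for the
 * requested device access. Returns 0 if access is allowed, non-zero otherwise.
 */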
759 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
760 				      short access, enum bpf_attach_type type)
761 {
762 	struct cgroup *cgrp;
763 	struct bpf_cgroup_dev_ctx ctx = {
764 		.access_type = (access << 16) | dev_type,
765 		.major = major,
766 		.minor = minor,
767 	};
768 	int allow = 1;
769 
770 	rcu_read_lock();
771 	cgrp = task_dfl_cgroup(current);
772 	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
773 				   BPF_PROG_RUN);
774 	rcu_read_unlock();
775 
776 	return !allow;
777 }
778 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
779 
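/* Base set of helpers available to all cgroup-attached program types; the
 * per-type *_func_proto() callbacks below fall back to this set.
 */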
780 static const struct bpf_func_proto *
781 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
782 {
783 	switch (func_id) {
784 	case BPF_FUNC_map_lookup_elem:
785 		return &bpf_map_lookup_elem_proto;
786 	case BPF_FUNC_map_update_elem:
787 		return &bpf_map_update_elem_proto;
788 	case BPF_FUNC_map_delete_elem:
789 		return &bpf_map_delete_elem_proto;
790 	case BPF_FUNC_map_push_elem:
791 		return &bpf_map_push_elem_proto;
792 	case BPF_FUNC_map_pop_elem:
793 		return &bpf_map_pop_elem_proto;
794 	case BPF_FUNC_map_peek_elem:
795 		return &bpf_map_peek_elem_proto;
796 	case BPF_FUNC_get_current_uid_gid:
797 		return &bpf_get_current_uid_gid_proto;
798 	case BPF_FUNC_get_local_storage:
799 		return &bpf_get_local_storage_proto;
800 	case BPF_FUNC_get_current_cgroup_id:
801 		return &bpf_get_current_cgroup_id_proto;
802 	case BPF_FUNC_trace_printk:
803 		if (capable(CAP_SYS_ADMIN))
804 			return bpf_get_trace_printk_proto();
805 		/* fall through */
806 	default:
807 		return NULL;
808 	}
809 }
810 
811 static const struct bpf_func_proto *
812 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
813 {
814 	return cgroup_base_func_proto(func_id, prog);
815 }
816 
817 static bool cgroup_dev_is_valid_access(int off, int size,
818 				       enum bpf_access_type type,
819 				       const struct bpf_prog *prog,
820 				       struct bpf_insn_access_aux *info)
821 {
822 	const int size_default = sizeof(__u32);
823 
824 	if (type == BPF_WRITE)
825 		return false;
826 
827 	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
828 		return false;
829 	/* The verifier guarantees that size > 0. */
830 	if (off % size != 0)
831 		return false;
832 
833 	switch (off) {
834 	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
835 		bpf_ctx_record_field_size(info, size_default);
836 		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
837 			return false;
838 		break;
839 	default:
840 		if (size != size_default)
841 			return false;
842 	}
843 
844 	return true;
845 }
846 
847 const struct bpf_prog_ops cg_dev_prog_ops = {
848 };
849 
850 const struct bpf_verifier_ops cg_dev_verifier_ops = {
851 	.get_func_proto		= cgroup_dev_func_proto,
852 	.is_valid_access	= cgroup_dev_is_valid_access,
853 };
854 
855 /**
856  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
857  *
858  * @head: sysctl table header
859  * @table: sysctl table
860  * @write: sysctl is being read (= 0) or written (= 1)
861  * @buf: pointer to buffer passed by user space
862  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
863  *	result is size of @new_buf if program set new value, initial value
864  *	otherwise
865  * @ppos: value-result argument: value is position at which read from or write
866  *	to sysctl is happening, result is new position if program overrode it,
867  *	initial value otherwise
868  * @new_buf: pointer to pointer to new buffer that will be allocated if program
869  *	overrides new value provided by user space on sysctl write
870  *	NOTE: it's the caller's responsibility to free *new_buf if it was set
871  * @type: type of program to be executed
872  *
873  * Program is run when sysctl is being accessed, either read or written, and
874  * can allow or deny such access.
875  *
876  * This function will return %-EPERM if an attached program is found and
877  * returned value != 1 during execution. In all other cases 0 is returned.
878  */
879 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
880 				   struct ctl_table *table, int write,
881 				   void __user *buf, size_t *pcount,
882 				   loff_t *ppos, void **new_buf,
883 				   enum bpf_attach_type type)
884 {
885 	struct bpf_sysctl_kern ctx = {
886 		.head = head,
887 		.table = table,
888 		.write = write,
889 		.ppos = ppos,
890 		.cur_val = NULL,
891 		.cur_len = PAGE_SIZE,
892 		.new_val = NULL,
893 		.new_len = 0,
894 		.new_updated = 0,
895 	};
896 	struct cgroup *cgrp;
897 	int ret;
898 
899 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
900 	if (ctx.cur_val) {
901 		mm_segment_t old_fs;
902 		loff_t pos = 0;
903 
904 		old_fs = get_fs();
905 		set_fs(KERNEL_DS);
906 		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
907 					&ctx.cur_len, &pos)) {
908 			/* Let BPF program decide how to proceed. */
909 			ctx.cur_len = 0;
910 		}
911 		set_fs(old_fs);
912 	} else {
913 		/* Let BPF program decide how to proceed. */
914 		ctx.cur_len = 0;
915 	}
916 
917 	if (write && buf && *pcount) {
918 		/* BPF program should be able to override new value with a
919 		 * buffer bigger than provided by user.
920 		 */
921 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
922 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
923 		if (!ctx.new_val ||
924 		    copy_from_user(ctx.new_val, buf, ctx.new_len))
925 			/* Let BPF program decide how to proceed. */
926 			ctx.new_len = 0;
927 	}
928 
929 	rcu_read_lock();
930 	cgrp = task_dfl_cgroup(current);
931 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
932 	rcu_read_unlock();
933 
934 	kfree(ctx.cur_val);
935 
936 	if (ret == 1 && ctx.new_updated) {
937 		*new_buf = ctx.new_val;
938 		*pcount = ctx.new_len;
939 	} else {
940 		kfree(ctx.new_val);
941 	}
942 
943 	return ret == 1 ? 0 : -EPERM;
944 }
945 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
946 
947 #ifdef CONFIG_NET
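/* Check under RCU whether the effective prog array for @attach_type is
 * empty, so the sockopt hooks can bail out early without allocating buffers
 * or locking the socket.
 */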
948 static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
949 					     enum bpf_attach_type attach_type)
950 {
951 	struct bpf_prog_array *prog_array;
952 	bool empty;
953 
954 	rcu_read_lock();
955 	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
956 	empty = bpf_prog_array_is_empty(prog_array);
957 	rcu_read_unlock();
958 
959 	return empty;
960 }
961 
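/* Allocate the temporary optval buffer (at most PAGE_SIZE bytes) that BPF
 * programs access through ctx->optval .. ctx->optval_end.
 */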
962 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
963 {
964 	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
965 		return -EINVAL;
966 
967 	ctx->optval = kzalloc(max_optlen, GFP_USER);
968 	if (!ctx->optval)
969 		return -ENOMEM;
970 
971 	ctx->optval_end = ctx->optval + max_optlen;
972 
973 	return 0;
974 }
975 
976 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
977 {
978 	kfree(ctx->optval);
979 }
980 
981 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
982 				       int *optname, char __user *optval,
983 				       int *optlen, char **kernel_optval)
984 {
985 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
986 	struct bpf_sockopt_kern ctx = {
987 		.sk = sk,
988 		.level = *level,
989 		.optname = *optname,
990 	};
991 	int ret, max_optlen;
992 
993 	/* Opportunistic check to see whether we have any BPF program
994 	 * attached to the hook so we don't waste time allocating
995 	 * memory and locking the socket.
996 	 */
997 	if (!cgroup_bpf_enabled ||
998 	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
999 		return 0;
1000 
1001 	/* Allocate a bit more than the initial user buffer for
1002 	 * the BPF program. The canonical use case is overriding
1003 	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1004 	 */
1005 	max_optlen = max_t(int, 16, *optlen);
1006 
1007 	ret = sockopt_alloc_buf(&ctx, max_optlen);
1008 	if (ret)
1009 		return ret;
1010 
1011 	ctx.optlen = *optlen;
1012 
1013 	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
1014 		ret = -EFAULT;
1015 		goto out;
1016 	}
1017 
1018 	lock_sock(sk);
1019 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
1020 				 &ctx, BPF_PROG_RUN);
1021 	release_sock(sk);
1022 
1023 	if (!ret) {
1024 		ret = -EPERM;
1025 		goto out;
1026 	}
1027 
1028 	if (ctx.optlen == -1) {
1029 		/* optlen set to -1, bypass kernel */
1030 		ret = 1;
1031 	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1032 		/* optlen is out of bounds */
1033 		ret = -EFAULT;
1034 	} else {
1035 		/* optlen within bounds, run kernel handler */
1036 		ret = 0;
1037 
1038 		/* export any potential modifications */
1039 		*level = ctx.level;
1040 		*optname = ctx.optname;
1041 		*optlen = ctx.optlen;
1042 		*kernel_optval = ctx.optval;
1043 	}
1044 
1045 out:
1046 	if (ret)
1047 		sockopt_free_buf(&ctx);
1048 	return ret;
1049 }
1050 EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
1051 
1052 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1053 				       int optname, char __user *optval,
1054 				       int __user *optlen, int max_optlen,
1055 				       int retval)
1056 {
1057 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1058 	struct bpf_sockopt_kern ctx = {
1059 		.sk = sk,
1060 		.level = level,
1061 		.optname = optname,
1062 		.retval = retval,
1063 	};
1064 	int ret;
1065 
1066 	/* Opportunistic check to see whether we have any BPF program
1067 	 * attached to the hook so we don't waste time allocating
1068 	 * memory and locking the socket.
1069 	 */
1070 	if (!cgroup_bpf_enabled ||
1071 	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
1072 		return retval;
1073 
1074 	ret = sockopt_alloc_buf(&ctx, max_optlen);
1075 	if (ret)
1076 		return ret;
1077 
1078 	ctx.optlen = max_optlen;
1079 
1080 	if (!retval) {
1081 		/* If kernel getsockopt finished successfully,
1082 		 * copy whatever was returned to the user back
1083 		 * into our temporary buffer. Set optlen to the
1084 		 * one that kernel returned as well to let
1085 		 * BPF programs inspect the value.
1086 		 */
1087 
1088 		if (get_user(ctx.optlen, optlen)) {
1089 			ret = -EFAULT;
1090 			goto out;
1091 		}
1092 
1093 		if (ctx.optlen > max_optlen)
1094 			ctx.optlen = max_optlen;
1095 
1096 		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
1097 			ret = -EFAULT;
1098 			goto out;
1099 		}
1100 	}
1101 
1102 	lock_sock(sk);
1103 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
1104 				 &ctx, BPF_PROG_RUN);
1105 	release_sock(sk);
1106 
1107 	if (!ret) {
1108 		ret = -EPERM;
1109 		goto out;
1110 	}
1111 
1112 	if (ctx.optlen > max_optlen) {
1113 		ret = -EFAULT;
1114 		goto out;
1115 	}
1116 
1117 	/* BPF programs are only allowed to set retval to 0, not some
1118 	 * arbitrary value.
1119 	 */
1120 	if (ctx.retval != 0 && ctx.retval != retval) {
1121 		ret = -EFAULT;
1122 		goto out;
1123 	}
1124 
1125 	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1126 	    put_user(ctx.optlen, optlen)) {
1127 		ret = -EFAULT;
1128 		goto out;
1129 	}
1130 
1131 	ret = ctx.retval;
1132 
1133 out:
1134 	sockopt_free_buf(&ctx);
1135 	return ret;
1136 }
1137 EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
1138 #endif
1139 
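/* Recursively copy the sysctl directory path into *bufp, outermost component
 * first, appending "/" after each component. Returns the number of bytes
 * written (excluding the terminating NUL) or a negative error.
 */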
1140 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1141 			      size_t *lenp)
1142 {
1143 	ssize_t tmp_ret = 0, ret;
1144 
1145 	if (dir->header.parent) {
1146 		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1147 		if (tmp_ret < 0)
1148 			return tmp_ret;
1149 	}
1150 
1151 	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1152 	if (ret < 0)
1153 		return ret;
1154 	*bufp += ret;
1155 	*lenp -= ret;
1156 	ret += tmp_ret;
1157 
1158 	/* Avoid leading slash. */
1159 	if (!ret)
1160 		return ret;
1161 
1162 	tmp_ret = strscpy(*bufp, "/", *lenp);
1163 	if (tmp_ret < 0)
1164 		return tmp_ret;
1165 	*bufp += tmp_ret;
1166 	*lenp -= tmp_ret;
1167 
1168 	return ret + tmp_ret;
1169 }
1170 
1171 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1172 	   size_t, buf_len, u64, flags)
1173 {
1174 	ssize_t tmp_ret = 0, ret;
1175 
1176 	if (!buf)
1177 		return -EINVAL;
1178 
1179 	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1180 		if (!ctx->head)
1181 			return -EINVAL;
1182 		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1183 		if (tmp_ret < 0)
1184 			return tmp_ret;
1185 	}
1186 
1187 	ret = strscpy(buf, ctx->table->procname, buf_len);
1188 
1189 	return ret < 0 ? ret : tmp_ret + ret;
1190 }
1191 
1192 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1193 	.func		= bpf_sysctl_get_name,
1194 	.gpl_only	= false,
1195 	.ret_type	= RET_INTEGER,
1196 	.arg1_type	= ARG_PTR_TO_CTX,
1197 	.arg2_type	= ARG_PTR_TO_MEM,
1198 	.arg3_type	= ARG_CONST_SIZE,
1199 	.arg4_type	= ARG_ANYTHING,
1200 };
1201 
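/* Copy the sysctl value from @src into @dst, NUL-padding or NUL-terminating
 * as needed. Returns @src_len if the whole value fit, -E2BIG if it had to be
 * truncated, and -EINVAL if @dst is NULL or there is no source value (in the
 * latter case @dst is zeroed).
 */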
1202 static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
1203 			     size_t src_len)
1204 {
1205 	if (!dst)
1206 		return -EINVAL;
1207 
1208 	if (!dst_len)
1209 		return -E2BIG;
1210 
1211 	if (!src || !src_len) {
1212 		memset(dst, 0, dst_len);
1213 		return -EINVAL;
1214 	}
1215 
1216 	memcpy(dst, src, min(dst_len, src_len));
1217 
1218 	if (dst_len > src_len) {
1219 		memset(dst + src_len, '\0', dst_len - src_len);
1220 		return src_len;
1221 	}
1222 
1223 	dst[dst_len - 1] = '\0';
1224 
1225 	return -E2BIG;
1226 }
1227 
1228 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1229 	   char *, buf, size_t, buf_len)
1230 {
1231 	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1232 }
1233 
1234 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1235 	.func		= bpf_sysctl_get_current_value,
1236 	.gpl_only	= false,
1237 	.ret_type	= RET_INTEGER,
1238 	.arg1_type	= ARG_PTR_TO_CTX,
1239 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1240 	.arg3_type	= ARG_CONST_SIZE,
1241 };
1242 
1243 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1244 	   size_t, buf_len)
1245 {
1246 	if (!ctx->write) {
1247 		if (buf && buf_len)
1248 			memset(buf, '\0', buf_len);
1249 		return -EINVAL;
1250 	}
1251 	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1252 }
1253 
1254 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1255 	.func		= bpf_sysctl_get_new_value,
1256 	.gpl_only	= false,
1257 	.ret_type	= RET_INTEGER,
1258 	.arg1_type	= ARG_PTR_TO_CTX,
1259 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1260 	.arg3_type	= ARG_CONST_SIZE,
1261 };
1262 
1263 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1264 	   const char *, buf, size_t, buf_len)
1265 {
1266 	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1267 		return -EINVAL;
1268 
1269 	if (buf_len > PAGE_SIZE - 1)
1270 		return -E2BIG;
1271 
1272 	memcpy(ctx->new_val, buf, buf_len);
1273 	ctx->new_len = buf_len;
1274 	ctx->new_updated = 1;
1275 
1276 	return 0;
1277 }
1278 
1279 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
1280 	.func		= bpf_sysctl_set_new_value,
1281 	.gpl_only	= false,
1282 	.ret_type	= RET_INTEGER,
1283 	.arg1_type	= ARG_PTR_TO_CTX,
1284 	.arg2_type	= ARG_PTR_TO_MEM,
1285 	.arg3_type	= ARG_CONST_SIZE,
1286 };
1287 
1288 static const struct bpf_func_proto *
1289 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1290 {
1291 	switch (func_id) {
1292 	case BPF_FUNC_strtol:
1293 		return &bpf_strtol_proto;
1294 	case BPF_FUNC_strtoul:
1295 		return &bpf_strtoul_proto;
1296 	case BPF_FUNC_sysctl_get_name:
1297 		return &bpf_sysctl_get_name_proto;
1298 	case BPF_FUNC_sysctl_get_current_value:
1299 		return &bpf_sysctl_get_current_value_proto;
1300 	case BPF_FUNC_sysctl_get_new_value:
1301 		return &bpf_sysctl_get_new_value_proto;
1302 	case BPF_FUNC_sysctl_set_new_value:
1303 		return &bpf_sysctl_set_new_value_proto;
1304 	default:
1305 		return cgroup_base_func_proto(func_id, prog);
1306 	}
1307 }
1308 
1309 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
1310 				   const struct bpf_prog *prog,
1311 				   struct bpf_insn_access_aux *info)
1312 {
1313 	const int size_default = sizeof(__u32);
1314 
1315 	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
1316 		return false;
1317 
1318 	switch (off) {
1319 	case bpf_ctx_range(struct bpf_sysctl, write):
1320 		if (type != BPF_READ)
1321 			return false;
1322 		bpf_ctx_record_field_size(info, size_default);
1323 		return bpf_ctx_narrow_access_ok(off, size, size_default);
1324 	case bpf_ctx_range(struct bpf_sysctl, file_pos):
1325 		if (type == BPF_READ) {
1326 			bpf_ctx_record_field_size(info, size_default);
1327 			return bpf_ctx_narrow_access_ok(off, size, size_default);
1328 		} else {
1329 			return size == size_default;
1330 		}
1331 	default:
1332 		return false;
1333 	}
1334 }
1335 
1336 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
1337 				     const struct bpf_insn *si,
1338 				     struct bpf_insn *insn_buf,
1339 				     struct bpf_prog *prog, u32 *target_size)
1340 {
1341 	struct bpf_insn *insn = insn_buf;
1342 	u32 read_size;
1343 
1344 	switch (si->off) {
1345 	case offsetof(struct bpf_sysctl, write):
1346 		*insn++ = BPF_LDX_MEM(
1347 			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
1348 			bpf_target_off(struct bpf_sysctl_kern, write,
1349 				       FIELD_SIZEOF(struct bpf_sysctl_kern,
1350 						    write),
1351 				       target_size));
1352 		break;
1353 	case offsetof(struct bpf_sysctl, file_pos):
1354 		/* ppos is a pointer so it should be accessed via indirect
1355 		 * loads and stores. Also, for stores, an additional temporary
1356 		 * register is used since neither src_reg nor dst_reg can be
1357 		 * overridden.
1358 		 */
1359 		if (type == BPF_WRITE) {
1360 			int treg = BPF_REG_9;
1361 
1362 			if (si->src_reg == treg || si->dst_reg == treg)
1363 				--treg;
1364 			if (si->src_reg == treg || si->dst_reg == treg)
1365 				--treg;
1366 			*insn++ = BPF_STX_MEM(
1367 				BPF_DW, si->dst_reg, treg,
1368 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1369 			*insn++ = BPF_LDX_MEM(
1370 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1371 				treg, si->dst_reg,
1372 				offsetof(struct bpf_sysctl_kern, ppos));
1373 			*insn++ = BPF_STX_MEM(
1374 				BPF_SIZEOF(u32), treg, si->src_reg,
1375 				bpf_ctx_narrow_access_offset(
1376 					0, sizeof(u32), sizeof(loff_t)));
1377 			*insn++ = BPF_LDX_MEM(
1378 				BPF_DW, treg, si->dst_reg,
1379 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1380 		} else {
1381 			*insn++ = BPF_LDX_MEM(
1382 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1383 				si->dst_reg, si->src_reg,
1384 				offsetof(struct bpf_sysctl_kern, ppos));
1385 			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
1386 			*insn++ = BPF_LDX_MEM(
1387 				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
1388 				bpf_ctx_narrow_access_offset(
1389 					0, read_size, sizeof(loff_t)));
1390 		}
1391 		*target_size = sizeof(u32);
1392 		break;
1393 	}
1394 
1395 	return insn - insn_buf;
1396 }
1397 
1398 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
1399 	.get_func_proto		= sysctl_func_proto,
1400 	.is_valid_access	= sysctl_is_valid_access,
1401 	.convert_ctx_access	= sysctl_convert_ctx_access,
1402 };
1403 
1404 const struct bpf_prog_ops cg_sysctl_prog_ops = {
1405 };
1406 
1407 static const struct bpf_func_proto *
1408 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1409 {
1410 	switch (func_id) {
1411 #ifdef CONFIG_NET
1412 	case BPF_FUNC_sk_storage_get:
1413 		return &bpf_sk_storage_get_proto;
1414 	case BPF_FUNC_sk_storage_delete:
1415 		return &bpf_sk_storage_delete_proto;
1416 #endif
1417 #ifdef CONFIG_INET
1418 	case BPF_FUNC_tcp_sock:
1419 		return &bpf_tcp_sock_proto;
1420 #endif
1421 	default:
1422 		return cgroup_base_func_proto(func_id, prog);
1423 	}
1424 }
1425 
1426 static bool cg_sockopt_is_valid_access(int off, int size,
1427 				       enum bpf_access_type type,
1428 				       const struct bpf_prog *prog,
1429 				       struct bpf_insn_access_aux *info)
1430 {
1431 	const int size_default = sizeof(__u32);
1432 
1433 	if (off < 0 || off >= sizeof(struct bpf_sockopt))
1434 		return false;
1435 
1436 	if (off % size != 0)
1437 		return false;
1438 
1439 	if (type == BPF_WRITE) {
1440 		switch (off) {
1441 		case offsetof(struct bpf_sockopt, retval):
1442 			if (size != size_default)
1443 				return false;
1444 			return prog->expected_attach_type ==
1445 				BPF_CGROUP_GETSOCKOPT;
1446 		case offsetof(struct bpf_sockopt, optname):
1447 			/* fallthrough */
1448 		case offsetof(struct bpf_sockopt, level):
1449 			if (size != size_default)
1450 				return false;
1451 			return prog->expected_attach_type ==
1452 				BPF_CGROUP_SETSOCKOPT;
1453 		case offsetof(struct bpf_sockopt, optlen):
1454 			return size == size_default;
1455 		default:
1456 			return false;
1457 		}
1458 	}
1459 
1460 	switch (off) {
1461 	case offsetof(struct bpf_sockopt, sk):
1462 		if (size != sizeof(__u64))
1463 			return false;
1464 		info->reg_type = PTR_TO_SOCKET;
1465 		break;
1466 	case offsetof(struct bpf_sockopt, optval):
1467 		if (size != sizeof(__u64))
1468 			return false;
1469 		info->reg_type = PTR_TO_PACKET;
1470 		break;
1471 	case offsetof(struct bpf_sockopt, optval_end):
1472 		if (size != sizeof(__u64))
1473 			return false;
1474 		info->reg_type = PTR_TO_PACKET_END;
1475 		break;
1476 	case offsetof(struct bpf_sockopt, retval):
1477 		if (size != size_default)
1478 			return false;
1479 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
1480 	default:
1481 		if (size != size_default)
1482 			return false;
1483 		break;
1484 	}
1485 	return true;
1486 }
1487 
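/* Emit one load/store instruction (T is BPF_LDX_MEM or BPF_STX_MEM) for
 * field F of struct bpf_sockopt_kern, sized to match that field.
 */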
1488 #define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
1489 	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
1490 	  si->dst_reg, si->src_reg,					\
1491 	  offsetof(struct bpf_sockopt_kern, F))
1492 
1493 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
1494 					 const struct bpf_insn *si,
1495 					 struct bpf_insn *insn_buf,
1496 					 struct bpf_prog *prog,
1497 					 u32 *target_size)
1498 {
1499 	struct bpf_insn *insn = insn_buf;
1500 
1501 	switch (si->off) {
1502 	case offsetof(struct bpf_sockopt, sk):
1503 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
1504 		break;
1505 	case offsetof(struct bpf_sockopt, level):
1506 		if (type == BPF_WRITE)
1507 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
1508 		else
1509 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
1510 		break;
1511 	case offsetof(struct bpf_sockopt, optname):
1512 		if (type == BPF_WRITE)
1513 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
1514 		else
1515 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
1516 		break;
1517 	case offsetof(struct bpf_sockopt, optlen):
1518 		if (type == BPF_WRITE)
1519 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
1520 		else
1521 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
1522 		break;
1523 	case offsetof(struct bpf_sockopt, retval):
1524 		if (type == BPF_WRITE)
1525 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
1526 		else
1527 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
1528 		break;
1529 	case offsetof(struct bpf_sockopt, optval):
1530 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
1531 		break;
1532 	case offsetof(struct bpf_sockopt, optval_end):
1533 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
1534 		break;
1535 	}
1536 
1537 	return insn - insn_buf;
1538 }
1539 
1540 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
1541 				   bool direct_write,
1542 				   const struct bpf_prog *prog)
1543 {
1544 	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
1545 	 */
1546 	return 0;
1547 }
1548 
1549 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
1550 	.get_func_proto		= cg_sockopt_func_proto,
1551 	.is_valid_access	= cg_sockopt_is_valid_access,
1552 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
1553 	.gen_prologue		= cg_sockopt_get_prologue,
1554 };
1555 
1556 const struct bpf_prog_ops cg_sockopt_prog_ops = {
1557 };
1558