xref: /openbmc/linux/kernel/bpf/cgroup.c (revision 4c46091e)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Functions to manage eBPF programs attached to cgroups
4  *
5  * Copyright (c) 2016 Daniel Mack
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/atomic.h>
10 #include <linux/cgroup.h>
11 #include <linux/filter.h>
12 #include <linux/slab.h>
13 #include <linux/sysctl.h>
14 #include <linux/string.h>
15 #include <linux/bpf.h>
16 #include <linux/bpf-cgroup.h>
17 #include <net/sock.h>
18 #include <net/bpf_sk_storage.h>
19 
20 #include "../cgroup/cgroup-internal.h"
21 
22 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
23 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
24 
25 /* __always_inline is necessary to prevent indirect call through run_prog
26  * function pointer.
27  */
28 static __always_inline int
29 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
30 		      enum cgroup_bpf_attach_type atype,
31 		      const void *ctx, bpf_prog_run_fn run_prog,
32 		      int retval, u32 *ret_flags)
33 {
34 	const struct bpf_prog_array_item *item;
35 	const struct bpf_prog *prog;
36 	const struct bpf_prog_array *array;
37 	struct bpf_run_ctx *old_run_ctx;
38 	struct bpf_cg_run_ctx run_ctx;
39 	u32 func_ret;
40 
41 	run_ctx.retval = retval;
42 	migrate_disable();
43 	rcu_read_lock();
44 	array = rcu_dereference(cgrp->effective[atype]);
45 	item = &array->items[0];
46 	old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
47 	while ((prog = READ_ONCE(item->prog))) {
48 		run_ctx.prog_item = item;
49 		func_ret = run_prog(prog, ctx);
50 		if (ret_flags) {
51 			*(ret_flags) |= (func_ret >> 1);
52 			func_ret &= 1;
53 		}
54 		if (!func_ret && !IS_ERR_VALUE((long)run_ctx.retval))
55 			run_ctx.retval = -EPERM;
56 		item++;
57 	}
58 	bpf_reset_run_ctx(old_run_ctx);
59 	rcu_read_unlock();
60 	migrate_enable();
61 	return run_ctx.retval;
62 }
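
/* Annotation, not part of the original file: for callers that pass a
 * non-NULL ret_flags (such as the egress skb path further below), bit 0 of
 * each program's return value is the allow/deny verdict and the remaining
 * bits are OR'ed into *ret_flags. A minimal sketch of the split done in the
 * loop above:
 *
 *	u32 func_ret = 3;	// program returned "keep packet + request CN"
 *	u32 flags = 0;
 *
 *	flags |= func_ret >> 1;	// flags now carries BPF_RET_SET_CN
 *	func_ret &= 1;		// verdict bit: 1 == allow, 0 == deny
 */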
63 
64 void cgroup_bpf_offline(struct cgroup *cgrp)
65 {
66 	cgroup_get(cgrp);
67 	percpu_ref_kill(&cgrp->bpf.refcnt);
68 }
69 
70 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
71 {
72 	enum bpf_cgroup_storage_type stype;
73 
74 	for_each_cgroup_storage_type(stype)
75 		bpf_cgroup_storage_free(storages[stype]);
76 }
77 
78 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
79 				     struct bpf_cgroup_storage *new_storages[],
80 				     enum bpf_attach_type type,
81 				     struct bpf_prog *prog,
82 				     struct cgroup *cgrp)
83 {
84 	enum bpf_cgroup_storage_type stype;
85 	struct bpf_cgroup_storage_key key;
86 	struct bpf_map *map;
87 
88 	key.cgroup_inode_id = cgroup_id(cgrp);
89 	key.attach_type = type;
90 
91 	for_each_cgroup_storage_type(stype) {
92 		map = prog->aux->cgroup_storage[stype];
93 		if (!map)
94 			continue;
95 
96 		storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
97 		if (storages[stype])
98 			continue;
99 
100 		storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
101 		if (IS_ERR(storages[stype])) {
102 			bpf_cgroup_storages_free(new_storages);
103 			return -ENOMEM;
104 		}
105 
106 		new_storages[stype] = storages[stype];
107 	}
108 
109 	return 0;
110 }
111 
112 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
113 				       struct bpf_cgroup_storage *src[])
114 {
115 	enum bpf_cgroup_storage_type stype;
116 
117 	for_each_cgroup_storage_type(stype)
118 		dst[stype] = src[stype];
119 }
120 
121 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
122 				     struct cgroup *cgrp,
123 				     enum bpf_attach_type attach_type)
124 {
125 	enum bpf_cgroup_storage_type stype;
126 
127 	for_each_cgroup_storage_type(stype)
128 		bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
129 }
130 
131 /* Called when bpf_cgroup_link is auto-detached from dying cgroup.
132  * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
133  * doesn't free link memory, which will eventually be done by bpf_link's
134  * release() callback, when its last FD is closed.
135  */
136 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
137 {
138 	cgroup_put(link->cgroup);
139 	link->cgroup = NULL;
140 }
141 
142 /**
143  * cgroup_bpf_release() - put references of all bpf programs and
144  *                        release all cgroup bpf data
145  * @work: work structure embedded into the cgroup to modify
146  */
147 static void cgroup_bpf_release(struct work_struct *work)
148 {
149 	struct cgroup *p, *cgrp = container_of(work, struct cgroup,
150 					       bpf.release_work);
151 	struct bpf_prog_array *old_array;
152 	struct list_head *storages = &cgrp->bpf.storages;
153 	struct bpf_cgroup_storage *storage, *stmp;
154 
155 	unsigned int atype;
156 
157 	mutex_lock(&cgroup_mutex);
158 
159 	for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
160 		struct list_head *progs = &cgrp->bpf.progs[atype];
161 		struct bpf_prog_list *pl, *pltmp;
162 
163 		list_for_each_entry_safe(pl, pltmp, progs, node) {
164 			list_del(&pl->node);
165 			if (pl->prog)
166 				bpf_prog_put(pl->prog);
167 			if (pl->link)
168 				bpf_cgroup_link_auto_detach(pl->link);
169 			kfree(pl);
170 			static_branch_dec(&cgroup_bpf_enabled_key[atype]);
171 		}
172 		old_array = rcu_dereference_protected(
173 				cgrp->bpf.effective[atype],
174 				lockdep_is_held(&cgroup_mutex));
175 		bpf_prog_array_free(old_array);
176 	}
177 
178 	list_for_each_entry_safe(storage, stmp, storages, list_cg) {
179 		bpf_cgroup_storage_unlink(storage);
180 		bpf_cgroup_storage_free(storage);
181 	}
182 
183 	mutex_unlock(&cgroup_mutex);
184 
185 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
186 		cgroup_bpf_put(p);
187 
188 	percpu_ref_exit(&cgrp->bpf.refcnt);
189 	cgroup_put(cgrp);
190 }
191 
192 /**
193  * cgroup_bpf_release_fn() - callback used to schedule releasing
194  *                           of bpf cgroup data
195  * @ref: percpu ref counter structure
196  */
197 static void cgroup_bpf_release_fn(struct percpu_ref *ref)
198 {
199 	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
200 
201 	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
202 	queue_work(system_wq, &cgrp->bpf.release_work);
203 }
204 
205 /* Get underlying bpf_prog of bpf_prog_list entry, regardless of whether it's
206  * through a link or a direct prog.
207  */
208 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
209 {
210 	if (pl->prog)
211 		return pl->prog;
212 	if (pl->link)
213 		return pl->link->link.prog;
214 	return NULL;
215 }
216 
217 /* count number of elements in the list.
218  * it's slow but the list cannot be long
219  */
220 static u32 prog_list_length(struct list_head *head)
221 {
222 	struct bpf_prog_list *pl;
223 	u32 cnt = 0;
224 
225 	list_for_each_entry(pl, head, node) {
226 		if (!prog_list_prog(pl))
227 			continue;
228 		cnt++;
229 	}
230 	return cnt;
231 }
232 
233 /* if parent has non-overridable prog attached,
234  * disallow attaching new programs to the descendant cgroup.
235  * if parent has overridable or multi-prog, allow attaching
236  */
237 static bool hierarchy_allows_attach(struct cgroup *cgrp,
238 				    enum cgroup_bpf_attach_type atype)
239 {
240 	struct cgroup *p;
241 
242 	p = cgroup_parent(cgrp);
243 	if (!p)
244 		return true;
245 	do {
246 		u32 flags = p->bpf.flags[atype];
247 		u32 cnt;
248 
249 		if (flags & BPF_F_ALLOW_MULTI)
250 			return true;
251 		cnt = prog_list_length(&p->bpf.progs[atype]);
252 		WARN_ON_ONCE(cnt > 1);
253 		if (cnt == 1)
254 			return !!(flags & BPF_F_ALLOW_OVERRIDE);
255 		p = cgroup_parent(p);
256 	} while (p);
257 	return true;
258 }
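
/* Hedged userspace sketch, not part of the original file, of the rules
 * enforced above; it uses libbpf's bpf_prog_attach() and assumes prog_fd,
 * cg_parent_fd and cg_child_fd are already-opened descriptors:
 *
 *	// parent attached with no flags: attaching in the child fails (-EPERM)
 *	bpf_prog_attach(prog_fd, cg_parent_fd, BPF_CGROUP_INET_EGRESS, 0);
 *	bpf_prog_attach(prog_fd, cg_child_fd, BPF_CGROUP_INET_EGRESS, 0);
 *
 *	// parent attached with BPF_F_ALLOW_OVERRIDE or BPF_F_ALLOW_MULTI:
 *	// the child may attach its own program
 *	bpf_prog_attach(prog_fd, cg_parent_fd, BPF_CGROUP_INET_EGRESS,
 *			BPF_F_ALLOW_MULTI);
 *	bpf_prog_attach(prog_fd, cg_child_fd, BPF_CGROUP_INET_EGRESS,
 *			BPF_F_ALLOW_MULTI);
 */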
259 
260 /* compute a chain of effective programs for a given cgroup:
261  * start from the list of programs in this cgroup and add
262  * all parent programs.
263  * Note that a parent's F_ALLOW_OVERRIDE-type program yields
264  * to programs in this cgroup
265  */
266 static int compute_effective_progs(struct cgroup *cgrp,
267 				   enum cgroup_bpf_attach_type atype,
268 				   struct bpf_prog_array **array)
269 {
270 	struct bpf_prog_array_item *item;
271 	struct bpf_prog_array *progs;
272 	struct bpf_prog_list *pl;
273 	struct cgroup *p = cgrp;
274 	int cnt = 0;
275 
276 	/* count number of effective programs by walking parents */
277 	do {
278 		if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
279 			cnt += prog_list_length(&p->bpf.progs[atype]);
280 		p = cgroup_parent(p);
281 	} while (p);
282 
283 	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
284 	if (!progs)
285 		return -ENOMEM;
286 
287 	/* populate the array with effective progs */
288 	cnt = 0;
289 	p = cgrp;
290 	do {
291 		if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
292 			continue;
293 
294 		list_for_each_entry(pl, &p->bpf.progs[atype], node) {
295 			if (!prog_list_prog(pl))
296 				continue;
297 
298 			item = &progs->items[cnt];
299 			item->prog = prog_list_prog(pl);
300 			bpf_cgroup_storages_assign(item->cgroup_storage,
301 						   pl->storage);
302 			cnt++;
303 		}
304 	} while ((p = cgroup_parent(p)));
305 
306 	*array = progs;
307 	return 0;
308 }
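
/* Annotation, not part of the original file: the array is filled starting
 * from @cgrp itself and then each ancestor, and bpf_prog_run_array_cg()
 * walks it from index 0, so for a hierarchy root -> A -> B (all attached
 * with BPF_F_ALLOW_MULTI) B's effective array runs B's own programs first,
 * then A's, then root's.
 */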
309 
310 static void activate_effective_progs(struct cgroup *cgrp,
311 				     enum cgroup_bpf_attach_type atype,
312 				     struct bpf_prog_array *old_array)
313 {
314 	old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
315 					lockdep_is_held(&cgroup_mutex));
316 	/* free prog array after grace period, since __cgroup_bpf_run_*()
317 	 * might be still walking the array
318 	 */
319 	bpf_prog_array_free(old_array);
320 }
321 
322 /**
323  * cgroup_bpf_inherit() - inherit effective programs from parent
324  * @cgrp: the cgroup to modify
325  */
326 int cgroup_bpf_inherit(struct cgroup *cgrp)
327 {
328 /* has to use macro instead of const int, since compiler thinks
329  * that array below is variable length
330  */
331 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
332 	struct bpf_prog_array *arrays[NR] = {};
333 	struct cgroup *p;
334 	int ret, i;
335 
336 	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
337 			      GFP_KERNEL);
338 	if (ret)
339 		return ret;
340 
341 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
342 		cgroup_bpf_get(p);
343 
344 	for (i = 0; i < NR; i++)
345 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
346 
347 	INIT_LIST_HEAD(&cgrp->bpf.storages);
348 
349 	for (i = 0; i < NR; i++)
350 		if (compute_effective_progs(cgrp, i, &arrays[i]))
351 			goto cleanup;
352 
353 	for (i = 0; i < NR; i++)
354 		activate_effective_progs(cgrp, i, arrays[i]);
355 
356 	return 0;
357 cleanup:
358 	for (i = 0; i < NR; i++)
359 		bpf_prog_array_free(arrays[i]);
360 
361 	for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
362 		cgroup_bpf_put(p);
363 
364 	percpu_ref_exit(&cgrp->bpf.refcnt);
365 
366 	return -ENOMEM;
367 }
368 
369 static int update_effective_progs(struct cgroup *cgrp,
370 				  enum cgroup_bpf_attach_type atype)
371 {
372 	struct cgroup_subsys_state *css;
373 	int err;
374 
375 	/* allocate and recompute effective prog arrays */
376 	css_for_each_descendant_pre(css, &cgrp->self) {
377 		struct cgroup *desc = container_of(css, struct cgroup, self);
378 
379 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
380 			continue;
381 
382 		err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
383 		if (err)
384 			goto cleanup;
385 	}
386 
387 	/* all allocations were successful. Activate all prog arrays */
388 	css_for_each_descendant_pre(css, &cgrp->self) {
389 		struct cgroup *desc = container_of(css, struct cgroup, self);
390 
391 		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
392 			if (unlikely(desc->bpf.inactive)) {
393 				bpf_prog_array_free(desc->bpf.inactive);
394 				desc->bpf.inactive = NULL;
395 			}
396 			continue;
397 		}
398 
399 		activate_effective_progs(desc, atype, desc->bpf.inactive);
400 		desc->bpf.inactive = NULL;
401 	}
402 
403 	return 0;
404 
405 cleanup:
406 	/* oom while computing effective. Free all computed effective arrays
407 	 * since they were not activated
408 	 */
409 	css_for_each_descendant_pre(css, &cgrp->self) {
410 		struct cgroup *desc = container_of(css, struct cgroup, self);
411 
412 		bpf_prog_array_free(desc->bpf.inactive);
413 		desc->bpf.inactive = NULL;
414 	}
415 
416 	return err;
417 }
418 
419 #define BPF_CGROUP_MAX_PROGS 64
420 
421 static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
422 					       struct bpf_prog *prog,
423 					       struct bpf_cgroup_link *link,
424 					       struct bpf_prog *replace_prog,
425 					       bool allow_multi)
426 {
427 	struct bpf_prog_list *pl;
428 
429 	/* single-attach case */
430 	if (!allow_multi) {
431 		if (list_empty(progs))
432 			return NULL;
433 		return list_first_entry(progs, typeof(*pl), node);
434 	}
435 
436 	list_for_each_entry(pl, progs, node) {
437 		if (prog && pl->prog == prog && prog != replace_prog)
438 			/* disallow attaching the same prog twice */
439 			return ERR_PTR(-EINVAL);
440 		if (link && pl->link == link)
441 			/* disallow attaching the same link twice */
442 			return ERR_PTR(-EINVAL);
443 	}
444 
445 	/* direct prog multi-attach w/ replacement case */
446 	if (replace_prog) {
447 		list_for_each_entry(pl, progs, node) {
448 			if (pl->prog == replace_prog)
449 				/* a match found */
450 				return pl;
451 		}
452 		/* prog to replace not found for cgroup */
453 		return ERR_PTR(-ENOENT);
454 	}
455 
456 	return NULL;
457 }
458 
459 /**
460  * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
461  *                         propagate the change to descendants
462  * @cgrp: The cgroup which descendants to traverse
463  * @prog: A program to attach
464  * @link: A link to attach
465  * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
466  * @type: Type of attach operation
467  * @flags: Option flags
468  *
469  * Exactly one of @prog or @link can be non-null.
470  * Must be called with cgroup_mutex held.
471  */
472 static int __cgroup_bpf_attach(struct cgroup *cgrp,
473 			       struct bpf_prog *prog, struct bpf_prog *replace_prog,
474 			       struct bpf_cgroup_link *link,
475 			       enum bpf_attach_type type, u32 flags)
476 {
477 	u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
478 	struct bpf_prog *old_prog = NULL;
479 	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
480 	struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
481 	enum cgroup_bpf_attach_type atype;
482 	struct bpf_prog_list *pl;
483 	struct list_head *progs;
484 	int err;
485 
486 	if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
487 	    ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
488 		/* invalid combination */
489 		return -EINVAL;
490 	if (link && (prog || replace_prog))
491 		/* only either link or prog/replace_prog can be specified */
492 		return -EINVAL;
493 	if (!!replace_prog != !!(flags & BPF_F_REPLACE))
494 		/* replace_prog implies BPF_F_REPLACE, and vice versa */
495 		return -EINVAL;
496 
497 	atype = to_cgroup_bpf_attach_type(type);
498 	if (atype < 0)
499 		return -EINVAL;
500 
501 	progs = &cgrp->bpf.progs[atype];
502 
503 	if (!hierarchy_allows_attach(cgrp, atype))
504 		return -EPERM;
505 
506 	if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
507 		/* Disallow attaching non-overridable on top
508 		 * of existing overridable in this cgroup.
509 		 * Disallow attaching multi-prog if overridable or none
510 		 */
511 		return -EPERM;
512 
513 	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
514 		return -E2BIG;
515 
516 	pl = find_attach_entry(progs, prog, link, replace_prog,
517 			       flags & BPF_F_ALLOW_MULTI);
518 	if (IS_ERR(pl))
519 		return PTR_ERR(pl);
520 
521 	if (bpf_cgroup_storages_alloc(storage, new_storage, type,
522 				      prog ? : link->link.prog, cgrp))
523 		return -ENOMEM;
524 
525 	if (pl) {
526 		old_prog = pl->prog;
527 	} else {
528 		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
529 		if (!pl) {
530 			bpf_cgroup_storages_free(new_storage);
531 			return -ENOMEM;
532 		}
533 		list_add_tail(&pl->node, progs);
534 	}
535 
536 	pl->prog = prog;
537 	pl->link = link;
538 	bpf_cgroup_storages_assign(pl->storage, storage);
539 	cgrp->bpf.flags[atype] = saved_flags;
540 
541 	err = update_effective_progs(cgrp, atype);
542 	if (err)
543 		goto cleanup;
544 
545 	if (old_prog)
546 		bpf_prog_put(old_prog);
547 	else
548 		static_branch_inc(&cgroup_bpf_enabled_key[atype]);
549 	bpf_cgroup_storages_link(new_storage, cgrp, type);
550 	return 0;
551 
552 cleanup:
553 	if (old_prog) {
554 		pl->prog = old_prog;
555 		pl->link = NULL;
556 	}
557 	bpf_cgroup_storages_free(new_storage);
558 	if (!old_prog) {
559 		list_del(&pl->node);
560 		kfree(pl);
561 	}
562 	return err;
563 }
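
/* Hedged userspace sketch, not part of the original file: the BPF_F_REPLACE
 * path above is reached through the raw bpf(2) syscall (or the equivalent
 * libbpf wrappers); cg_fd, new_prog_fd and old_prog_fd are assumed,
 * already-opened descriptors:
 *
 *	union bpf_attr attr = {};
 *
 *	attr.target_fd      = cg_fd;
 *	attr.attach_bpf_fd  = new_prog_fd;
 *	attr.replace_bpf_fd = old_prog_fd;
 *	attr.attach_type    = BPF_CGROUP_INET_EGRESS;
 *	attr.attach_flags   = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
 *	syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
 */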
564 
565 static int cgroup_bpf_attach(struct cgroup *cgrp,
566 			     struct bpf_prog *prog, struct bpf_prog *replace_prog,
567 			     struct bpf_cgroup_link *link,
568 			     enum bpf_attach_type type,
569 			     u32 flags)
570 {
571 	int ret;
572 
573 	mutex_lock(&cgroup_mutex);
574 	ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
575 	mutex_unlock(&cgroup_mutex);
576 	return ret;
577 }
578 
579 /* Swap updated BPF program for given link in effective program arrays across
580  * all descendant cgroups. This function is guaranteed to succeed.
581  */
582 static void replace_effective_prog(struct cgroup *cgrp,
583 				   enum cgroup_bpf_attach_type atype,
584 				   struct bpf_cgroup_link *link)
585 {
586 	struct bpf_prog_array_item *item;
587 	struct cgroup_subsys_state *css;
588 	struct bpf_prog_array *progs;
589 	struct bpf_prog_list *pl;
590 	struct list_head *head;
591 	struct cgroup *cg;
592 	int pos;
593 
594 	css_for_each_descendant_pre(css, &cgrp->self) {
595 		struct cgroup *desc = container_of(css, struct cgroup, self);
596 
597 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
598 			continue;
599 
600 		/* find position of link in effective progs array */
601 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
602 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
603 				continue;
604 
605 			head = &cg->bpf.progs[atype];
606 			list_for_each_entry(pl, head, node) {
607 				if (!prog_list_prog(pl))
608 					continue;
609 				if (pl->link == link)
610 					goto found;
611 				pos++;
612 			}
613 		}
614 found:
615 		BUG_ON(!cg);
616 		progs = rcu_dereference_protected(
617 				desc->bpf.effective[atype],
618 				lockdep_is_held(&cgroup_mutex));
619 		item = &progs->items[pos];
620 		WRITE_ONCE(item->prog, link->link.prog);
621 	}
622 }
623 
624 /**
625  * __cgroup_bpf_replace() - Replace link's program and propagate the change
626  *                          to descendants
627  * @cgrp: The cgroup which descendants to traverse
628  * @link: A link for which to replace BPF program
629  * @new_prog: A new program to replace the link's current program with
630  *
631  * Must be called with cgroup_mutex held.
632  */
633 static int __cgroup_bpf_replace(struct cgroup *cgrp,
634 				struct bpf_cgroup_link *link,
635 				struct bpf_prog *new_prog)
636 {
637 	enum cgroup_bpf_attach_type atype;
638 	struct bpf_prog *old_prog;
639 	struct bpf_prog_list *pl;
640 	struct list_head *progs;
641 	bool found = false;
642 
643 	atype = to_cgroup_bpf_attach_type(link->type);
644 	if (atype < 0)
645 		return -EINVAL;
646 
647 	progs = &cgrp->bpf.progs[atype];
648 
649 	if (link->link.prog->type != new_prog->type)
650 		return -EINVAL;
651 
652 	list_for_each_entry(pl, progs, node) {
653 		if (pl->link == link) {
654 			found = true;
655 			break;
656 		}
657 	}
658 	if (!found)
659 		return -ENOENT;
660 
661 	old_prog = xchg(&link->link.prog, new_prog);
662 	replace_effective_prog(cgrp, atype, link);
663 	bpf_prog_put(old_prog);
664 	return 0;
665 }
666 
667 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
668 			      struct bpf_prog *old_prog)
669 {
670 	struct bpf_cgroup_link *cg_link;
671 	int ret;
672 
673 	cg_link = container_of(link, struct bpf_cgroup_link, link);
674 
675 	mutex_lock(&cgroup_mutex);
676 	/* link might have been auto-released by dying cgroup, so fail */
677 	if (!cg_link->cgroup) {
678 		ret = -ENOLINK;
679 		goto out_unlock;
680 	}
681 	if (old_prog && link->prog != old_prog) {
682 		ret = -EPERM;
683 		goto out_unlock;
684 	}
685 	ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
686 out_unlock:
687 	mutex_unlock(&cgroup_mutex);
688 	return ret;
689 }
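
/* Hedged userspace sketch, not part of the original file: this update path
 * is driven by BPF_LINK_UPDATE, e.g. via libbpf; link_fd and new_prog_fd are
 * assumed descriptors. Passing an old program FD through the update opts
 * maps to the old_prog check above:
 *
 *	bpf_link_update(link_fd, new_prog_fd, NULL);
 */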
690 
691 static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
692 					       struct bpf_prog *prog,
693 					       struct bpf_cgroup_link *link,
694 					       bool allow_multi)
695 {
696 	struct bpf_prog_list *pl;
697 
698 	if (!allow_multi) {
699 		if (list_empty(progs))
700 			/* report error when trying to detach and nothing is attached */
701 			return ERR_PTR(-ENOENT);
702 
703 		/* to maintain backward compatibility NONE and OVERRIDE cgroups
704 		 * allow detaching with invalid FD (prog==NULL) in legacy mode
705 		 */
706 		return list_first_entry(progs, typeof(*pl), node);
707 	}
708 
709 	if (!prog && !link)
710 		/* to detach MULTI prog the user has to specify valid FD
711 		 * of the program or link to be detached
712 		 */
713 		return ERR_PTR(-EINVAL);
714 
715 	/* find the prog or link and detach it */
716 	list_for_each_entry(pl, progs, node) {
717 		if (pl->prog == prog && pl->link == link)
718 			return pl;
719 	}
720 	return ERR_PTR(-ENOENT);
721 }
722 
723 /**
724  * purge_effective_progs() - After compute_effective_progs fails to alloc new
725  *                           cgrp->bpf.inactive table we can recover by
726  *                           recomputing the array in place.
727  *
728  * @cgrp: The cgroup which descendants to traverse
729  * @prog: A program to detach or NULL
730  * @link: A link to detach or NULL
731  * @atype: Type of detach operation
732  */
733 static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
734 				  struct bpf_cgroup_link *link,
735 				  enum cgroup_bpf_attach_type atype)
736 {
737 	struct cgroup_subsys_state *css;
738 	struct bpf_prog_array *progs;
739 	struct bpf_prog_list *pl;
740 	struct list_head *head;
741 	struct cgroup *cg;
742 	int pos;
743 
744 	/* recompute effective prog array in place */
745 	css_for_each_descendant_pre(css, &cgrp->self) {
746 		struct cgroup *desc = container_of(css, struct cgroup, self);
747 
748 		if (percpu_ref_is_zero(&desc->bpf.refcnt))
749 			continue;
750 
751 		/* find position of link or prog in effective progs array */
752 		for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
753 			if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
754 				continue;
755 
756 			head = &cg->bpf.progs[atype];
757 			list_for_each_entry(pl, head, node) {
758 				if (!prog_list_prog(pl))
759 					continue;
760 				if (pl->prog == prog && pl->link == link)
761 					goto found;
762 				pos++;
763 			}
764 		}
765 found:
766 		BUG_ON(!cg);
767 		progs = rcu_dereference_protected(
768 				desc->bpf.effective[atype],
769 				lockdep_is_held(&cgroup_mutex));
770 
771 		/* Remove the program from the array */
772 		WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
773 			  "Failed to purge a prog from array at index %d", pos);
774 	}
775 }
776 
777 /**
778  * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
779  *                         propagate the change to descendants
780  * @cgrp: The cgroup which descendants to traverse
781  * @prog: A program to detach or NULL
782  * @link: A link to detach or NULL
783  * @type: Type of detach operation
784  *
785  * At most one of @prog or @link can be non-NULL.
786  * Must be called with cgroup_mutex held.
787  */
788 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
789 			       struct bpf_cgroup_link *link, enum bpf_attach_type type)
790 {
791 	enum cgroup_bpf_attach_type atype;
792 	struct bpf_prog *old_prog;
793 	struct bpf_prog_list *pl;
794 	struct list_head *progs;
795 	u32 flags;
796 
797 	atype = to_cgroup_bpf_attach_type(type);
798 	if (atype < 0)
799 		return -EINVAL;
800 
801 	progs = &cgrp->bpf.progs[atype];
802 	flags = cgrp->bpf.flags[atype];
803 
804 	if (prog && link)
805 		/* only one of prog or link can be specified */
806 		return -EINVAL;
807 
808 	pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
809 	if (IS_ERR(pl))
810 		return PTR_ERR(pl);
811 
812 	/* mark it deleted, so it's ignored while recomputing effective */
813 	old_prog = pl->prog;
814 	pl->prog = NULL;
815 	pl->link = NULL;
816 
817 	if (update_effective_progs(cgrp, atype)) {
818 		/* if updating the effective array failed, restore the entry and purge it in place */
819 		pl->prog = old_prog;
820 		pl->link = link;
821 		purge_effective_progs(cgrp, old_prog, link, atype);
822 	}
823 
824 	/* now can actually delete it from this cgroup list */
825 	list_del(&pl->node);
826 	kfree(pl);
827 	if (list_empty(progs))
828 		/* last program was detached, reset flags to zero */
829 		cgrp->bpf.flags[atype] = 0;
830 	if (old_prog)
831 		bpf_prog_put(old_prog);
832 	static_branch_dec(&cgroup_bpf_enabled_key[atype]);
833 	return 0;
834 }
835 
836 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
837 			     enum bpf_attach_type type)
838 {
839 	int ret;
840 
841 	mutex_lock(&cgroup_mutex);
842 	ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
843 	mutex_unlock(&cgroup_mutex);
844 	return ret;
845 }
846 
847 /* Must be called with cgroup_mutex held to avoid races. */
848 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
849 			      union bpf_attr __user *uattr)
850 {
851 	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
852 	enum bpf_attach_type type = attr->query.attach_type;
853 	enum cgroup_bpf_attach_type atype;
854 	struct bpf_prog_array *effective;
855 	struct list_head *progs;
856 	struct bpf_prog *prog;
857 	int cnt, ret = 0, i;
858 	u32 flags;
859 
860 	atype = to_cgroup_bpf_attach_type(type);
861 	if (atype < 0)
862 		return -EINVAL;
863 
864 	progs = &cgrp->bpf.progs[atype];
865 	flags = cgrp->bpf.flags[atype];
866 
867 	effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
868 					      lockdep_is_held(&cgroup_mutex));
869 
870 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
871 		cnt = bpf_prog_array_length(effective);
872 	else
873 		cnt = prog_list_length(progs);
874 
875 	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
876 		return -EFAULT;
877 	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
878 		return -EFAULT;
879 	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
880 		/* return early if user requested only program count + flags */
881 		return 0;
882 	if (attr->query.prog_cnt < cnt) {
883 		cnt = attr->query.prog_cnt;
884 		ret = -ENOSPC;
885 	}
886 
887 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
888 		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
889 	} else {
890 		struct bpf_prog_list *pl;
891 		u32 id;
892 
893 		i = 0;
894 		list_for_each_entry(pl, progs, node) {
895 			prog = prog_list_prog(pl);
896 			id = prog->aux->id;
897 			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
898 				return -EFAULT;
899 			if (++i == cnt)
900 				break;
901 		}
902 	}
903 	return ret;
904 }
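
/* Hedged userspace sketch, not part of the original file: the query above is
 * typically issued through libbpf's bpf_prog_query(); cg_fd is an assumed
 * cgroup descriptor:
 *
 *	__u32 prog_ids[64], prog_cnt = 64, attach_flags = 0;
 *
 *	bpf_prog_query(cg_fd, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
 *		       &attach_flags, prog_ids, &prog_cnt);
 *	// prog_cnt now holds the number of effective programs
 */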
905 
906 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
907 			    union bpf_attr __user *uattr)
908 {
909 	int ret;
910 
911 	mutex_lock(&cgroup_mutex);
912 	ret = __cgroup_bpf_query(cgrp, attr, uattr);
913 	mutex_unlock(&cgroup_mutex);
914 	return ret;
915 }
916 
917 int cgroup_bpf_prog_attach(const union bpf_attr *attr,
918 			   enum bpf_prog_type ptype, struct bpf_prog *prog)
919 {
920 	struct bpf_prog *replace_prog = NULL;
921 	struct cgroup *cgrp;
922 	int ret;
923 
924 	cgrp = cgroup_get_from_fd(attr->target_fd);
925 	if (IS_ERR(cgrp))
926 		return PTR_ERR(cgrp);
927 
928 	if ((attr->attach_flags & BPF_F_ALLOW_MULTI) &&
929 	    (attr->attach_flags & BPF_F_REPLACE)) {
930 		replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype);
931 		if (IS_ERR(replace_prog)) {
932 			cgroup_put(cgrp);
933 			return PTR_ERR(replace_prog);
934 		}
935 	}
936 
937 	ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
938 				attr->attach_type, attr->attach_flags);
939 
940 	if (replace_prog)
941 		bpf_prog_put(replace_prog);
942 	cgroup_put(cgrp);
943 	return ret;
944 }
945 
946 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
947 {
948 	struct bpf_prog *prog;
949 	struct cgroup *cgrp;
950 	int ret;
951 
952 	cgrp = cgroup_get_from_fd(attr->target_fd);
953 	if (IS_ERR(cgrp))
954 		return PTR_ERR(cgrp);
955 
956 	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
957 	if (IS_ERR(prog))
958 		prog = NULL;
959 
960 	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
961 	if (prog)
962 		bpf_prog_put(prog);
963 
964 	cgroup_put(cgrp);
965 	return ret;
966 }
967 
968 static void bpf_cgroup_link_release(struct bpf_link *link)
969 {
970 	struct bpf_cgroup_link *cg_link =
971 		container_of(link, struct bpf_cgroup_link, link);
972 	struct cgroup *cg;
973 
974 	/* link might have been auto-detached by dying cgroup already,
975 	 * in that case our work is done here
976 	 */
977 	if (!cg_link->cgroup)
978 		return;
979 
980 	mutex_lock(&cgroup_mutex);
981 
982 	/* re-check cgroup under lock again */
983 	if (!cg_link->cgroup) {
984 		mutex_unlock(&cgroup_mutex);
985 		return;
986 	}
987 
988 	WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
989 				    cg_link->type));
990 
991 	cg = cg_link->cgroup;
992 	cg_link->cgroup = NULL;
993 
994 	mutex_unlock(&cgroup_mutex);
995 
996 	cgroup_put(cg);
997 }
998 
999 static void bpf_cgroup_link_dealloc(struct bpf_link *link)
1000 {
1001 	struct bpf_cgroup_link *cg_link =
1002 		container_of(link, struct bpf_cgroup_link, link);
1003 
1004 	kfree(cg_link);
1005 }
1006 
1007 static int bpf_cgroup_link_detach(struct bpf_link *link)
1008 {
1009 	bpf_cgroup_link_release(link);
1010 
1011 	return 0;
1012 }
1013 
1014 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
1015 					struct seq_file *seq)
1016 {
1017 	struct bpf_cgroup_link *cg_link =
1018 		container_of(link, struct bpf_cgroup_link, link);
1019 	u64 cg_id = 0;
1020 
1021 	mutex_lock(&cgroup_mutex);
1022 	if (cg_link->cgroup)
1023 		cg_id = cgroup_id(cg_link->cgroup);
1024 	mutex_unlock(&cgroup_mutex);
1025 
1026 	seq_printf(seq,
1027 		   "cgroup_id:\t%llu\n"
1028 		   "attach_type:\t%d\n",
1029 		   cg_id,
1030 		   cg_link->type);
1031 }
1032 
1033 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
1034 					  struct bpf_link_info *info)
1035 {
1036 	struct bpf_cgroup_link *cg_link =
1037 		container_of(link, struct bpf_cgroup_link, link);
1038 	u64 cg_id = 0;
1039 
1040 	mutex_lock(&cgroup_mutex);
1041 	if (cg_link->cgroup)
1042 		cg_id = cgroup_id(cg_link->cgroup);
1043 	mutex_unlock(&cgroup_mutex);
1044 
1045 	info->cgroup.cgroup_id = cg_id;
1046 	info->cgroup.attach_type = cg_link->type;
1047 	return 0;
1048 }
1049 
1050 static const struct bpf_link_ops bpf_cgroup_link_lops = {
1051 	.release = bpf_cgroup_link_release,
1052 	.dealloc = bpf_cgroup_link_dealloc,
1053 	.detach = bpf_cgroup_link_detach,
1054 	.update_prog = cgroup_bpf_replace,
1055 	.show_fdinfo = bpf_cgroup_link_show_fdinfo,
1056 	.fill_link_info = bpf_cgroup_link_fill_link_info,
1057 };
1058 
1059 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
1060 {
1061 	struct bpf_link_primer link_primer;
1062 	struct bpf_cgroup_link *link;
1063 	struct cgroup *cgrp;
1064 	int err;
1065 
1066 	if (attr->link_create.flags)
1067 		return -EINVAL;
1068 
1069 	cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
1070 	if (IS_ERR(cgrp))
1071 		return PTR_ERR(cgrp);
1072 
1073 	link = kzalloc(sizeof(*link), GFP_USER);
1074 	if (!link) {
1075 		err = -ENOMEM;
1076 		goto out_put_cgroup;
1077 	}
1078 	bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
1079 		      prog);
1080 	link->cgroup = cgrp;
1081 	link->type = attr->link_create.attach_type;
1082 
1083 	err = bpf_link_prime(&link->link, &link_primer);
1084 	if (err) {
1085 		kfree(link);
1086 		goto out_put_cgroup;
1087 	}
1088 
1089 	err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
1090 				link->type, BPF_F_ALLOW_MULTI);
1091 	if (err) {
1092 		bpf_link_cleanup(&link_primer);
1093 		goto out_put_cgroup;
1094 	}
1095 
1096 	return bpf_link_settle(&link_primer);
1097 
1098 out_put_cgroup:
1099 	cgroup_put(cgrp);
1100 	return err;
1101 }
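
/* Hedged userspace sketch, not part of the original file: this attach path
 * is reached through BPF_LINK_CREATE, e.g. libbpf's bpf_link_create();
 * prog_fd and cg_fd are assumed descriptors:
 *
 *	int link_fd = bpf_link_create(prog_fd, cg_fd,
 *				      BPF_CGROUP_INET_INGRESS, NULL);
 *	// closing the last reference to the link detaches the program
 */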
1102 
1103 int cgroup_bpf_prog_query(const union bpf_attr *attr,
1104 			  union bpf_attr __user *uattr)
1105 {
1106 	struct cgroup *cgrp;
1107 	int ret;
1108 
1109 	cgrp = cgroup_get_from_fd(attr->query.target_fd);
1110 	if (IS_ERR(cgrp))
1111 		return PTR_ERR(cgrp);
1112 
1113 	ret = cgroup_bpf_query(cgrp, attr, uattr);
1114 
1115 	cgroup_put(cgrp);
1116 	return ret;
1117 }
1118 
1119 /**
1120  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
1121  * @sk: The socket sending or receiving traffic
1122  * @skb: The skb that is being sent or received
1123  * @atype: The type of program to be executed
1124  *
1125  * If no socket is passed, or the socket is not of type INET or INET6,
1126  * this function does nothing and returns 0.
1127  *
1128  * The program type passed in via @atype must be suitable for network
1129  * filtering. No further check is performed to assert that.
1130  *
1131  * For egress packets, this function can return:
1132  *   NET_XMIT_SUCCESS    (0)	- continue with packet output
1133  *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
1134  *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
1135  *				  to call cwr
1136  *   -err			- drop packet
1137  *
1138  * For ingress packets, this function will return -EPERM if any
1139  * attached program was found and if it returned != 1 during execution.
1140  * Otherwise 0 is returned.
1141  */
1142 int __cgroup_bpf_run_filter_skb(struct sock *sk,
1143 				struct sk_buff *skb,
1144 				enum cgroup_bpf_attach_type atype)
1145 {
1146 	unsigned int offset = skb->data - skb_network_header(skb);
1147 	struct sock *save_sk;
1148 	void *saved_data_end;
1149 	struct cgroup *cgrp;
1150 	int ret;
1151 
1152 	if (!sk || !sk_fullsock(sk))
1153 		return 0;
1154 
1155 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1156 		return 0;
1157 
1158 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1159 	save_sk = skb->sk;
1160 	skb->sk = sk;
1161 	__skb_push(skb, offset);
1162 
1163 	/* compute pointers for the bpf prog */
1164 	bpf_compute_and_save_data_end(skb, &saved_data_end);
1165 
1166 	if (atype == CGROUP_INET_EGRESS) {
1167 		u32 flags = 0;
1168 		bool cn;
1169 
1170 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb,
1171 					    __bpf_prog_run_save_cb, 0, &flags);
1172 
1173 		/* Return values of CGROUP EGRESS BPF programs are:
1174 		 *   0: drop packet
1175 		 *   1: keep packet
1176 		 *   2: drop packet and cn
1177 		 *   3: keep packet and cn
1178 		 *
1179 		 * The returned value is then converted to one of the NET_XMIT
1180 		 * values or an error code that is then interpreted as drop packet
1181 		 * (and no cn):
1182 		 *   0: NET_XMIT_SUCCESS  skb should be transmitted
1183 		 *   1: NET_XMIT_DROP     skb should be dropped and cn
1184 		 *   2: NET_XMIT_CN       skb should be transmitted and cn
1185 		 *   3: -err              skb should be dropped
1186 		 */
1187 
1188 		cn = flags & BPF_RET_SET_CN;
1189 		if (ret && !IS_ERR_VALUE((long)ret))
1190 			ret = -EFAULT;
1191 		if (!ret)
1192 			ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
1193 		else
1194 			ret = (cn ? NET_XMIT_DROP : ret);
1195 	} else {
1196 		ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
1197 					    skb, __bpf_prog_run_save_cb, 0,
1198 					    NULL);
1199 		if (ret && !IS_ERR_VALUE((long)ret))
1200 			ret = -EFAULT;
1201 	}
1202 	bpf_restore_data_end(skb, saved_data_end);
1203 	__skb_pull(skb, offset);
1204 	skb->sk = save_sk;
1205 
1206 	return ret;
1207 }
1208 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
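
/* Hedged BPF-side sketch, not part of the original file: an egress program
 * whose return value is interpreted by the conversion above, using the usual
 * SEC() convention from libbpf's bpf_helpers.h:
 *
 *	SEC("cgroup_skb/egress")
 *	int allow_all(struct __sk_buff *skb)
 *	{
 *		return 1;	// keep packet; 0 == drop, 2/3 additionally set cn
 *	}
 */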
1209 
1210 /**
1211  * __cgroup_bpf_run_filter_sk() - Run a program on a sock
1212  * @sk: sock structure to manipulate
1213  * @atype: The type of program to be executed
1214  *
1215  * The socket passed is expected to be of type INET or INET6.
1216  *
1217  * The program type passed in via @atype must be suitable for sock
1218  * filtering. No further check is performed to assert that.
1219  *
1220  * This function will return %-EPERM if an attached program was found
1221  * and it returned != 1 during execution. In all other cases, 0 is returned.
1222  */
1223 int __cgroup_bpf_run_filter_sk(struct sock *sk,
1224 			       enum cgroup_bpf_attach_type atype)
1225 {
1226 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1227 
1228 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0,
1229 				     NULL);
1230 }
1231 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
1232 
1233 /**
1234  * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and
1235  *                                       provided by user sockaddr
1236  * @sk: sock struct that will use sockaddr
1237  * @uaddr: sockaddr struct provided by user
1238  * @atype: The type of program to be executed
1239  * @t_ctx: Pointer to attach type specific context
1240  * @flags: Pointer to u32 which contains higher bits of BPF program
1241  *         return value (OR'ed together).
1242  *
1243  * The socket is expected to be of type INET or INET6.
1244  *
1245  * This function will return %-EPERM if an attached program is found and
1246  * its returned value was != 1 during execution. In all other cases, 0 is returned.
1247  */
1248 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
1249 				      struct sockaddr *uaddr,
1250 				      enum cgroup_bpf_attach_type atype,
1251 				      void *t_ctx,
1252 				      u32 *flags)
1253 {
1254 	struct bpf_sock_addr_kern ctx = {
1255 		.sk = sk,
1256 		.uaddr = uaddr,
1257 		.t_ctx = t_ctx,
1258 	};
1259 	struct sockaddr_storage unspec;
1260 	struct cgroup *cgrp;
1261 
1262 	/* Check socket family since not all sockets represent a network
1263 	 * endpoint (e.g. AF_UNIX).
1264 	 */
1265 	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
1266 		return 0;
1267 
1268 	if (!ctx.uaddr) {
1269 		memset(&unspec, 0, sizeof(unspec));
1270 		ctx.uaddr = (struct sockaddr *)&unspec;
1271 	}
1272 
1273 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1274 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
1275 				     0, flags);
1276 }
1277 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
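
/* Hedged BPF-side sketch, not part of the original file: a connect4 program
 * inspecting the user-supplied sockaddr through struct bpf_sock_addr
 * (bpf_htons() comes from libbpf's bpf_endian.h); returning 0 makes this
 * function report -EPERM to the caller:
 *
 *	SEC("cgroup/connect4")
 *	int block_smtp(struct bpf_sock_addr *ctx)
 *	{
 *		if (ctx->user_port == bpf_htons(25))
 *			return 0;	// deny
 *		return 1;		// allow
 *	}
 */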
1278 
1279 /**
1280  * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
1281  * @sk: socket to get cgroup from
1282  * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
1283  * sk with connection information (IP addresses, etc.) May not contain
1284  * cgroup info if it is a req sock.
1285  * @atype: The type of program to be executed
1286  *
1287  * The socket passed is expected to be of type INET or INET6.
1288  *
1289  * The program type passed in via @atype must be suitable for sock_ops
1290  * filtering. No further check is performed to assert that.
1291  *
1292  * This function will return %-EPERM if an attached program was found
1293  * and it returned != 1 during execution. In all other cases, 0 is returned.
1294  */
1295 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
1296 				     struct bpf_sock_ops_kern *sock_ops,
1297 				     enum cgroup_bpf_attach_type atype)
1298 {
1299 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1300 
1301 	return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
1302 				     0, NULL);
1303 }
1304 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
1305 
1306 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
1307 				      short access, enum cgroup_bpf_attach_type atype)
1308 {
1309 	struct cgroup *cgrp;
1310 	struct bpf_cgroup_dev_ctx ctx = {
1311 		.access_type = (access << 16) | dev_type,
1312 		.major = major,
1313 		.minor = minor,
1314 	};
1315 	int ret;
1316 
1317 	rcu_read_lock();
1318 	cgrp = task_dfl_cgroup(current);
1319 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1320 				    NULL);
1321 	rcu_read_unlock();
1322 
1323 	return ret;
1324 }
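
/* Hedged BPF-side sketch, not part of the original file: a device program
 * keyed on the ctx fields filled in above (the low 16 bits of access_type
 * carry the device type, the upper bits the access mask):
 *
 *	SEC("cgroup/dev")
 *	int deny_dev_null(struct bpf_cgroup_dev_ctx *ctx)
 *	{
 *		if ((ctx->access_type & 0xffff) == BPF_DEVCG_DEV_CHAR &&
 *		    ctx->major == 1 && ctx->minor == 3)
 *			return 0;	// deny access to /dev/null
 *		return 1;		// allow everything else
 *	}
 */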
1325 
1326 BPF_CALL_0(bpf_get_retval)
1327 {
1328 	struct bpf_cg_run_ctx *ctx =
1329 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1330 
1331 	return ctx->retval;
1332 }
1333 
1334 static const struct bpf_func_proto bpf_get_retval_proto = {
1335 	.func		= bpf_get_retval,
1336 	.gpl_only	= false,
1337 	.ret_type	= RET_INTEGER,
1338 };
1339 
1340 BPF_CALL_1(bpf_set_retval, int, retval)
1341 {
1342 	struct bpf_cg_run_ctx *ctx =
1343 		container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
1344 
1345 	ctx->retval = retval;
1346 	return 0;
1347 }
1348 
1349 static const struct bpf_func_proto bpf_set_retval_proto = {
1350 	.func		= bpf_set_retval,
1351 	.gpl_only	= false,
1352 	.ret_type	= RET_INTEGER,
1353 	.arg1_type	= ARG_ANYTHING,
1354 };
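
/* Hedged BPF-side sketch, not part of the original file: a cgroup program
 * combining a deny verdict with a custom errno through the helper above;
 * because run_ctx.retval already holds an error, bpf_prog_run_array_cg()
 * does not overwrite it with -EPERM (the errno constant is assumed to come
 * from errno.h):
 *
 *	SEC("cgroup/dev")
 *	int deny_with_eacces(struct bpf_cgroup_dev_ctx *ctx)
 *	{
 *		bpf_set_retval(-EACCES);
 *		return 0;	// deny; the array run returns -EACCES
 *	}
 */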
1355 
1356 static const struct bpf_func_proto *
1357 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1358 {
1359 	switch (func_id) {
1360 	case BPF_FUNC_get_current_uid_gid:
1361 		return &bpf_get_current_uid_gid_proto;
1362 	case BPF_FUNC_get_local_storage:
1363 		return &bpf_get_local_storage_proto;
1364 	case BPF_FUNC_get_current_cgroup_id:
1365 		return &bpf_get_current_cgroup_id_proto;
1366 	case BPF_FUNC_perf_event_output:
1367 		return &bpf_event_output_data_proto;
1368 	case BPF_FUNC_get_retval:
1369 		return &bpf_get_retval_proto;
1370 	case BPF_FUNC_set_retval:
1371 		return &bpf_set_retval_proto;
1372 	default:
1373 		return bpf_base_func_proto(func_id);
1374 	}
1375 }
1376 
1377 static const struct bpf_func_proto *
1378 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1379 {
1380 	return cgroup_base_func_proto(func_id, prog);
1381 }
1382 
1383 static bool cgroup_dev_is_valid_access(int off, int size,
1384 				       enum bpf_access_type type,
1385 				       const struct bpf_prog *prog,
1386 				       struct bpf_insn_access_aux *info)
1387 {
1388 	const int size_default = sizeof(__u32);
1389 
1390 	if (type == BPF_WRITE)
1391 		return false;
1392 
1393 	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
1394 		return false;
1395 	/* The verifier guarantees that size > 0. */
1396 	if (off % size != 0)
1397 		return false;
1398 
1399 	switch (off) {
1400 	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
1401 		bpf_ctx_record_field_size(info, size_default);
1402 		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
1403 			return false;
1404 		break;
1405 	default:
1406 		if (size != size_default)
1407 			return false;
1408 	}
1409 
1410 	return true;
1411 }
1412 
1413 const struct bpf_prog_ops cg_dev_prog_ops = {
1414 };
1415 
1416 const struct bpf_verifier_ops cg_dev_verifier_ops = {
1417 	.get_func_proto		= cgroup_dev_func_proto,
1418 	.is_valid_access	= cgroup_dev_is_valid_access,
1419 };
1420 
1421 /**
1422  * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
1423  *
1424  * @head: sysctl table header
1425  * @table: sysctl table
1426  * @write: sysctl is being read (= 0) or written (= 1)
1427  * @buf: pointer to buffer (in and out)
1428  * @pcount: value-result argument: value is size of buffer pointed to by @buf,
1429  *	result is the size of the new value if the program set one, the
1430  *	initial value otherwise
1431  * @ppos: value-result argument: value is position at which read from or write
1432  *	to sysctl is happening, result is new position if program overrode it,
1433  *	initial value otherwise
1434  * @atype: type of program to be executed
1435  *
1436  * Program is run when sysctl is being accessed, either read or written, and
1437  * can allow or deny such access.
1438  *
1439  * This function will return %-EPERM if an attached program is found and
1440  * returned value != 1 during execution. In all other cases 0 is returned.
1441  */
1442 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
1443 				   struct ctl_table *table, int write,
1444 				   char **buf, size_t *pcount, loff_t *ppos,
1445 				   enum cgroup_bpf_attach_type atype)
1446 {
1447 	struct bpf_sysctl_kern ctx = {
1448 		.head = head,
1449 		.table = table,
1450 		.write = write,
1451 		.ppos = ppos,
1452 		.cur_val = NULL,
1453 		.cur_len = PAGE_SIZE,
1454 		.new_val = NULL,
1455 		.new_len = 0,
1456 		.new_updated = 0,
1457 	};
1458 	struct cgroup *cgrp;
1459 	loff_t pos = 0;
1460 	int ret;
1461 
1462 	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
1463 	if (!ctx.cur_val ||
1464 	    table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
1465 		/* Let BPF program decide how to proceed. */
1466 		ctx.cur_len = 0;
1467 	}
1468 
1469 	if (write && *buf && *pcount) {
1470 		/* BPF program should be able to override new value with a
1471 		 * buffer bigger than provided by user.
1472 		 */
1473 		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
1474 		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
1475 		if (ctx.new_val) {
1476 			memcpy(ctx.new_val, *buf, ctx.new_len);
1477 		} else {
1478 			/* Let BPF program decide how to proceed. */
1479 			ctx.new_len = 0;
1480 		}
1481 	}
1482 
1483 	rcu_read_lock();
1484 	cgrp = task_dfl_cgroup(current);
1485 	ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0,
1486 				    NULL);
1487 	rcu_read_unlock();
1488 
1489 	kfree(ctx.cur_val);
1490 
1491 	if (ret == 1 && ctx.new_updated) {
1492 		kfree(*buf);
1493 		*buf = ctx.new_val;
1494 		*pcount = ctx.new_len;
1495 	} else {
1496 		kfree(ctx.new_val);
1497 	}
1498 
1499 	return ret;
1500 }
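
/* Hedged BPF-side sketch, not part of the original file: a sysctl program
 * using the name helper declared further down in this file; bpf_printk()
 * comes from libbpf's bpf_helpers.h:
 *
 *	SEC("cgroup/sysctl")
 *	int log_sysctl(struct bpf_sysctl *ctx)
 *	{
 *		char name[64];
 *
 *		bpf_sysctl_get_name(ctx, name, sizeof(name), 0);
 *		bpf_printk("sysctl %s write=%d", name, ctx->write);
 *		return 1;	// allow; returning 0 surfaces -EPERM to the caller
 *	}
 */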
1501 
1502 #ifdef CONFIG_NET
1503 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
1504 			     struct bpf_sockopt_buf *buf)
1505 {
1506 	if (unlikely(max_optlen < 0))
1507 		return -EINVAL;
1508 
1509 	if (unlikely(max_optlen > PAGE_SIZE)) {
1510 		/* We don't expose optvals that are greater than PAGE_SIZE
1511 		 * to the BPF program.
1512 		 */
1513 		max_optlen = PAGE_SIZE;
1514 	}
1515 
1516 	if (max_optlen <= sizeof(buf->data)) {
1517 		/* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
1518 		 * bytes avoid the cost of kzalloc.
1519 		 */
1520 		ctx->optval = buf->data;
1521 		ctx->optval_end = ctx->optval + max_optlen;
1522 		return max_optlen;
1523 	}
1524 
1525 	ctx->optval = kzalloc(max_optlen, GFP_USER);
1526 	if (!ctx->optval)
1527 		return -ENOMEM;
1528 
1529 	ctx->optval_end = ctx->optval + max_optlen;
1530 
1531 	return max_optlen;
1532 }
1533 
1534 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
1535 			     struct bpf_sockopt_buf *buf)
1536 {
1537 	if (ctx->optval == buf->data)
1538 		return;
1539 	kfree(ctx->optval);
1540 }
1541 
1542 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
1543 				  struct bpf_sockopt_buf *buf)
1544 {
1545 	return ctx->optval != buf->data;
1546 }
1547 
1548 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
1549 				       int *optname, char __user *optval,
1550 				       int *optlen, char **kernel_optval)
1551 {
1552 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1553 	struct bpf_sockopt_buf buf = {};
1554 	struct bpf_sockopt_kern ctx = {
1555 		.sk = sk,
1556 		.level = *level,
1557 		.optname = *optname,
1558 	};
1559 	int ret, max_optlen;
1560 
1561 	/* Allocate a bit more than the initial user buffer for
1562 	 * BPF program. The canonical use case is overriding
1563 	 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
1564 	 */
1565 	max_optlen = max_t(int, 16, *optlen);
1566 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1567 	if (max_optlen < 0)
1568 		return max_optlen;
1569 
1570 	ctx.optlen = *optlen;
1571 
1572 	if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
1573 		ret = -EFAULT;
1574 		goto out;
1575 	}
1576 
1577 	lock_sock(sk);
1578 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
1579 				    &ctx, bpf_prog_run, 0, NULL);
1580 	release_sock(sk);
1581 
1582 	if (ret)
1583 		goto out;
1584 
1585 	if (ctx.optlen == -1) {
1586 		/* optlen set to -1, bypass kernel */
1587 		ret = 1;
1588 	} else if (ctx.optlen > max_optlen || ctx.optlen < -1) {
1589 		/* optlen is out of bounds */
1590 		ret = -EFAULT;
1591 	} else {
1592 		/* optlen within bounds, run kernel handler */
1593 		ret = 0;
1594 
1595 		/* export any potential modifications */
1596 		*level = ctx.level;
1597 		*optname = ctx.optname;
1598 
1599 		/* optlen == 0 from BPF indicates that we should
1600 		 * use original userspace data.
1601 		 */
1602 		if (ctx.optlen != 0) {
1603 			*optlen = ctx.optlen;
1604 			/* We've used bpf_sockopt_kern->buf as an intermediary
1605 			 * storage, but the BPF program indicates that we need
1606 			 * to pass this data to the kernel setsockopt handler.
1607 			 * No way to export on-stack buf, have to allocate a
1608 			 * new buffer.
1609 			 */
1610 			if (!sockopt_buf_allocated(&ctx, &buf)) {
1611 				void *p = kmalloc(ctx.optlen, GFP_USER);
1612 
1613 				if (!p) {
1614 					ret = -ENOMEM;
1615 					goto out;
1616 				}
1617 				memcpy(p, ctx.optval, ctx.optlen);
1618 				*kernel_optval = p;
1619 			} else {
1620 				*kernel_optval = ctx.optval;
1621 			}
1622 			/* export and don't free sockopt buf */
1623 			return 0;
1624 		}
1625 	}
1626 
1627 out:
1628 	sockopt_free_buf(&ctx, &buf);
1629 	return ret;
1630 }
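
/* Hedged BPF-side sketch, not part of the original file: a setsockopt
 * program relying on the optlen conventions handled right above
 * (0 == keep the original user data, -1 == bypass the kernel handler);
 * SOL_IP/IP_TOS are assumed from the usual uapi headers:
 *
 *	SEC("cgroup/setsockopt")
 *	int sockopt_passthrough(struct bpf_sockopt *ctx)
 *	{
 *		if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
 *			ctx->optlen = -1;	// consume the option here
 *		else
 *			ctx->optlen = 0;	// let the kernel handler see user data
 *		return 1;			// returning 0 rejects with -EPERM
 *	}
 */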
1631 
1632 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
1633 				       int optname, char __user *optval,
1634 				       int __user *optlen, int max_optlen,
1635 				       int retval)
1636 {
1637 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1638 	struct bpf_sockopt_buf buf = {};
1639 	struct bpf_sockopt_kern ctx = {
1640 		.sk = sk,
1641 		.level = level,
1642 		.optname = optname,
1643 		.current_task = current,
1644 	};
1645 	int ret;
1646 
1647 	ctx.optlen = max_optlen;
1648 	max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
1649 	if (max_optlen < 0)
1650 		return max_optlen;
1651 
1652 	if (!retval) {
1653 		/* If kernel getsockopt finished successfully,
1654 		 * copy whatever was returned to the user back
1655 		 * into our temporary buffer. Set optlen to the
1656 		 * one that kernel returned as well to let
1657 		 * BPF programs inspect the value.
1658 		 */
1659 
1660 		if (get_user(ctx.optlen, optlen)) {
1661 			ret = -EFAULT;
1662 			goto out;
1663 		}
1664 
1665 		if (ctx.optlen < 0) {
1666 			ret = -EFAULT;
1667 			goto out;
1668 		}
1669 
1670 		if (copy_from_user(ctx.optval, optval,
1671 				   min(ctx.optlen, max_optlen)) != 0) {
1672 			ret = -EFAULT;
1673 			goto out;
1674 		}
1675 	}
1676 
1677 	lock_sock(sk);
1678 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1679 				    &ctx, bpf_prog_run, retval, NULL);
1680 	release_sock(sk);
1681 
1682 	if (ret < 0)
1683 		goto out;
1684 
1685 	if (ctx.optlen > max_optlen || ctx.optlen < 0) {
1686 		ret = -EFAULT;
1687 		goto out;
1688 	}
1689 
1690 	if (ctx.optlen != 0) {
1691 		if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
1692 		    put_user(ctx.optlen, optlen)) {
1693 			ret = -EFAULT;
1694 			goto out;
1695 		}
1696 	}
1697 
1698 out:
1699 	sockopt_free_buf(&ctx, &buf);
1700 	return ret;
1701 }
1702 
1703 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
1704 					    int optname, void *optval,
1705 					    int *optlen, int retval)
1706 {
1707 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
1708 	struct bpf_sockopt_kern ctx = {
1709 		.sk = sk,
1710 		.level = level,
1711 		.optname = optname,
1712 		.optlen = *optlen,
1713 		.optval = optval,
1714 		.optval_end = optval + *optlen,
1715 		.current_task = current,
1716 	};
1717 	int ret;
1718 
1719 	/* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
1720 	 * user data back into BPF buffer when reval != 0. This is
1721 	 * user data back into BPF buffer when retval != 0. This is
1722 	 * kernel won't populate the data in case of an error.
1723 	 * Here we always pass the data and memset() should
1724 	 * be called if that data shouldn't be "exported".
1725 	 */
1726 
1727 	ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
1728 				    &ctx, bpf_prog_run, retval, NULL);
1729 	if (ret < 0)
1730 		return ret;
1731 
1732 	if (ctx.optlen > *optlen)
1733 		return -EFAULT;
1734 
1735 	/* BPF programs can shrink the buffer, export the modifications.
1736 	 */
1737 	if (ctx.optlen != 0)
1738 		*optlen = ctx.optlen;
1739 
1740 	return ret;
1741 }
1742 #endif
1743 
1744 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
1745 			      size_t *lenp)
1746 {
1747 	ssize_t tmp_ret = 0, ret;
1748 
1749 	if (dir->header.parent) {
1750 		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
1751 		if (tmp_ret < 0)
1752 			return tmp_ret;
1753 	}
1754 
1755 	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
1756 	if (ret < 0)
1757 		return ret;
1758 	*bufp += ret;
1759 	*lenp -= ret;
1760 	ret += tmp_ret;
1761 
1762 	/* Avoid leading slash. */
1763 	if (!ret)
1764 		return ret;
1765 
1766 	tmp_ret = strscpy(*bufp, "/", *lenp);
1767 	if (tmp_ret < 0)
1768 		return tmp_ret;
1769 	*bufp += tmp_ret;
1770 	*lenp -= tmp_ret;
1771 
1772 	return ret + tmp_ret;
1773 }
1774 
1775 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
1776 	   size_t, buf_len, u64, flags)
1777 {
1778 	ssize_t tmp_ret = 0, ret;
1779 
1780 	if (!buf)
1781 		return -EINVAL;
1782 
1783 	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
1784 		if (!ctx->head)
1785 			return -EINVAL;
1786 		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
1787 		if (tmp_ret < 0)
1788 			return tmp_ret;
1789 	}
1790 
1791 	ret = strscpy(buf, ctx->table->procname, buf_len);
1792 
1793 	return ret < 0 ? ret : tmp_ret + ret;
1794 }
1795 
1796 static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
1797 	.func		= bpf_sysctl_get_name,
1798 	.gpl_only	= false,
1799 	.ret_type	= RET_INTEGER,
1800 	.arg1_type	= ARG_PTR_TO_CTX,
1801 	.arg2_type	= ARG_PTR_TO_MEM,
1802 	.arg3_type	= ARG_CONST_SIZE,
1803 	.arg4_type	= ARG_ANYTHING,
1804 };
1805 
1806 static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
1807 			     size_t src_len)
1808 {
1809 	if (!dst)
1810 		return -EINVAL;
1811 
1812 	if (!dst_len)
1813 		return -E2BIG;
1814 
1815 	if (!src || !src_len) {
1816 		memset(dst, 0, dst_len);
1817 		return -EINVAL;
1818 	}
1819 
1820 	memcpy(dst, src, min(dst_len, src_len));
1821 
1822 	if (dst_len > src_len) {
1823 		memset(dst + src_len, '\0', dst_len - src_len);
1824 		return src_len;
1825 	}
1826 
1827 	dst[dst_len - 1] = '\0';
1828 
1829 	return -E2BIG;
1830 }
1831 
1832 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
1833 	   char *, buf, size_t, buf_len)
1834 {
1835 	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
1836 }
1837 
1838 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
1839 	.func		= bpf_sysctl_get_current_value,
1840 	.gpl_only	= false,
1841 	.ret_type	= RET_INTEGER,
1842 	.arg1_type	= ARG_PTR_TO_CTX,
1843 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1844 	.arg3_type	= ARG_CONST_SIZE,
1845 };
1846 
1847 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
1848 	   size_t, buf_len)
1849 {
1850 	if (!ctx->write) {
1851 		if (buf && buf_len)
1852 			memset(buf, '\0', buf_len);
1853 		return -EINVAL;
1854 	}
1855 	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
1856 }
1857 
1858 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
1859 	.func		= bpf_sysctl_get_new_value,
1860 	.gpl_only	= false,
1861 	.ret_type	= RET_INTEGER,
1862 	.arg1_type	= ARG_PTR_TO_CTX,
1863 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
1864 	.arg3_type	= ARG_CONST_SIZE,
1865 };
1866 
1867 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
1868 	   const char *, buf, size_t, buf_len)
1869 {
1870 	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
1871 		return -EINVAL;
1872 
1873 	if (buf_len > PAGE_SIZE - 1)
1874 		return -E2BIG;
1875 
1876 	memcpy(ctx->new_val, buf, buf_len);
1877 	ctx->new_len = buf_len;
1878 	ctx->new_updated = 1;
1879 
1880 	return 0;
1881 }
1882 
1883 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
1884 	.func		= bpf_sysctl_set_new_value,
1885 	.gpl_only	= false,
1886 	.ret_type	= RET_INTEGER,
1887 	.arg1_type	= ARG_PTR_TO_CTX,
1888 	.arg2_type	= ARG_PTR_TO_MEM | MEM_RDONLY,
1889 	.arg3_type	= ARG_CONST_SIZE,
1890 };
1891 
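/* Helper functions available to BPF_PROG_TYPE_CGROUP_SYSCTL programs. */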
1892 static const struct bpf_func_proto *
1893 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1894 {
1895 	switch (func_id) {
1896 	case BPF_FUNC_strtol:
1897 		return &bpf_strtol_proto;
1898 	case BPF_FUNC_strtoul:
1899 		return &bpf_strtoul_proto;
1900 	case BPF_FUNC_sysctl_get_name:
1901 		return &bpf_sysctl_get_name_proto;
1902 	case BPF_FUNC_sysctl_get_current_value:
1903 		return &bpf_sysctl_get_current_value_proto;
1904 	case BPF_FUNC_sysctl_get_new_value:
1905 		return &bpf_sysctl_get_new_value_proto;
1906 	case BPF_FUNC_sysctl_set_new_value:
1907 		return &bpf_sysctl_set_new_value_proto;
1908 	case BPF_FUNC_ktime_get_coarse_ns:
1909 		return &bpf_ktime_get_coarse_ns_proto;
1910 	default:
1911 		return cgroup_base_func_proto(func_id, prog);
1912 	}
1913 }
1914 
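/*
 * Context access rules for struct bpf_sysctl: "write" is read-only (narrow
 * loads allowed), "file_pos" may be read with narrow loads but must be
 * written as a full 32-bit store.  Any other offset is rejected.
 */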
1915 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
1916 				   const struct bpf_prog *prog,
1917 				   struct bpf_insn_access_aux *info)
1918 {
1919 	const int size_default = sizeof(__u32);
1920 
1921 	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
1922 		return false;
1923 
1924 	switch (off) {
1925 	case bpf_ctx_range(struct bpf_sysctl, write):
1926 		if (type != BPF_READ)
1927 			return false;
1928 		bpf_ctx_record_field_size(info, size_default);
1929 		return bpf_ctx_narrow_access_ok(off, size, size_default);
1930 	case bpf_ctx_range(struct bpf_sysctl, file_pos):
1931 		if (type == BPF_READ) {
1932 			bpf_ctx_record_field_size(info, size_default);
1933 			return bpf_ctx_narrow_access_ok(off, size, size_default);
1934 		} else {
1935 			return size == size_default;
1936 		}
1937 	default:
1938 		return false;
1939 	}
1940 }
1941 
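/*
 * Rewrite accesses to struct bpf_sysctl fields into loads/stores on the
 * backing struct bpf_sysctl_kern.  file_pos is special-cased below because
 * it is reached through the ppos pointer rather than stored inline.
 */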
1942 static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
1943 				     const struct bpf_insn *si,
1944 				     struct bpf_insn *insn_buf,
1945 				     struct bpf_prog *prog, u32 *target_size)
1946 {
1947 	struct bpf_insn *insn = insn_buf;
1948 	u32 read_size;
1949 
1950 	switch (si->off) {
1951 	case offsetof(struct bpf_sysctl, write):
1952 		*insn++ = BPF_LDX_MEM(
1953 			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
1954 			bpf_target_off(struct bpf_sysctl_kern, write,
1955 				       sizeof_field(struct bpf_sysctl_kern,
1956 						    write),
1957 				       target_size));
1958 		break;
1959 	case offsetof(struct bpf_sysctl, file_pos):
1960 		/* ppos is a pointer, so it must be accessed via indirect
1961 		 * loads and stores.  Stores additionally need a scratch
1962 		 * register, since neither src_reg nor dst_reg may be
1963 		 * clobbered.
1964 		 */
1965 		if (type == BPF_WRITE) {
1966 			int treg = BPF_REG_9;
1967 
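			/* Pick a scratch register that is distinct from both
			 * src_reg and dst_reg; they can collide with at most
			 * two candidates, so stepping down from BPF_REG_9 at
			 * most twice always yields a free register.
			 */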
1968 			if (si->src_reg == treg || si->dst_reg == treg)
1969 				--treg;
1970 			if (si->src_reg == treg || si->dst_reg == treg)
1971 				--treg;
1972 			*insn++ = BPF_STX_MEM(
1973 				BPF_DW, si->dst_reg, treg,
1974 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1975 			*insn++ = BPF_LDX_MEM(
1976 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1977 				treg, si->dst_reg,
1978 				offsetof(struct bpf_sysctl_kern, ppos));
1979 			*insn++ = BPF_STX_MEM(
1980 				BPF_SIZEOF(u32), treg, si->src_reg,
1981 				bpf_ctx_narrow_access_offset(
1982 					0, sizeof(u32), sizeof(loff_t)));
1983 			*insn++ = BPF_LDX_MEM(
1984 				BPF_DW, treg, si->dst_reg,
1985 				offsetof(struct bpf_sysctl_kern, tmp_reg));
1986 		} else {
1987 			*insn++ = BPF_LDX_MEM(
1988 				BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
1989 				si->dst_reg, si->src_reg,
1990 				offsetof(struct bpf_sysctl_kern, ppos));
1991 			read_size = bpf_size_to_bytes(BPF_SIZE(si->code));
1992 			*insn++ = BPF_LDX_MEM(
1993 				BPF_SIZE(si->code), si->dst_reg, si->dst_reg,
1994 				bpf_ctx_narrow_access_offset(
1995 					0, read_size, sizeof(loff_t)));
1996 		}
1997 		*target_size = sizeof(u32);
1998 		break;
1999 	}
2000 
2001 	return insn - insn_buf;
2002 }
2003 
2004 const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
2005 	.get_func_proto		= sysctl_func_proto,
2006 	.is_valid_access	= sysctl_is_valid_access,
2007 	.convert_ctx_access	= sysctl_convert_ctx_access,
2008 };
2009 
2010 const struct bpf_prog_ops cg_sysctl_prog_ops = {
2011 };
2012 
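/*
 * bpf_get_netns_cookie() for sockopt programs: returns the cookie of the
 * socket's network namespace, or the init_net cookie when called with a
 * NULL context.
 */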
2013 #ifdef CONFIG_NET
2014 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
2015 {
2016 	const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
2017 
2018 	return net->net_cookie;
2019 }
2020 
2021 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
2022 	.func		= bpf_get_netns_cookie_sockopt,
2023 	.gpl_only	= false,
2024 	.ret_type	= RET_INTEGER,
2025 	.arg1_type	= ARG_PTR_TO_CTX_OR_NULL,
2026 };
2027 #endif
2028 
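/*
 * Helper functions available to BPF_PROG_TYPE_CGROUP_SOCKOPT programs.
 * Note that bpf_sk_setsockopt() and bpf_sk_getsockopt() are only offered to
 * programs attached to the setsockopt hook.
 */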
2029 static const struct bpf_func_proto *
2030 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
2031 {
2032 	switch (func_id) {
2033 #ifdef CONFIG_NET
2034 	case BPF_FUNC_get_netns_cookie:
2035 		return &bpf_get_netns_cookie_sockopt_proto;
2036 	case BPF_FUNC_sk_storage_get:
2037 		return &bpf_sk_storage_get_proto;
2038 	case BPF_FUNC_sk_storage_delete:
2039 		return &bpf_sk_storage_delete_proto;
2040 	case BPF_FUNC_setsockopt:
2041 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2042 			return &bpf_sk_setsockopt_proto;
2043 		return NULL;
2044 	case BPF_FUNC_getsockopt:
2045 		if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
2046 			return &bpf_sk_getsockopt_proto;
2047 		return NULL;
2048 #endif
2049 #ifdef CONFIG_INET
2050 	case BPF_FUNC_tcp_sock:
2051 		return &bpf_tcp_sock_proto;
2052 #endif
2053 	default:
2054 		return cgroup_base_func_proto(func_id, prog);
2055 	}
2056 }
2057 
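/*
 * Context access rules for struct bpf_sockopt: "retval" is accessible only
 * from the getsockopt hook, "level" and "optname" are writable only from
 * the setsockopt hook, and "optlen" is read/write from either.  "sk",
 * "optval" and "optval_end" are read-only pointers; optval/optval_end use
 * packet-pointer semantics, so a program must bounds check optval against
 * optval_end before dereferencing it.  A hypothetical sketch of such a
 * check in a cgroup sockopt program (not part of this file):
 *
 *	__u8 *p = ctx->optval;
 *	__u8 *end = ctx->optval_end;
 *
 *	if (p + 1 > end)
 *		return 1;
 *	... inspect or rewrite *p here ...
 */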
2058 static bool cg_sockopt_is_valid_access(int off, int size,
2059 				       enum bpf_access_type type,
2060 				       const struct bpf_prog *prog,
2061 				       struct bpf_insn_access_aux *info)
2062 {
2063 	const int size_default = sizeof(__u32);
2064 
2065 	if (off < 0 || off >= sizeof(struct bpf_sockopt))
2066 		return false;
2067 
2068 	if (off % size != 0)
2069 		return false;
2070 
2071 	if (type == BPF_WRITE) {
2072 		switch (off) {
2073 		case offsetof(struct bpf_sockopt, retval):
2074 			if (size != size_default)
2075 				return false;
2076 			return prog->expected_attach_type ==
2077 				BPF_CGROUP_GETSOCKOPT;
2078 		case offsetof(struct bpf_sockopt, optname):
2079 			fallthrough;
2080 		case offsetof(struct bpf_sockopt, level):
2081 			if (size != size_default)
2082 				return false;
2083 			return prog->expected_attach_type ==
2084 				BPF_CGROUP_SETSOCKOPT;
2085 		case offsetof(struct bpf_sockopt, optlen):
2086 			return size == size_default;
2087 		default:
2088 			return false;
2089 		}
2090 	}
2091 
2092 	switch (off) {
2093 	case offsetof(struct bpf_sockopt, sk):
2094 		if (size != sizeof(__u64))
2095 			return false;
2096 		info->reg_type = PTR_TO_SOCKET;
2097 		break;
2098 	case offsetof(struct bpf_sockopt, optval):
2099 		if (size != sizeof(__u64))
2100 			return false;
2101 		info->reg_type = PTR_TO_PACKET;
2102 		break;
2103 	case offsetof(struct bpf_sockopt, optval_end):
2104 		if (size != sizeof(__u64))
2105 			return false;
2106 		info->reg_type = PTR_TO_PACKET_END;
2107 		break;
2108 	case offsetof(struct bpf_sockopt, retval):
2109 		if (size != size_default)
2110 			return false;
2111 		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
2112 	default:
2113 		if (size != size_default)
2114 			return false;
2115 		break;
2116 	}
2117 	return true;
2118 }
2119 
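/*
 * CG_SOCKOPT_ACCESS_FIELD() emits a single load or store that redirects an
 * access to a struct bpf_sockopt field onto the same-named field of the
 * in-kernel struct bpf_sockopt_kern.
 */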
2120 #define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
2121 	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
2122 	  si->dst_reg, si->src_reg,					\
2123 	  offsetof(struct bpf_sockopt_kern, F))
2124 
2125 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
2126 					 const struct bpf_insn *si,
2127 					 struct bpf_insn *insn_buf,
2128 					 struct bpf_prog *prog,
2129 					 u32 *target_size)
2130 {
2131 	struct bpf_insn *insn = insn_buf;
2132 
2133 	switch (si->off) {
2134 	case offsetof(struct bpf_sockopt, sk):
2135 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
2136 		break;
2137 	case offsetof(struct bpf_sockopt, level):
2138 		if (type == BPF_WRITE)
2139 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
2140 		else
2141 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
2142 		break;
2143 	case offsetof(struct bpf_sockopt, optname):
2144 		if (type == BPF_WRITE)
2145 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
2146 		else
2147 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
2148 		break;
2149 	case offsetof(struct bpf_sockopt, optlen):
2150 		if (type == BPF_WRITE)
2151 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
2152 		else
2153 			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
2154 		break;
2155 	case offsetof(struct bpf_sockopt, retval):
2156 		BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
2157 
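		/* retval is not stored in bpf_sockopt_kern; it lives in the
		 * per-task bpf_cg_run_ctx reached via current->bpf_ctx.  The
		 * BUILD_BUG_ON above guarantees that run_ctx is the first
		 * member, so the bpf_run_ctx pointer can be reinterpreted as
		 * a bpf_cg_run_ctx when offsetting to retval.
		 */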
2158 		if (type == BPF_WRITE) {
2159 			int treg = BPF_REG_9;
2160 
2161 			if (si->src_reg == treg || si->dst_reg == treg)
2162 				--treg;
2163 			if (si->src_reg == treg || si->dst_reg == treg)
2164 				--treg;
2165 			*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
2166 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2167 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2168 					      treg, si->dst_reg,
2169 					      offsetof(struct bpf_sockopt_kern, current_task));
2170 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2171 					      treg, treg,
2172 					      offsetof(struct task_struct, bpf_ctx));
2173 			*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2174 					      treg, si->src_reg,
2175 					      offsetof(struct bpf_cg_run_ctx, retval));
2176 			*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
2177 					      offsetof(struct bpf_sockopt_kern, tmp_reg));
2178 		} else {
2179 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
2180 					      si->dst_reg, si->src_reg,
2181 					      offsetof(struct bpf_sockopt_kern, current_task));
2182 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
2183 					      si->dst_reg, si->dst_reg,
2184 					      offsetof(struct task_struct, bpf_ctx));
2185 			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
2186 					      si->dst_reg, si->dst_reg,
2187 					      offsetof(struct bpf_cg_run_ctx, retval));
2188 		}
2189 		break;
2190 	case offsetof(struct bpf_sockopt, optval):
2191 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
2192 		break;
2193 	case offsetof(struct bpf_sockopt, optval_end):
2194 		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
2195 		break;
2196 	}
2197 
2198 	return insn - insn_buf;
2199 }
2200 
2201 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
2202 				   bool direct_write,
2203 				   const struct bpf_prog *prog)
2204 {
2205 	/* Nothing to do for the sockopt argument. The data is kzalloc()'ed.
2206 	 */
2207 	return 0;
2208 }
2209 
2210 const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
2211 	.get_func_proto		= cg_sockopt_func_proto,
2212 	.is_valid_access	= cg_sockopt_is_valid_access,
2213 	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
2214 	.gen_prologue		= cg_sockopt_get_prologue,
2215 };
2216 
2217 const struct bpf_prog_ops cg_sockopt_prog_ops = {
2218 };
2219