xref: /openbmc/linux/net/core/filter.c (revision e5c86679)
1 /*
2  * Linux Socket Filter - Kernel level socket filtering
3  *
4  * Based on the design of the Berkeley Packet Filter. The new
5  * internal format has been designed by PLUMgrid:
6  *
7  *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
8  *
9  * Authors:
10  *
11  *	Jay Schulist <jschlst@samba.org>
12  *	Alexei Starovoitov <ast@plumgrid.com>
13  *	Daniel Borkmann <dborkman@redhat.com>
14  *
15  * This program is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU General Public License
17  * as published by the Free Software Foundation; either version
18  * 2 of the License, or (at your option) any later version.
19  *
20  * Andi Kleen - Fix a few bad bugs and races.
21  * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22  */
23 
24 #include <linux/module.h>
25 #include <linux/types.h>
26 #include <linux/mm.h>
27 #include <linux/fcntl.h>
28 #include <linux/socket.h>
29 #include <linux/in.h>
30 #include <linux/inet.h>
31 #include <linux/netdevice.h>
32 #include <linux/if_packet.h>
33 #include <linux/if_arp.h>
34 #include <linux/gfp.h>
35 #include <net/ip.h>
36 #include <net/protocol.h>
37 #include <net/netlink.h>
38 #include <linux/skbuff.h>
39 #include <net/sock.h>
40 #include <net/flow_dissector.h>
41 #include <linux/errno.h>
42 #include <linux/timer.h>
43 #include <linux/uaccess.h>
44 #include <asm/unaligned.h>
45 #include <linux/filter.h>
46 #include <linux/ratelimit.h>
47 #include <linux/seccomp.h>
48 #include <linux/if_vlan.h>
49 #include <linux/bpf.h>
50 #include <net/sch_generic.h>
51 #include <net/cls_cgroup.h>
52 #include <net/dst_metadata.h>
53 #include <net/dst.h>
54 #include <net/sock_reuseport.h>
55 
56 /**
57  *	sk_filter_trim_cap - run a packet through a socket filter
58  *	@sk: sock associated with &sk_buff
59  *	@skb: buffer to filter
60  *	@cap: limit on how short the eBPF program may trim the packet
61  *
62  * Run the eBPF program and then cut skb->data to the correct size returned
63  * by the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
64  * than pkt_len we keep the whole skb->data. This is the socket level
65  * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
66  * be accepted or -EPERM if the packet should be tossed.
67  *
68  */
69 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
70 {
71 	int err;
72 	struct sk_filter *filter;
73 
74 	/*
75 	 * If the skb was allocated from pfmemalloc reserves, only
76 	 * allow SOCK_MEMALLOC sockets to use it as this socket is
77 	 * helping free memory
78 	 */
79 	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
80 		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
81 		return -ENOMEM;
82 	}
83 	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
84 	if (err)
85 		return err;
86 
87 	err = security_sock_rcv_skb(sk, skb);
88 	if (err)
89 		return err;
90 
91 	rcu_read_lock();
92 	filter = rcu_dereference(sk->sk_filter);
93 	if (filter) {
94 		unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
95 		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
96 	}
97 	rcu_read_unlock();
98 
99 	return err;
100 }
101 EXPORT_SYMBOL(sk_filter_trim_cap);
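
/* Illustrative sketch (not part of the original file): how a caller uses the
 * cap semantics documented above. In the real tree, sk_filter() is an inline
 * wrapper in <linux/filter.h>; the helper below only mirrors that shape with
 * cap == 1, i.e. the program may trim the packet but never below one byte.
 */
static inline int example_sk_filter(struct sock *sk, struct sk_buff *skb)
{
	return sk_filter_trim_cap(sk, skb, 1);
}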
102 
103 BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
104 {
105 	return skb_get_poff(skb);
106 }
107 
108 BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
109 {
110 	struct nlattr *nla;
111 
112 	if (skb_is_nonlinear(skb))
113 		return 0;
114 
115 	if (skb->len < sizeof(struct nlattr))
116 		return 0;
117 
118 	if (a > skb->len - sizeof(struct nlattr))
119 		return 0;
120 
121 	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
122 	if (nla)
123 		return (void *) nla - (void *) skb->data;
124 
125 	return 0;
126 }
127 
128 BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
129 {
130 	struct nlattr *nla;
131 
132 	if (skb_is_nonlinear(skb))
133 		return 0;
134 
135 	if (skb->len < sizeof(struct nlattr))
136 		return 0;
137 
138 	if (a > skb->len - sizeof(struct nlattr))
139 		return 0;
140 
141 	nla = (struct nlattr *) &skb->data[a];
142 	if (nla->nla_len > skb->len - a)
143 		return 0;
144 
145 	nla = nla_find_nested(nla, x);
146 	if (nla)
147 		return (void *) nla - (void *) skb->data;
148 
149 	return 0;
150 }
151 
152 BPF_CALL_0(__get_raw_cpu_id)
153 {
154 	return raw_smp_processor_id();
155 }
156 
157 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
158 	.func		= __get_raw_cpu_id,
159 	.gpl_only	= false,
160 	.ret_type	= RET_INTEGER,
161 };
162 
163 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
164 			      struct bpf_insn *insn_buf)
165 {
166 	struct bpf_insn *insn = insn_buf;
167 
168 	switch (skb_field) {
169 	case SKF_AD_MARK:
170 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
171 
172 		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
173 				      offsetof(struct sk_buff, mark));
174 		break;
175 
176 	case SKF_AD_PKTTYPE:
177 		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
178 		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
179 #ifdef __BIG_ENDIAN_BITFIELD
180 		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
181 #endif
182 		break;
183 
184 	case SKF_AD_QUEUE:
185 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
186 
187 		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
188 				      offsetof(struct sk_buff, queue_mapping));
189 		break;
190 
191 	case SKF_AD_VLAN_TAG:
192 	case SKF_AD_VLAN_TAG_PRESENT:
193 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
194 		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
195 
196 		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
197 		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
198 				      offsetof(struct sk_buff, vlan_tci));
199 		if (skb_field == SKF_AD_VLAN_TAG) {
200 			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
201 						~VLAN_TAG_PRESENT);
202 		} else {
203 			/* dst_reg >>= 12 */
204 			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
205 			/* dst_reg &= 1 */
206 			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
207 		}
208 		break;
209 	}
210 
211 	return insn - insn_buf;
212 }
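
/* Illustrative sketch: what a caller of convert_skb_access() sees for
 * SKF_AD_PKTTYPE. The function name and the assumption that the buffer
 * holds at least four entries are for this example only; the return value
 * tells the caller how many instructions were emitted and how far to
 * advance.
 */
static u32 example_convert_pkttype(struct bpf_insn *buf)
{
	/* Emits a byte load of the pkt_type bitfield, an AND with
	 * PKT_TYPE_MAX and, on big-endian bitfield layouts, an extra
	 * right shift.
	 */
	return convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, buf);
}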
213 
214 static bool convert_bpf_extensions(struct sock_filter *fp,
215 				   struct bpf_insn **insnp)
216 {
217 	struct bpf_insn *insn = *insnp;
218 	u32 cnt;
219 
220 	switch (fp->k) {
221 	case SKF_AD_OFF + SKF_AD_PROTOCOL:
222 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
223 
224 		/* A = *(u16 *) (CTX + offsetof(protocol)) */
225 		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
226 				      offsetof(struct sk_buff, protocol));
227 		/* A = ntohs(A) [emitting a nop or swap16] */
228 		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
229 		break;
230 
231 	case SKF_AD_OFF + SKF_AD_PKTTYPE:
232 		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
233 		insn += cnt - 1;
234 		break;
235 
236 	case SKF_AD_OFF + SKF_AD_IFINDEX:
237 	case SKF_AD_OFF + SKF_AD_HATYPE:
238 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
239 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
240 
241 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
242 				      BPF_REG_TMP, BPF_REG_CTX,
243 				      offsetof(struct sk_buff, dev));
244 		/* if (tmp != 0) goto pc + 1 */
245 		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
246 		*insn++ = BPF_EXIT_INSN();
247 		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
248 			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
249 					    offsetof(struct net_device, ifindex));
250 		else
251 			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
252 					    offsetof(struct net_device, type));
253 		break;
254 
255 	case SKF_AD_OFF + SKF_AD_MARK:
256 		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
257 		insn += cnt - 1;
258 		break;
259 
260 	case SKF_AD_OFF + SKF_AD_RXHASH:
261 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
262 
263 		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
264 				    offsetof(struct sk_buff, hash));
265 		break;
266 
267 	case SKF_AD_OFF + SKF_AD_QUEUE:
268 		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
269 		insn += cnt - 1;
270 		break;
271 
272 	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
273 		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
274 					 BPF_REG_A, BPF_REG_CTX, insn);
275 		insn += cnt - 1;
276 		break;
277 
278 	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
279 		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
280 					 BPF_REG_A, BPF_REG_CTX, insn);
281 		insn += cnt - 1;
282 		break;
283 
284 	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
285 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
286 
287 		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
288 		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
289 				      offsetof(struct sk_buff, vlan_proto));
290 		/* A = ntohs(A) [emitting a nop or swap16] */
291 		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
292 		break;
293 
294 	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
295 	case SKF_AD_OFF + SKF_AD_NLATTR:
296 	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
297 	case SKF_AD_OFF + SKF_AD_CPU:
298 	case SKF_AD_OFF + SKF_AD_RANDOM:
299 		/* arg1 = CTX */
300 		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
301 		/* arg2 = A */
302 		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
303 		/* arg3 = X */
304 		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
305 		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
306 		switch (fp->k) {
307 		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
308 			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
309 			break;
310 		case SKF_AD_OFF + SKF_AD_NLATTR:
311 			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
312 			break;
313 		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
314 			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
315 			break;
316 		case SKF_AD_OFF + SKF_AD_CPU:
317 			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
318 			break;
319 		case SKF_AD_OFF + SKF_AD_RANDOM:
320 			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
321 			bpf_user_rnd_init_once();
322 			break;
323 		}
324 		break;
325 
326 	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
327 		/* A ^= X */
328 		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
329 		break;
330 
331 	default:
332 		/* This is just a dummy call to avoid letting the compiler
333 		 * evict __bpf_call_base() as an optimization. Placed here
334 		 * where no-one bothers.
335 		 */
336 		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
337 		return false;
338 	}
339 
340 	*insnp = insn;
341 	return true;
342 }
343 
344 /**
345  *	bpf_convert_filter - convert filter program
346  *	@prog: the user passed filter program
347  *	@len: the length of the user passed filter program
348  *	@new_prog: buffer where converted program will be stored
349  *	@new_len: pointer to store length of converted program
350  *
351  * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
352  * Conversion workflow:
353  *
354  * 1) First pass for calculating the new program length:
355  *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
356  *
357  * 2) Second call to remap in two internal passes: the first pass finds
358  *    new jump offsets, the second pass does the remapping:
359  *   new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
360  *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
361  */
362 static int bpf_convert_filter(struct sock_filter *prog, int len,
363 			      struct bpf_insn *new_prog, int *new_len)
364 {
365 	int new_flen = 0, pass = 0, target, i;
366 	struct bpf_insn *new_insn;
367 	struct sock_filter *fp;
368 	int *addrs = NULL;
369 	u8 bpf_src;
370 
371 	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
372 	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
373 
374 	if (len <= 0 || len > BPF_MAXINSNS)
375 		return -EINVAL;
376 
377 	if (new_prog) {
378 		addrs = kcalloc(len, sizeof(*addrs),
379 				GFP_KERNEL | __GFP_NOWARN);
380 		if (!addrs)
381 			return -ENOMEM;
382 	}
383 
384 do_pass:
385 	new_insn = new_prog;
386 	fp = prog;
387 
388 	/* Classic BPF related prologue emission. */
389 	if (new_insn) {
390 		/* Classic BPF expects A and X to be reset first. These need
391 		 * to be guaranteed to be the first two instructions.
392 		 */
393 		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
394 		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
395 
396 		/* All programs must keep CTX in callee saved BPF_REG_CTX.
397 		 * In the eBPF case it's done by the compiler; here we need to
398 		 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
399 		 */
400 		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
401 	} else {
402 		new_insn += 3;
403 	}
404 
405 	for (i = 0; i < len; fp++, i++) {
406 		struct bpf_insn tmp_insns[6] = { };
407 		struct bpf_insn *insn = tmp_insns;
408 
409 		if (addrs)
410 			addrs[i] = new_insn - new_prog;
411 
412 		switch (fp->code) {
413 		/* All arithmetic insns and skb loads map as-is. */
414 		case BPF_ALU | BPF_ADD | BPF_X:
415 		case BPF_ALU | BPF_ADD | BPF_K:
416 		case BPF_ALU | BPF_SUB | BPF_X:
417 		case BPF_ALU | BPF_SUB | BPF_K:
418 		case BPF_ALU | BPF_AND | BPF_X:
419 		case BPF_ALU | BPF_AND | BPF_K:
420 		case BPF_ALU | BPF_OR | BPF_X:
421 		case BPF_ALU | BPF_OR | BPF_K:
422 		case BPF_ALU | BPF_LSH | BPF_X:
423 		case BPF_ALU | BPF_LSH | BPF_K:
424 		case BPF_ALU | BPF_RSH | BPF_X:
425 		case BPF_ALU | BPF_RSH | BPF_K:
426 		case BPF_ALU | BPF_XOR | BPF_X:
427 		case BPF_ALU | BPF_XOR | BPF_K:
428 		case BPF_ALU | BPF_MUL | BPF_X:
429 		case BPF_ALU | BPF_MUL | BPF_K:
430 		case BPF_ALU | BPF_DIV | BPF_X:
431 		case BPF_ALU | BPF_DIV | BPF_K:
432 		case BPF_ALU | BPF_MOD | BPF_X:
433 		case BPF_ALU | BPF_MOD | BPF_K:
434 		case BPF_ALU | BPF_NEG:
435 		case BPF_LD | BPF_ABS | BPF_W:
436 		case BPF_LD | BPF_ABS | BPF_H:
437 		case BPF_LD | BPF_ABS | BPF_B:
438 		case BPF_LD | BPF_IND | BPF_W:
439 		case BPF_LD | BPF_IND | BPF_H:
440 		case BPF_LD | BPF_IND | BPF_B:
441 			/* Check for overloaded BPF extension and
442 			 * directly convert it if found, otherwise
443 			 * just move on with mapping.
444 			 */
445 			if (BPF_CLASS(fp->code) == BPF_LD &&
446 			    BPF_MODE(fp->code) == BPF_ABS &&
447 			    convert_bpf_extensions(fp, &insn))
448 				break;
449 
450 			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
451 			break;
452 
453 		/* Jump transformation cannot use BPF block macros
454 		 * everywhere as offset calculation and target updates
455 		 * require a bit more work than the rest, i.e. jump
456 		 * opcodes map as-is, but offsets need adjustment.
457 		 */
458 
459 #define BPF_EMIT_JMP							\
460 	do {								\
461 		if (target >= len || target < 0)			\
462 			goto err;					\
463 		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
464 		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
465 		insn->off -= insn - tmp_insns;				\
466 	} while (0)
467 
468 		case BPF_JMP | BPF_JA:
469 			target = i + fp->k + 1;
470 			insn->code = fp->code;
471 			BPF_EMIT_JMP;
472 			break;
473 
474 		case BPF_JMP | BPF_JEQ | BPF_K:
475 		case BPF_JMP | BPF_JEQ | BPF_X:
476 		case BPF_JMP | BPF_JSET | BPF_K:
477 		case BPF_JMP | BPF_JSET | BPF_X:
478 		case BPF_JMP | BPF_JGT | BPF_K:
479 		case BPF_JMP | BPF_JGT | BPF_X:
480 		case BPF_JMP | BPF_JGE | BPF_K:
481 		case BPF_JMP | BPF_JGE | BPF_X:
482 			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
483 				/* BPF immediates are signed, zero extend
484 				 * immediate into tmp register and use it
485 				 * in compare insn.
486 				 */
487 				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
488 
489 				insn->dst_reg = BPF_REG_A;
490 				insn->src_reg = BPF_REG_TMP;
491 				bpf_src = BPF_X;
492 			} else {
493 				insn->dst_reg = BPF_REG_A;
494 				insn->imm = fp->k;
495 				bpf_src = BPF_SRC(fp->code);
496 				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
497 			}
498 
499 			/* Common case where 'jump_false' is next insn. */
500 			if (fp->jf == 0) {
501 				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
502 				target = i + fp->jt + 1;
503 				BPF_EMIT_JMP;
504 				break;
505 			}
506 
507 			/* Convert JEQ into JNE when 'jump_true' is next insn. */
508 			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
509 				insn->code = BPF_JMP | BPF_JNE | bpf_src;
510 				target = i + fp->jf + 1;
511 				BPF_EMIT_JMP;
512 				break;
513 			}
514 
515 			/* Other jumps are mapped into two insns: Jxx and JA. */
516 			target = i + fp->jt + 1;
517 			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
518 			BPF_EMIT_JMP;
519 			insn++;
520 
521 			insn->code = BPF_JMP | BPF_JA;
522 			target = i + fp->jf + 1;
523 			BPF_EMIT_JMP;
524 			break;
525 
526 		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
527 		case BPF_LDX | BPF_MSH | BPF_B:
528 			/* tmp = A */
529 			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
530 			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
531 			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
532 			/* A &= 0xf */
533 			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
534 			/* A <<= 2 */
535 			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
536 			/* X = A */
537 			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
538 			/* A = tmp */
539 			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
540 			break;
541 
542 		/* RET_K is remapped into 2 insns. The RET_A case doesn't need an
543 		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
544 		 */
545 		case BPF_RET | BPF_A:
546 		case BPF_RET | BPF_K:
547 			if (BPF_RVAL(fp->code) == BPF_K)
548 				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
549 							0, fp->k);
550 			*insn = BPF_EXIT_INSN();
551 			break;
552 
553 		/* Store to stack. */
554 		case BPF_ST:
555 		case BPF_STX:
556 			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
557 					    BPF_ST ? BPF_REG_A : BPF_REG_X,
558 					    -(BPF_MEMWORDS - fp->k) * 4);
559 			break;
560 
561 		/* Load from stack. */
562 		case BPF_LD | BPF_MEM:
563 		case BPF_LDX | BPF_MEM:
564 			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
565 					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
566 					    -(BPF_MEMWORDS - fp->k) * 4);
567 			break;
568 
569 		/* A = K or X = K */
570 		case BPF_LD | BPF_IMM:
571 		case BPF_LDX | BPF_IMM:
572 			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
573 					      BPF_REG_A : BPF_REG_X, fp->k);
574 			break;
575 
576 		/* X = A */
577 		case BPF_MISC | BPF_TAX:
578 			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
579 			break;
580 
581 		/* A = X */
582 		case BPF_MISC | BPF_TXA:
583 			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
584 			break;
585 
586 		/* A = skb->len or X = skb->len */
587 		case BPF_LD | BPF_W | BPF_LEN:
588 		case BPF_LDX | BPF_W | BPF_LEN:
589 			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
590 					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
591 					    offsetof(struct sk_buff, len));
592 			break;
593 
594 		/* Access seccomp_data fields. */
595 		case BPF_LDX | BPF_ABS | BPF_W:
596 			/* A = *(u32 *) (ctx + K) */
597 			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
598 			break;
599 
600 		/* Unknown instruction. */
601 		default:
602 			goto err;
603 		}
604 
605 		insn++;
606 		if (new_prog)
607 			memcpy(new_insn, tmp_insns,
608 			       sizeof(*insn) * (insn - tmp_insns));
609 		new_insn += insn - tmp_insns;
610 	}
611 
612 	if (!new_prog) {
613 		/* Only calculating new length. */
614 		*new_len = new_insn - new_prog;
615 		return 0;
616 	}
617 
618 	pass++;
619 	if (new_flen != new_insn - new_prog) {
620 		new_flen = new_insn - new_prog;
621 		if (pass > 2)
622 			goto err;
623 		goto do_pass;
624 	}
625 
626 	kfree(addrs);
627 	BUG_ON(*new_len != new_flen);
628 	return 0;
629 err:
630 	kfree(addrs);
631 	return -EINVAL;
632 }
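
/* Sketch of the two-call workflow documented above (illustrative only,
 * error handling trimmed): the first call computes the new length, the
 * second call fills a freshly allocated buffer. bpf_migrate_filter()
 * below is the real in-tree user of this pattern.
 */
static struct bpf_insn *example_convert(struct sock_filter *old_prog,
					int old_len)
{
	struct bpf_insn *new_prog;
	int new_len;

	/* 1st pass: only compute new_len */
	if (bpf_convert_filter(old_prog, old_len, NULL, &new_len))
		return NULL;

	new_prog = kcalloc(new_len, sizeof(*new_prog), GFP_KERNEL);
	if (!new_prog)
		return NULL;

	/* 2nd pass: actually remap into new_prog */
	if (bpf_convert_filter(old_prog, old_len, new_prog, &new_len)) {
		kfree(new_prog);
		return NULL;
	}
	return new_prog;
}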
633 
634 /* Security:
635  *
636  * As we don't want to clear the mem[] array for each packet going through
637  * __bpf_prog_run(), we check that a filter loaded by the user never tries
638  * to read a cell that was not previously written, and we check all branches
639  * to be sure a malicious user doesn't try to abuse us.
640  */
641 static int check_load_and_stores(const struct sock_filter *filter, int flen)
642 {
643 	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
644 	int pc, ret = 0;
645 
646 	BUILD_BUG_ON(BPF_MEMWORDS > 16);
647 
648 	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
649 	if (!masks)
650 		return -ENOMEM;
651 
652 	memset(masks, 0xff, flen * sizeof(*masks));
653 
654 	for (pc = 0; pc < flen; pc++) {
655 		memvalid &= masks[pc];
656 
657 		switch (filter[pc].code) {
658 		case BPF_ST:
659 		case BPF_STX:
660 			memvalid |= (1 << filter[pc].k);
661 			break;
662 		case BPF_LD | BPF_MEM:
663 		case BPF_LDX | BPF_MEM:
664 			if (!(memvalid & (1 << filter[pc].k))) {
665 				ret = -EINVAL;
666 				goto error;
667 			}
668 			break;
669 		case BPF_JMP | BPF_JA:
670 			/* A jump must set masks on target */
671 			masks[pc + 1 + filter[pc].k] &= memvalid;
672 			memvalid = ~0;
673 			break;
674 		case BPF_JMP | BPF_JEQ | BPF_K:
675 		case BPF_JMP | BPF_JEQ | BPF_X:
676 		case BPF_JMP | BPF_JGE | BPF_K:
677 		case BPF_JMP | BPF_JGE | BPF_X:
678 		case BPF_JMP | BPF_JGT | BPF_K:
679 		case BPF_JMP | BPF_JGT | BPF_X:
680 		case BPF_JMP | BPF_JSET | BPF_K:
681 		case BPF_JMP | BPF_JSET | BPF_X:
682 			/* A jump must set masks on targets */
683 			masks[pc + 1 + filter[pc].jt] &= memvalid;
684 			masks[pc + 1 + filter[pc].jf] &= memvalid;
685 			memvalid = ~0;
686 			break;
687 		}
688 	}
689 error:
690 	kfree(masks);
691 	return ret;
692 }
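
/* Illustrative example of a program the check above rejects: mem[0] is
 * loaded without any prior ST/STX writing that cell, so its memvalid bit
 * is still clear when the BPF_LD | BPF_MEM instruction is reached.
 */
static const struct sock_filter example_rejected_prog[] = {
	BPF_STMT(BPF_LD | BPF_MEM, 0),	/* A = mem[0], never written */
	BPF_STMT(BPF_RET | BPF_A, 0),	/* return A */
};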
693 
694 static bool chk_code_allowed(u16 code_to_probe)
695 {
696 	static const bool codes[] = {
697 		/* 32 bit ALU operations */
698 		[BPF_ALU | BPF_ADD | BPF_K] = true,
699 		[BPF_ALU | BPF_ADD | BPF_X] = true,
700 		[BPF_ALU | BPF_SUB | BPF_K] = true,
701 		[BPF_ALU | BPF_SUB | BPF_X] = true,
702 		[BPF_ALU | BPF_MUL | BPF_K] = true,
703 		[BPF_ALU | BPF_MUL | BPF_X] = true,
704 		[BPF_ALU | BPF_DIV | BPF_K] = true,
705 		[BPF_ALU | BPF_DIV | BPF_X] = true,
706 		[BPF_ALU | BPF_MOD | BPF_K] = true,
707 		[BPF_ALU | BPF_MOD | BPF_X] = true,
708 		[BPF_ALU | BPF_AND | BPF_K] = true,
709 		[BPF_ALU | BPF_AND | BPF_X] = true,
710 		[BPF_ALU | BPF_OR | BPF_K] = true,
711 		[BPF_ALU | BPF_OR | BPF_X] = true,
712 		[BPF_ALU | BPF_XOR | BPF_K] = true,
713 		[BPF_ALU | BPF_XOR | BPF_X] = true,
714 		[BPF_ALU | BPF_LSH | BPF_K] = true,
715 		[BPF_ALU | BPF_LSH | BPF_X] = true,
716 		[BPF_ALU | BPF_RSH | BPF_K] = true,
717 		[BPF_ALU | BPF_RSH | BPF_X] = true,
718 		[BPF_ALU | BPF_NEG] = true,
719 		/* Load instructions */
720 		[BPF_LD | BPF_W | BPF_ABS] = true,
721 		[BPF_LD | BPF_H | BPF_ABS] = true,
722 		[BPF_LD | BPF_B | BPF_ABS] = true,
723 		[BPF_LD | BPF_W | BPF_LEN] = true,
724 		[BPF_LD | BPF_W | BPF_IND] = true,
725 		[BPF_LD | BPF_H | BPF_IND] = true,
726 		[BPF_LD | BPF_B | BPF_IND] = true,
727 		[BPF_LD | BPF_IMM] = true,
728 		[BPF_LD | BPF_MEM] = true,
729 		[BPF_LDX | BPF_W | BPF_LEN] = true,
730 		[BPF_LDX | BPF_B | BPF_MSH] = true,
731 		[BPF_LDX | BPF_IMM] = true,
732 		[BPF_LDX | BPF_MEM] = true,
733 		/* Store instructions */
734 		[BPF_ST] = true,
735 		[BPF_STX] = true,
736 		/* Misc instructions */
737 		[BPF_MISC | BPF_TAX] = true,
738 		[BPF_MISC | BPF_TXA] = true,
739 		/* Return instructions */
740 		[BPF_RET | BPF_K] = true,
741 		[BPF_RET | BPF_A] = true,
742 		/* Jump instructions */
743 		[BPF_JMP | BPF_JA] = true,
744 		[BPF_JMP | BPF_JEQ | BPF_K] = true,
745 		[BPF_JMP | BPF_JEQ | BPF_X] = true,
746 		[BPF_JMP | BPF_JGE | BPF_K] = true,
747 		[BPF_JMP | BPF_JGE | BPF_X] = true,
748 		[BPF_JMP | BPF_JGT | BPF_K] = true,
749 		[BPF_JMP | BPF_JGT | BPF_X] = true,
750 		[BPF_JMP | BPF_JSET | BPF_K] = true,
751 		[BPF_JMP | BPF_JSET | BPF_X] = true,
752 	};
753 
754 	if (code_to_probe >= ARRAY_SIZE(codes))
755 		return false;
756 
757 	return codes[code_to_probe];
758 }
759 
760 static bool bpf_check_basics_ok(const struct sock_filter *filter,
761 				unsigned int flen)
762 {
763 	if (filter == NULL)
764 		return false;
765 	if (flen == 0 || flen > BPF_MAXINSNS)
766 		return false;
767 
768 	return true;
769 }
770 
771 /**
772  *	bpf_check_classic - verify socket filter code
773  *	@filter: filter to verify
774  *	@flen: length of filter
775  *
776  * Check the user's filter code. If we let some ugly
777  * filter code slip through, kaboom! The filter must contain
778  * no references or jumps that are out of range, no illegal
779  * instructions, and must end with a RET instruction.
780  *
781  * All jumps are forward as they are not signed.
782  *
783  * Returns 0 if the rule set is legal or -EINVAL if not.
784  */
785 static int bpf_check_classic(const struct sock_filter *filter,
786 			     unsigned int flen)
787 {
788 	bool anc_found;
789 	int pc;
790 
791 	/* Check the filter code now */
792 	for (pc = 0; pc < flen; pc++) {
793 		const struct sock_filter *ftest = &filter[pc];
794 
795 		/* May we actually operate on this code? */
796 		if (!chk_code_allowed(ftest->code))
797 			return -EINVAL;
798 
799 		/* Some instructions need special checks */
800 		switch (ftest->code) {
801 		case BPF_ALU | BPF_DIV | BPF_K:
802 		case BPF_ALU | BPF_MOD | BPF_K:
803 			/* Check for division by zero */
804 			if (ftest->k == 0)
805 				return -EINVAL;
806 			break;
807 		case BPF_ALU | BPF_LSH | BPF_K:
808 		case BPF_ALU | BPF_RSH | BPF_K:
809 			if (ftest->k >= 32)
810 				return -EINVAL;
811 			break;
812 		case BPF_LD | BPF_MEM:
813 		case BPF_LDX | BPF_MEM:
814 		case BPF_ST:
815 		case BPF_STX:
816 			/* Check for invalid memory addresses */
817 			if (ftest->k >= BPF_MEMWORDS)
818 				return -EINVAL;
819 			break;
820 		case BPF_JMP | BPF_JA:
821 			/* Note, the large ftest->k might cause loops.
822 			 * Compare this with conditional jumps below,
823 			 * where offsets are limited. --ANK (981016)
824 			 */
825 			if (ftest->k >= (unsigned int)(flen - pc - 1))
826 				return -EINVAL;
827 			break;
828 		case BPF_JMP | BPF_JEQ | BPF_K:
829 		case BPF_JMP | BPF_JEQ | BPF_X:
830 		case BPF_JMP | BPF_JGE | BPF_K:
831 		case BPF_JMP | BPF_JGE | BPF_X:
832 		case BPF_JMP | BPF_JGT | BPF_K:
833 		case BPF_JMP | BPF_JGT | BPF_X:
834 		case BPF_JMP | BPF_JSET | BPF_K:
835 		case BPF_JMP | BPF_JSET | BPF_X:
836 			/* Both conditionals must be safe */
837 			if (pc + ftest->jt + 1 >= flen ||
838 			    pc + ftest->jf + 1 >= flen)
839 				return -EINVAL;
840 			break;
841 		case BPF_LD | BPF_W | BPF_ABS:
842 		case BPF_LD | BPF_H | BPF_ABS:
843 		case BPF_LD | BPF_B | BPF_ABS:
844 			anc_found = false;
845 			if (bpf_anc_helper(ftest) & BPF_ANC)
846 				anc_found = true;
847 			/* Ancillary operation unknown or unsupported */
848 			if (anc_found == false && ftest->k >= SKF_AD_OFF)
849 				return -EINVAL;
850 		}
851 	}
852 
853 	/* Last instruction must be a RET code */
854 	switch (filter[flen - 1].code) {
855 	case BPF_RET | BPF_K:
856 	case BPF_RET | BPF_A:
857 		return check_load_and_stores(filter, flen);
858 	}
859 
860 	return -EINVAL;
861 }
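
/* Minimal classic program that satisfies the rules above: the only memory
 * cell that is read was written first, no jump is out of range, and the
 * last instruction is a RET. The accept length of 0xffff is arbitrary.
 */
static const struct sock_filter example_ok_prog[] = {
	BPF_STMT(BPF_ST, 0),			/* mem[0] = A		*/
	BPF_STMT(BPF_LD | BPF_MEM, 0),		/* A = mem[0]		*/
	BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* accept up to 64k	*/
};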
862 
863 static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
864 				      const struct sock_fprog *fprog)
865 {
866 	unsigned int fsize = bpf_classic_proglen(fprog);
867 	struct sock_fprog_kern *fkprog;
868 
869 	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
870 	if (!fp->orig_prog)
871 		return -ENOMEM;
872 
873 	fkprog = fp->orig_prog;
874 	fkprog->len = fprog->len;
875 
876 	fkprog->filter = kmemdup(fp->insns, fsize,
877 				 GFP_KERNEL | __GFP_NOWARN);
878 	if (!fkprog->filter) {
879 		kfree(fp->orig_prog);
880 		return -ENOMEM;
881 	}
882 
883 	return 0;
884 }
885 
886 static void bpf_release_orig_filter(struct bpf_prog *fp)
887 {
888 	struct sock_fprog_kern *fprog = fp->orig_prog;
889 
890 	if (fprog) {
891 		kfree(fprog->filter);
892 		kfree(fprog);
893 	}
894 }
895 
896 static void __bpf_prog_release(struct bpf_prog *prog)
897 {
898 	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
899 		bpf_prog_put(prog);
900 	} else {
901 		bpf_release_orig_filter(prog);
902 		bpf_prog_free(prog);
903 	}
904 }
905 
906 static void __sk_filter_release(struct sk_filter *fp)
907 {
908 	__bpf_prog_release(fp->prog);
909 	kfree(fp);
910 }
911 
912 /**
913  * 	sk_filter_release_rcu - Release a socket filter by rcu_head
914  *	@rcu: rcu_head that contains the sk_filter to free
915  */
916 static void sk_filter_release_rcu(struct rcu_head *rcu)
917 {
918 	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
919 
920 	__sk_filter_release(fp);
921 }
922 
923 /**
924  *	sk_filter_release - release a socket filter
925  *	@fp: filter to remove
926  *
927  *	Remove a filter from a socket and release its resources.
928  */
929 static void sk_filter_release(struct sk_filter *fp)
930 {
931 	if (atomic_dec_and_test(&fp->refcnt))
932 		call_rcu(&fp->rcu, sk_filter_release_rcu);
933 }
934 
935 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
936 {
937 	u32 filter_size = bpf_prog_size(fp->prog->len);
938 
939 	atomic_sub(filter_size, &sk->sk_omem_alloc);
940 	sk_filter_release(fp);
941 }
942 
943 /* Try to charge the socket memory if there is space available;
944  * return true on success.
945  */
946 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
947 {
948 	u32 filter_size = bpf_prog_size(fp->prog->len);
949 
950 	/* same check as in sock_kmalloc() */
951 	if (filter_size <= sysctl_optmem_max &&
952 	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
953 		atomic_inc(&fp->refcnt);
954 		atomic_add(filter_size, &sk->sk_omem_alloc);
955 		return true;
956 	}
957 	return false;
958 }
959 
960 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
961 {
962 	struct sock_filter *old_prog;
963 	struct bpf_prog *old_fp;
964 	int err, new_len, old_len = fp->len;
965 
966 	/* We are free to overwrite insns et al right here as they
967 	 * won't be used internally anymore at this point in time,
968 	 * after the migration to the internal BPF instruction
969 	 * representation.
970 	 */
971 	BUILD_BUG_ON(sizeof(struct sock_filter) !=
972 		     sizeof(struct bpf_insn));
973 
974 	/* Conversion cannot happen on overlapping memory areas,
975 	 * so we need to keep the user BPF around until the 2nd
976 	 * pass. At this time, the user BPF is stored in fp->insns.
977 	 */
978 	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
979 			   GFP_KERNEL | __GFP_NOWARN);
980 	if (!old_prog) {
981 		err = -ENOMEM;
982 		goto out_err;
983 	}
984 
985 	/* 1st pass: calculate the new program length. */
986 	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
987 	if (err)
988 		goto out_err_free;
989 
990 	/* Expand fp for appending the new filter representation. */
991 	old_fp = fp;
992 	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
993 	if (!fp) {
994 		/* The old_fp is still around in case we couldn't
995 		 * allocate new memory, so uncharge on that one.
996 		 */
997 		fp = old_fp;
998 		err = -ENOMEM;
999 		goto out_err_free;
1000 	}
1001 
1002 	fp->len = new_len;
1003 
1004 	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1005 	err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
1006 	if (err)
1007 		/* The 2nd bpf_convert_filter() can fail only if it fails
1008 		 * to allocate memory; the remapping must succeed. Note
1009 		 * that at this time old_fp has already been released
1010 		 * by krealloc().
1011 		 */
1012 		goto out_err_free;
1013 
1014 	/* We are guaranteed to never error here with cBPF to eBPF
1015 	 * transitions, since there's no issue with type compatibility
1016 	 * checks on program arrays.
1017 	 */
1018 	fp = bpf_prog_select_runtime(fp, &err);
1019 
1020 	kfree(old_prog);
1021 	return fp;
1022 
1023 out_err_free:
1024 	kfree(old_prog);
1025 out_err:
1026 	__bpf_prog_release(fp);
1027 	return ERR_PTR(err);
1028 }
1029 
1030 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1031 					   bpf_aux_classic_check_t trans)
1032 {
1033 	int err;
1034 
1035 	fp->bpf_func = NULL;
1036 	fp->jited = 0;
1037 
1038 	err = bpf_check_classic(fp->insns, fp->len);
1039 	if (err) {
1040 		__bpf_prog_release(fp);
1041 		return ERR_PTR(err);
1042 	}
1043 
1044 	/* There might be additional checks and transformations
1045 	 * needed on classic filters, e.g. in the case of seccomp.
1046 	 */
1047 	if (trans) {
1048 		err = trans(fp->insns, fp->len);
1049 		if (err) {
1050 			__bpf_prog_release(fp);
1051 			return ERR_PTR(err);
1052 		}
1053 	}
1054 
1055 	/* Probe if we can JIT compile the filter and if so, do
1056 	 * the compilation of the filter.
1057 	 */
1058 	bpf_jit_compile(fp);
1059 
1060 	/* JIT compiler couldn't process this filter, so do the
1061 	 * internal BPF translation for the optimized interpreter.
1062 	 */
1063 	if (!fp->jited)
1064 		fp = bpf_migrate_filter(fp);
1065 
1066 	return fp;
1067 }
1068 
1069 /**
1070  *	bpf_prog_create - create an unattached filter
1071  *	@pfp: the unattached filter that is created
1072  *	@fprog: the filter program
1073  *
1074  * Create a filter independent of any socket. We first run some
1075  * sanity checks on it to make sure it does not explode on us later.
1076  * If an error occurs or there is insufficient memory for the filter
1077  * a negative errno code is returned. On success the return is zero.
1078  */
1079 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1080 {
1081 	unsigned int fsize = bpf_classic_proglen(fprog);
1082 	struct bpf_prog *fp;
1083 
1084 	/* Make sure the new filter is there and of a valid length. */
1085 	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1086 		return -EINVAL;
1087 
1088 	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1089 	if (!fp)
1090 		return -ENOMEM;
1091 
1092 	memcpy(fp->insns, fprog->filter, fsize);
1093 
1094 	fp->len = fprog->len;
1095 	/* Since unattached filters are not copied back to user
1096 	 * space through sk_get_filter(), we do not need to hold
1097 	 * a copy here, and can spare ourselves the work.
1098 	 */
1099 	fp->orig_prog = NULL;
1100 
1101 	/* bpf_prepare_filter() already takes care of freeing
1102 	 * memory in case something goes wrong.
1103 	 */
1104 	fp = bpf_prepare_filter(fp, NULL);
1105 	if (IS_ERR(fp))
1106 		return PTR_ERR(fp);
1107 
1108 	*pfp = fp;
1109 	return 0;
1110 }
1111 EXPORT_SYMBOL_GPL(bpf_prog_create);
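
/* Usage sketch for the API above (function and variable names are
 * illustrative): build a trivial "accept everything" classic filter and
 * turn it into an unattached eBPF program. The caller owns the result
 * and releases it with bpf_prog_destroy().
 */
static struct bpf_prog *example_accept_all(void)
{
	static struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
	};
	struct sock_fprog_kern fprog = {
		.len	= ARRAY_SIZE(insns),
		.filter	= insns,
	};
	struct bpf_prog *fp;

	if (bpf_prog_create(&fp, &fprog))
		return NULL;
	return fp;
}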
1112 
1113 /**
1114  *	bpf_prog_create_from_user - create an unattached filter from user buffer
1115  *	@pfp: the unattached filter that is created
1116  *	@fprog: the filter program
1117  *	@trans: post-classic verifier transformation handler
1118  *	@save_orig: save classic BPF program
1119  *
1120  * This function effectively does the same as bpf_prog_create(), except
1121  * that it builds up its insns buffer from a user space provided buffer.
1122  * It also allows for passing a bpf_aux_classic_check_t handler.
1123  */
1124 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
1125 			      bpf_aux_classic_check_t trans, bool save_orig)
1126 {
1127 	unsigned int fsize = bpf_classic_proglen(fprog);
1128 	struct bpf_prog *fp;
1129 	int err;
1130 
1131 	/* Make sure the new filter is there and of a valid length. */
1132 	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1133 		return -EINVAL;
1134 
1135 	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1136 	if (!fp)
1137 		return -ENOMEM;
1138 
1139 	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1140 		__bpf_prog_free(fp);
1141 		return -EFAULT;
1142 	}
1143 
1144 	fp->len = fprog->len;
1145 	fp->orig_prog = NULL;
1146 
1147 	if (save_orig) {
1148 		err = bpf_prog_store_orig_filter(fp, fprog);
1149 		if (err) {
1150 			__bpf_prog_free(fp);
1151 			return -ENOMEM;
1152 		}
1153 	}
1154 
1155 	/* bpf_prepare_filter() already takes care of freeing
1156 	 * memory in case something goes wrong.
1157 	 */
1158 	fp = bpf_prepare_filter(fp, trans);
1159 	if (IS_ERR(fp))
1160 		return PTR_ERR(fp);
1161 
1162 	*pfp = fp;
1163 	return 0;
1164 }
1165 EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
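
/* Simplified model of how a user such as seccomp consumes this API: a
 * post-classic-check transformation handler is passed in and the original
 * filter is kept (save_orig) so it can be dumped back later. The no-op
 * handler below is a stand-in for illustration, not the real seccomp one.
 */
static int example_trans_noop(struct sock_filter *filter, unsigned int flen)
{
	return 0;	/* accept the classic filter unchanged */
}

static struct bpf_prog *example_from_user(struct sock_fprog *ufprog)
{
	struct bpf_prog *fp;

	if (bpf_prog_create_from_user(&fp, ufprog, example_trans_noop, true))
		return NULL;
	return fp;
}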
1166 
1167 void bpf_prog_destroy(struct bpf_prog *fp)
1168 {
1169 	__bpf_prog_release(fp);
1170 }
1171 EXPORT_SYMBOL_GPL(bpf_prog_destroy);
1172 
1173 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1174 {
1175 	struct sk_filter *fp, *old_fp;
1176 
1177 	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1178 	if (!fp)
1179 		return -ENOMEM;
1180 
1181 	fp->prog = prog;
1182 	atomic_set(&fp->refcnt, 0);
1183 
1184 	if (!sk_filter_charge(sk, fp)) {
1185 		kfree(fp);
1186 		return -ENOMEM;
1187 	}
1188 
1189 	old_fp = rcu_dereference_protected(sk->sk_filter,
1190 					   lockdep_sock_is_held(sk));
1191 	rcu_assign_pointer(sk->sk_filter, fp);
1192 
1193 	if (old_fp)
1194 		sk_filter_uncharge(sk, old_fp);
1195 
1196 	return 0;
1197 }
1198 
1199 static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
1200 {
1201 	struct bpf_prog *old_prog;
1202 	int err;
1203 
1204 	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
1205 		return -ENOMEM;
1206 
1207 	if (sk_unhashed(sk) && sk->sk_reuseport) {
1208 		err = reuseport_alloc(sk);
1209 		if (err)
1210 			return err;
1211 	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
1212 		/* The socket wasn't bound with SO_REUSEPORT */
1213 		return -EINVAL;
1214 	}
1215 
1216 	old_prog = reuseport_attach_prog(sk, prog);
1217 	if (old_prog)
1218 		bpf_prog_destroy(old_prog);
1219 
1220 	return 0;
1221 }
1222 
1223 static
1224 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1225 {
1226 	unsigned int fsize = bpf_classic_proglen(fprog);
1227 	struct bpf_prog *prog;
1228 	int err;
1229 
1230 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
1231 		return ERR_PTR(-EPERM);
1232 
1233 	/* Make sure the new filter is there and of a valid length. */
1234 	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1235 		return ERR_PTR(-EINVAL);
1236 
1237 	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1238 	if (!prog)
1239 		return ERR_PTR(-ENOMEM);
1240 
1241 	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1242 		__bpf_prog_free(prog);
1243 		return ERR_PTR(-EFAULT);
1244 	}
1245 
1246 	prog->len = fprog->len;
1247 
1248 	err = bpf_prog_store_orig_filter(prog, fprog);
1249 	if (err) {
1250 		__bpf_prog_free(prog);
1251 		return ERR_PTR(-ENOMEM);
1252 	}
1253 
1254 	/* bpf_prepare_filter() already takes care of freeing
1255 	 * memory in case something goes wrong.
1256 	 */
1257 	return bpf_prepare_filter(prog, NULL);
1258 }
1259 
1260 /**
1261  *	sk_attach_filter - attach a socket filter
1262  *	@fprog: the filter program
1263  *	@sk: the socket to use
1264  *
1265  * Attach the user's filter code. We first run some sanity checks on
1266  * it to make sure it does not explode on us later. If an error
1267  * occurs or there is insufficient memory for the filter a negative
1268  * errno code is returned. On success the return is zero.
1269  */
1270 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1271 {
1272 	struct bpf_prog *prog = __get_filter(fprog, sk);
1273 	int err;
1274 
1275 	if (IS_ERR(prog))
1276 		return PTR_ERR(prog);
1277 
1278 	err = __sk_attach_prog(prog, sk);
1279 	if (err < 0) {
1280 		__bpf_prog_release(prog);
1281 		return err;
1282 	}
1283 
1284 	return 0;
1285 }
1286 EXPORT_SYMBOL_GPL(sk_attach_filter);
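
/* Sketch of the setsockopt(SO_ATTACH_FILTER) path that ends up here; the
 * real call site lives in net/core/sock.c, this is only a simplified
 * model: copy the struct sock_fprog from user space, then attach while
 * holding the socket lock.
 */
static int example_so_attach_filter(struct sock *sk, char __user *optval,
				    unsigned int optlen)
{
	struct sock_fprog fprog;
	int ret;

	if (optlen < sizeof(fprog))
		return -EINVAL;
	if (copy_from_user(&fprog, optval, sizeof(fprog)))
		return -EFAULT;

	lock_sock(sk);
	ret = sk_attach_filter(&fprog, sk);
	release_sock(sk);

	return ret;
}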
1287 
1288 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1289 {
1290 	struct bpf_prog *prog = __get_filter(fprog, sk);
1291 	int err;
1292 
1293 	if (IS_ERR(prog))
1294 		return PTR_ERR(prog);
1295 
1296 	err = __reuseport_attach_prog(prog, sk);
1297 	if (err < 0) {
1298 		__bpf_prog_release(prog);
1299 		return err;
1300 	}
1301 
1302 	return 0;
1303 }
1304 
1305 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1306 {
1307 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
1308 		return ERR_PTR(-EPERM);
1309 
1310 	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1311 }
1312 
1313 int sk_attach_bpf(u32 ufd, struct sock *sk)
1314 {
1315 	struct bpf_prog *prog = __get_bpf(ufd, sk);
1316 	int err;
1317 
1318 	if (IS_ERR(prog))
1319 		return PTR_ERR(prog);
1320 
1321 	err = __sk_attach_prog(prog, sk);
1322 	if (err < 0) {
1323 		bpf_prog_put(prog);
1324 		return err;
1325 	}
1326 
1327 	return 0;
1328 }
1329 
1330 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1331 {
1332 	struct bpf_prog *prog = __get_bpf(ufd, sk);
1333 	int err;
1334 
1335 	if (IS_ERR(prog))
1336 		return PTR_ERR(prog);
1337 
1338 	err = __reuseport_attach_prog(prog, sk);
1339 	if (err < 0) {
1340 		bpf_prog_put(prog);
1341 		return err;
1342 	}
1343 
1344 	return 0;
1345 }
1346 
1347 struct bpf_scratchpad {
1348 	union {
1349 		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
1350 		u8     buff[MAX_BPF_STACK];
1351 	};
1352 };
1353 
1354 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
1355 
1356 static inline int __bpf_try_make_writable(struct sk_buff *skb,
1357 					  unsigned int write_len)
1358 {
1359 	return skb_ensure_writable(skb, write_len);
1360 }
1361 
1362 static inline int bpf_try_make_writable(struct sk_buff *skb,
1363 					unsigned int write_len)
1364 {
1365 	int err = __bpf_try_make_writable(skb, write_len);
1366 
1367 	bpf_compute_data_end(skb);
1368 	return err;
1369 }
1370 
1371 static int bpf_try_make_head_writable(struct sk_buff *skb)
1372 {
1373 	return bpf_try_make_writable(skb, skb_headlen(skb));
1374 }
1375 
1376 static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1377 {
1378 	if (skb_at_tc_ingress(skb))
1379 		skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1380 }
1381 
1382 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1383 {
1384 	if (skb_at_tc_ingress(skb))
1385 		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1386 }
1387 
1388 BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
1389 	   const void *, from, u32, len, u64, flags)
1390 {
1391 	void *ptr;
1392 
1393 	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
1394 		return -EINVAL;
1395 	if (unlikely(offset > 0xffff))
1396 		return -EFAULT;
1397 	if (unlikely(bpf_try_make_writable(skb, offset + len)))
1398 		return -EFAULT;
1399 
1400 	ptr = skb->data + offset;
1401 	if (flags & BPF_F_RECOMPUTE_CSUM)
1402 		__skb_postpull_rcsum(skb, ptr, len, offset);
1403 
1404 	memcpy(ptr, from, len);
1405 
1406 	if (flags & BPF_F_RECOMPUTE_CSUM)
1407 		__skb_postpush_rcsum(skb, ptr, len, offset);
1408 	if (flags & BPF_F_INVALIDATE_HASH)
1409 		skb_clear_hash(skb);
1410 
1411 	return 0;
1412 }
1413 
1414 static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1415 	.func		= bpf_skb_store_bytes,
1416 	.gpl_only	= false,
1417 	.ret_type	= RET_INTEGER,
1418 	.arg1_type	= ARG_PTR_TO_CTX,
1419 	.arg2_type	= ARG_ANYTHING,
1420 	.arg3_type	= ARG_PTR_TO_MEM,
1421 	.arg4_type	= ARG_CONST_SIZE,
1422 	.arg5_type	= ARG_ANYTHING,
1423 };
1424 
1425 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
1426 	   void *, to, u32, len)
1427 {
1428 	void *ptr;
1429 
1430 	if (unlikely(offset > 0xffff))
1431 		goto err_clear;
1432 
1433 	ptr = skb_header_pointer(skb, offset, len, to);
1434 	if (unlikely(!ptr))
1435 		goto err_clear;
1436 	if (ptr != to)
1437 		memcpy(to, ptr, len);
1438 
1439 	return 0;
1440 err_clear:
1441 	memset(to, 0, len);
1442 	return -EFAULT;
1443 }
1444 
1445 static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1446 	.func		= bpf_skb_load_bytes,
1447 	.gpl_only	= false,
1448 	.ret_type	= RET_INTEGER,
1449 	.arg1_type	= ARG_PTR_TO_CTX,
1450 	.arg2_type	= ARG_ANYTHING,
1451 	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
1452 	.arg4_type	= ARG_CONST_SIZE,
1453 };
1454 
1455 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1456 {
1457 	/* The idea is the following: should the needed direct read/write
1458 	 * test fail at runtime, we can pull in more data and redo the
1459 	 * test, since implicitly we invalidate the previous checks here.
1460 	 *
1461 	 * Or, since we know how much we need to make readable/writable,
1462 	 * this can be done once at the beginning of the program for the
1463 	 * direct access case. By this we overcome the limitation of only
1464 	 * the current headroom being accessible.
1465 	 */
1466 	return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
1467 }
1468 
1469 static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1470 	.func		= bpf_skb_pull_data,
1471 	.gpl_only	= false,
1472 	.ret_type	= RET_INTEGER,
1473 	.arg1_type	= ARG_PTR_TO_CTX,
1474 	.arg2_type	= ARG_ANYTHING,
1475 };
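
/* Program-side sketch of the pattern described above (tc/eBPF C, not kernel
 * code; shown as a comment since it cannot live in this file, and the helper
 * declarations come from the usual BPF headers; "need" is a placeholder):
 *
 *	void *data = (void *)(long)skb->data;
 *	void *data_end = (void *)(long)skb->data_end;
 *
 *	if (data + need > data_end) {
 *		if (bpf_skb_pull_data(skb, need))
 *			return TC_ACT_SHOT;
 *		// data/data_end were invalidated, reload and re-check
 *		data = (void *)(long)skb->data;
 *		data_end = (void *)(long)skb->data_end;
 *		if (data + need > data_end)
 *			return TC_ACT_SHOT;
 *	}
 */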
1476 
1477 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1478 	   u64, from, u64, to, u64, flags)
1479 {
1480 	__sum16 *ptr;
1481 
1482 	if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1483 		return -EINVAL;
1484 	if (unlikely(offset > 0xffff || offset & 1))
1485 		return -EFAULT;
1486 	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1487 		return -EFAULT;
1488 
1489 	ptr = (__sum16 *)(skb->data + offset);
1490 	switch (flags & BPF_F_HDR_FIELD_MASK) {
1491 	case 0:
1492 		if (unlikely(from != 0))
1493 			return -EINVAL;
1494 
1495 		csum_replace_by_diff(ptr, to);
1496 		break;
1497 	case 2:
1498 		csum_replace2(ptr, from, to);
1499 		break;
1500 	case 4:
1501 		csum_replace4(ptr, from, to);
1502 		break;
1503 	default:
1504 		return -EINVAL;
1505 	}
1506 
1507 	return 0;
1508 }
1509 
1510 static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1511 	.func		= bpf_l3_csum_replace,
1512 	.gpl_only	= false,
1513 	.ret_type	= RET_INTEGER,
1514 	.arg1_type	= ARG_PTR_TO_CTX,
1515 	.arg2_type	= ARG_ANYTHING,
1516 	.arg3_type	= ARG_ANYTHING,
1517 	.arg4_type	= ARG_ANYTHING,
1518 	.arg5_type	= ARG_ANYTHING,
1519 };
1520 
1521 BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1522 	   u64, from, u64, to, u64, flags)
1523 {
1524 	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1525 	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1526 	bool do_mforce = flags & BPF_F_MARK_ENFORCE;
1527 	__sum16 *ptr;
1528 
1529 	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
1530 			       BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
1531 		return -EINVAL;
1532 	if (unlikely(offset > 0xffff || offset & 1))
1533 		return -EFAULT;
1534 	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1535 		return -EFAULT;
1536 
1537 	ptr = (__sum16 *)(skb->data + offset);
1538 	if (is_mmzero && !do_mforce && !*ptr)
1539 		return 0;
1540 
1541 	switch (flags & BPF_F_HDR_FIELD_MASK) {
1542 	case 0:
1543 		if (unlikely(from != 0))
1544 			return -EINVAL;
1545 
1546 		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
1547 		break;
1548 	case 2:
1549 		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
1550 		break;
1551 	case 4:
1552 		inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
1553 		break;
1554 	default:
1555 		return -EINVAL;
1556 	}
1557 
1558 	if (is_mmzero && !*ptr)
1559 		*ptr = CSUM_MANGLED_0;
1560 	return 0;
1561 }
1562 
1563 static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
1564 	.func		= bpf_l4_csum_replace,
1565 	.gpl_only	= false,
1566 	.ret_type	= RET_INTEGER,
1567 	.arg1_type	= ARG_PTR_TO_CTX,
1568 	.arg2_type	= ARG_ANYTHING,
1569 	.arg3_type	= ARG_ANYTHING,
1570 	.arg4_type	= ARG_ANYTHING,
1571 	.arg5_type	= ARG_ANYTHING,
1572 };
1573 
1574 BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
1575 	   __be32 *, to, u32, to_size, __wsum, seed)
1576 {
1577 	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
1578 	u32 diff_size = from_size + to_size;
1579 	int i, j = 0;
1580 
1581 	/* This is quite flexible, some examples:
1582 	 *
1583 	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
1584 	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
1585 	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
1586 	 *
1587 	 * Even for diffing, from_size and to_size don't need to be equal.
1588 	 */
1589 	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
1590 		     diff_size > sizeof(sp->diff)))
1591 		return -EINVAL;
1592 
1593 	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
1594 		sp->diff[j] = ~from[i];
1595 	for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
1596 		sp->diff[j] = to[i];
1597 
1598 	return csum_partial(sp->diff, diff_size, seed);
1599 }
1600 
1601 static const struct bpf_func_proto bpf_csum_diff_proto = {
1602 	.func		= bpf_csum_diff,
1603 	.gpl_only	= false,
1604 	.pkt_access	= true,
1605 	.ret_type	= RET_INTEGER,
1606 	.arg1_type	= ARG_PTR_TO_MEM,
1607 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
1608 	.arg3_type	= ARG_PTR_TO_MEM,
1609 	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
1610 	.arg5_type	= ARG_ANYTHING,
1611 };
1612 
1613 BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
1614 {
1615 	/* The interface is to be used in combination with bpf_csum_diff()
1616 	 * for direct packet writes. csum rotation for alignment as well
1617 	 * as emulating csum_sub() can be done from the eBPF program.
1618 	 */
1619 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1620 		return (skb->csum = csum_add(skb->csum, csum));
1621 
1622 	return -ENOTSUPP;
1623 }
1624 
1625 static const struct bpf_func_proto bpf_csum_update_proto = {
1626 	.func		= bpf_csum_update,
1627 	.gpl_only	= false,
1628 	.ret_type	= RET_INTEGER,
1629 	.arg1_type	= ARG_PTR_TO_CTX,
1630 	.arg2_type	= ARG_ANYTHING,
1631 };
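
/* Program-side sketch of the pairing described above (tc/eBPF C, not kernel
 * code, shown as a comment; old_word/new_word/offset are placeholders):
 * compute a diff over the old and new bytes with bpf_csum_diff(), patch the
 * packet with bpf_skb_store_bytes(), then feed the diff back through
 * bpf_csum_update() so CHECKSUM_COMPLETE skbs stay consistent.
 *
 *	__be32 old_word, new_word;
 *	__wsum diff;
 *
 *	diff = bpf_csum_diff(&old_word, sizeof(old_word),
 *			     &new_word, sizeof(new_word), 0);
 *	bpf_skb_store_bytes(skb, offset, &new_word, sizeof(new_word), 0);
 *	bpf_csum_update(skb, diff);
 */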
1632 
1633 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
1634 {
1635 	return dev_forward_skb(dev, skb);
1636 }
1637 
1638 static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
1639 				      struct sk_buff *skb)
1640 {
1641 	int ret = ____dev_forward_skb(dev, skb);
1642 
1643 	if (likely(!ret)) {
1644 		skb->dev = dev;
1645 		ret = netif_rx(skb);
1646 	}
1647 
1648 	return ret;
1649 }
1650 
1651 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
1652 {
1653 	int ret;
1654 
1655 	if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
1656 		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
1657 		kfree_skb(skb);
1658 		return -ENETDOWN;
1659 	}
1660 
1661 	skb->dev = dev;
1662 
1663 	__this_cpu_inc(xmit_recursion);
1664 	ret = dev_queue_xmit(skb);
1665 	__this_cpu_dec(xmit_recursion);
1666 
1667 	return ret;
1668 }
1669 
1670 static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
1671 				 u32 flags)
1672 {
1673 	/* skb->mac_len is not set on normal egress */
1674 	unsigned int mlen = skb->network_header - skb->mac_header;
1675 
1676 	__skb_pull(skb, mlen);
1677 
1678 	/* At ingress, the mac header has already been pulled once.
1679 	 * At egress, skb_postpull_rcsum has to be done in case the
1680 	 * skb originated from ingress (i.e. a forwarded skb) to
1681 	 * ensure that the rcsum starts at the net header.
1682 	 */
1683 	if (!skb_at_tc_ingress(skb))
1684 		skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
1685 	skb_pop_mac_header(skb);
1686 	skb_reset_mac_len(skb);
1687 	return flags & BPF_F_INGRESS ?
1688 	       __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
1689 }
1690 
1691 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
1692 				 u32 flags)
1693 {
1694 	/* Verify that a link layer header is carried */
1695 	if (unlikely(skb->mac_header >= skb->network_header)) {
1696 		kfree_skb(skb);
1697 		return -ERANGE;
1698 	}
1699 
1700 	bpf_push_mac_rcsum(skb);
1701 	return flags & BPF_F_INGRESS ?
1702 	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
1703 }
1704 
1705 static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
1706 			  u32 flags)
1707 {
1708 	if (dev_is_mac_header_xmit(dev))
1709 		return __bpf_redirect_common(skb, dev, flags);
1710 	else
1711 		return __bpf_redirect_no_mac(skb, dev, flags);
1712 }
1713 
1714 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
1715 {
1716 	struct net_device *dev;
1717 	struct sk_buff *clone;
1718 	int ret;
1719 
1720 	if (unlikely(flags & ~(BPF_F_INGRESS)))
1721 		return -EINVAL;
1722 
1723 	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
1724 	if (unlikely(!dev))
1725 		return -EINVAL;
1726 
1727 	clone = skb_clone(skb, GFP_ATOMIC);
1728 	if (unlikely(!clone))
1729 		return -ENOMEM;
1730 
1731 	/* For direct write, we need to keep the invariant that the skbs
1732 	 * we're dealing with need to be uncloned. Should uncloning fail
1733 	 * here, we need to free the clone that was just generated before
1734 	 * returning.
1735 	 */
1736 	ret = bpf_try_make_head_writable(skb);
1737 	if (unlikely(ret)) {
1738 		kfree_skb(clone);
1739 		return -ENOMEM;
1740 	}
1741 
1742 	return __bpf_redirect(clone, dev, flags);
1743 }
1744 
1745 static const struct bpf_func_proto bpf_clone_redirect_proto = {
1746 	.func           = bpf_clone_redirect,
1747 	.gpl_only       = false,
1748 	.ret_type       = RET_INTEGER,
1749 	.arg1_type      = ARG_PTR_TO_CTX,
1750 	.arg2_type      = ARG_ANYTHING,
1751 	.arg3_type      = ARG_ANYTHING,
1752 };
1753 
1754 struct redirect_info {
1755 	u32 ifindex;
1756 	u32 flags;
1757 };
1758 
1759 static DEFINE_PER_CPU(struct redirect_info, redirect_info);
1760 
1761 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
1762 {
1763 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1764 
1765 	if (unlikely(flags & ~(BPF_F_INGRESS)))
1766 		return TC_ACT_SHOT;
1767 
1768 	ri->ifindex = ifindex;
1769 	ri->flags = flags;
1770 
1771 	return TC_ACT_REDIRECT;
1772 }
1773 
1774 int skb_do_redirect(struct sk_buff *skb)
1775 {
1776 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
1777 	struct net_device *dev;
1778 
1779 	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
1780 	ri->ifindex = 0;
1781 	if (unlikely(!dev)) {
1782 		kfree_skb(skb);
1783 		return -EINVAL;
1784 	}
1785 
1786 	return __bpf_redirect(skb, dev, ri->flags);
1787 }
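
/* Simplified model of the consumer side (illustrative; the real call site
 * is in the tc ingress/egress hot path): when the program returned
 * TC_ACT_REDIRECT via bpf_redirect(), the per-CPU redirect_info state set
 * above is consumed exactly once by skb_do_redirect().
 */
static int example_consume_verdict(struct sk_buff *skb, int tc_verdict)
{
	if (tc_verdict == TC_ACT_REDIRECT)
		return skb_do_redirect(skb);

	return tc_verdict;
}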
1788 
1789 static const struct bpf_func_proto bpf_redirect_proto = {
1790 	.func           = bpf_redirect,
1791 	.gpl_only       = false,
1792 	.ret_type       = RET_INTEGER,
1793 	.arg1_type      = ARG_ANYTHING,
1794 	.arg2_type      = ARG_ANYTHING,
1795 };
1796 
1797 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
1798 {
1799 	return task_get_classid(skb);
1800 }
1801 
1802 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
1803 	.func           = bpf_get_cgroup_classid,
1804 	.gpl_only       = false,
1805 	.ret_type       = RET_INTEGER,
1806 	.arg1_type      = ARG_PTR_TO_CTX,
1807 };
1808 
1809 BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
1810 {
1811 	return dst_tclassid(skb);
1812 }
1813 
1814 static const struct bpf_func_proto bpf_get_route_realm_proto = {
1815 	.func           = bpf_get_route_realm,
1816 	.gpl_only       = false,
1817 	.ret_type       = RET_INTEGER,
1818 	.arg1_type      = ARG_PTR_TO_CTX,
1819 };
1820 
1821 BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
1822 {
1823 	/* If skb_clear_hash() was called due to mangling, we can
1824 	 * trigger SW recalculation here. Later access to hash
1825 	 * can then use the inline skb->hash via context directly
1826 	 * instead of calling this helper again.
1827 	 */
1828 	return skb_get_hash(skb);
1829 }
1830 
1831 static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
1832 	.func		= bpf_get_hash_recalc,
1833 	.gpl_only	= false,
1834 	.ret_type	= RET_INTEGER,
1835 	.arg1_type	= ARG_PTR_TO_CTX,
1836 };
1837 
1838 BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
1839 {
1840 	/* After all direct packet write, this can be used once for
1841 	 * triggering a lazy recalc on next skb_get_hash() invocation.
1842 	 */
1843 	skb_clear_hash(skb);
1844 	return 0;
1845 }
1846 
1847 static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
1848 	.func		= bpf_set_hash_invalid,
1849 	.gpl_only	= false,
1850 	.ret_type	= RET_INTEGER,
1851 	.arg1_type	= ARG_PTR_TO_CTX,
1852 };
1853 
1854 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
1855 	   u16, vlan_tci)
1856 {
1857 	int ret;
1858 
1859 	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
1860 		     vlan_proto != htons(ETH_P_8021AD)))
1861 		vlan_proto = htons(ETH_P_8021Q);
1862 
1863 	bpf_push_mac_rcsum(skb);
1864 	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
1865 	bpf_pull_mac_rcsum(skb);
1866 
1867 	bpf_compute_data_end(skb);
1868 	return ret;
1869 }
1870 
1871 const struct bpf_func_proto bpf_skb_vlan_push_proto = {
1872 	.func           = bpf_skb_vlan_push,
1873 	.gpl_only       = false,
1874 	.ret_type       = RET_INTEGER,
1875 	.arg1_type      = ARG_PTR_TO_CTX,
1876 	.arg2_type      = ARG_ANYTHING,
1877 	.arg3_type      = ARG_ANYTHING,
1878 };
1879 EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);
1880 
1881 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
1882 {
1883 	int ret;
1884 
1885 	bpf_push_mac_rcsum(skb);
1886 	ret = skb_vlan_pop(skb);
1887 	bpf_pull_mac_rcsum(skb);
1888 
1889 	bpf_compute_data_end(skb);
1890 	return ret;
1891 }
1892 
1893 const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
1894 	.func           = bpf_skb_vlan_pop,
1895 	.gpl_only       = false,
1896 	.ret_type       = RET_INTEGER,
1897 	.arg1_type      = ARG_PTR_TO_CTX,
1898 };
1899 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);
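/* Illustrative sketch (not part of this file): a tc classifier fragment that
 * rewrites the outer VLAN tag with the two helpers above. Both helpers may
 * change packet data, so any direct packet pointers held across the calls
 * must be re-derived afterwards (cf. bpf_helper_changes_pkt_data() below).
 * The VLAN ID and helper declarations are assumptions of the example.
 *
 *	SEC("classifier")
 *	int retag(struct __sk_buff *skb)
 *	{
 *		if (skb->vlan_present)
 *			bpf_skb_vlan_pop(skb);
 *		// re-tag as 802.1Q with an assumed VLAN ID of 42
 *		bpf_skb_vlan_push(skb, bpf_htons(ETH_P_8021Q), 42);
 *		return TC_ACT_OK;
 *	}
 */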
1900 
1901 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
1902 {
1903 	/* Caller already did skb_cow() with len as headroom,
1904 	 * so no need to do it here.
1905 	 */
1906 	skb_push(skb, len);
1907 	memmove(skb->data, skb->data + len, off);
1908 	memset(skb->data + off, 0, len);
1909 
1910 	/* No skb_postpush_rcsum(skb, skb->data + off, len)
1911 	 * needed here as it does not change the skb->csum
1912 	 * result for checksum complete when summing over
1913 	 * zeroed blocks.
1914 	 */
1915 	return 0;
1916 }
1917 
1918 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
1919 {
1920 	/* skb_ensure_writable() is not needed here, as we're
1921 	 * already working on an uncloned skb.
1922 	 */
1923 	if (unlikely(!pskb_may_pull(skb, off + len)))
1924 		return -ENOMEM;
1925 
1926 	skb_postpull_rcsum(skb, skb->data + off, len);
1927 	memmove(skb->data + len, skb->data, off);
1928 	__skb_pull(skb, len);
1929 
1930 	return 0;
1931 }
1932 
1933 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
1934 {
1935 	bool trans_same = skb->transport_header == skb->network_header;
1936 	int ret;
1937 
1938 	/* There's no need for __skb_push()/__skb_pull() pair to
1939 	 * get to the start of the mac header as we're guaranteed
1940 	 * to always start from here under eBPF.
1941 	 */
1942 	ret = bpf_skb_generic_push(skb, off, len);
1943 	if (likely(!ret)) {
1944 		skb->mac_header -= len;
1945 		skb->network_header -= len;
1946 		if (trans_same)
1947 			skb->transport_header = skb->network_header;
1948 	}
1949 
1950 	return ret;
1951 }
1952 
1953 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
1954 {
1955 	bool trans_same = skb->transport_header == skb->network_header;
1956 	int ret;
1957 
1958 	/* Same here, __skb_push()/__skb_pull() pair not needed. */
1959 	ret = bpf_skb_generic_pop(skb, off, len);
1960 	if (likely(!ret)) {
1961 		skb->mac_header += len;
1962 		skb->network_header += len;
1963 		if (trans_same)
1964 			skb->transport_header = skb->network_header;
1965 	}
1966 
1967 	return ret;
1968 }
1969 
1970 static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
1971 {
1972 	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
1973 	u32 off = skb->network_header - skb->mac_header;
1974 	int ret;
1975 
1976 	ret = skb_cow(skb, len_diff);
1977 	if (unlikely(ret < 0))
1978 		return ret;
1979 
1980 	ret = bpf_skb_net_hdr_push(skb, off, len_diff);
1981 	if (unlikely(ret < 0))
1982 		return ret;
1983 
1984 	if (skb_is_gso(skb)) {
1985 		/* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to
1986 		 * be changed into SKB_GSO_TCPV6.
1987 		 */
1988 		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
1989 			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4;
1990 			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV6;
1991 		}
1992 
1993 		/* Due to IPv6 header, MSS needs to be downgraded. */
1994 		skb_shinfo(skb)->gso_size -= len_diff;
1995 		/* Header must be checked, and gso_segs recomputed. */
1996 		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1997 		skb_shinfo(skb)->gso_segs = 0;
1998 	}
1999 
2000 	skb->protocol = htons(ETH_P_IPV6);
2001 	skb_clear_hash(skb);
2002 
2003 	return 0;
2004 }
2005 
2006 static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
2007 {
2008 	const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
2009 	u32 off = skb->network_header - skb->mac_header;
2010 	int ret;
2011 
2012 	ret = skb_unclone(skb, GFP_ATOMIC);
2013 	if (unlikely(ret < 0))
2014 		return ret;
2015 
2016 	ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
2017 	if (unlikely(ret < 0))
2018 		return ret;
2019 
2020 	if (skb_is_gso(skb)) {
2021 		/* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to
2022 		 * be changed into SKB_GSO_TCPV4.
2023 		 */
2024 		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
2025 			skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6;
2026 			skb_shinfo(skb)->gso_type |=  SKB_GSO_TCPV4;
2027 		}
2028 
2029 		/* Due to IPv4 header, MSS can be upgraded. */
2030 		skb_shinfo(skb)->gso_size += len_diff;
2031 		/* Header must be checked, and gso_segs recomputed. */
2032 		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
2033 		skb_shinfo(skb)->gso_segs = 0;
2034 	}
2035 
2036 	skb->protocol = htons(ETH_P_IP);
2037 	skb_clear_hash(skb);
2038 
2039 	return 0;
2040 }
2041 
2042 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
2043 {
2044 	__be16 from_proto = skb->protocol;
2045 
2046 	if (from_proto == htons(ETH_P_IP) &&
2047 	      to_proto == htons(ETH_P_IPV6))
2048 		return bpf_skb_proto_4_to_6(skb);
2049 
2050 	if (from_proto == htons(ETH_P_IPV6) &&
2051 	      to_proto == htons(ETH_P_IP))
2052 		return bpf_skb_proto_6_to_4(skb);
2053 
2054 	return -ENOTSUPP;
2055 }
2056 
2057 BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
2058 	   u64, flags)
2059 {
2060 	int ret;
2061 
2062 	if (unlikely(flags))
2063 		return -EINVAL;
2064 
2065 	/* General idea is that this helper does the basic groundwork
2066 	 * needed for changing the protocol, and eBPF program fills the
2067 	 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
2068 	 * and other helpers, rather than passing a raw buffer here.
2069 	 *
2070 	 * The rationale is to keep this minimal and without a need to
2071 	 * deal with raw packet data. F.e. even if we would pass buffers
2072 	 * deal with raw packet data. E.g. even if we were to pass buffers
2073 	 * here, the program would still need to call the bpf_lX_csum_replace()
2074 	 * helpers anyway. Plus, this way we also keep a separation of
2075 	 * concerns, since e.g. bpf_skb_store_bytes() should only take
2076 	 * care of stores.
2077 	 * Currently, additional options and extension header space are
2078 	 * not supported, but flags register is reserved so we can adapt
2079 	 * that. For offloads, we mark packet as dodgy, so that headers
2080 	 * need to be verified first.
2081 	 */
2082 	ret = bpf_skb_proto_xlat(skb, proto);
2083 	bpf_compute_data_end(skb);
2084 	return ret;
2085 }
2086 
2087 static const struct bpf_func_proto bpf_skb_change_proto_proto = {
2088 	.func		= bpf_skb_change_proto,
2089 	.gpl_only	= false,
2090 	.ret_type	= RET_INTEGER,
2091 	.arg1_type	= ARG_PTR_TO_CTX,
2092 	.arg2_type	= ARG_ANYTHING,
2093 	.arg3_type	= ARG_ANYTHING,
2094 };
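/* Illustrative sketch (not part of this file): the split of work that the
 * comment in bpf_skb_change_proto() describes, for a v4 -> v6 translation.
 * The helper only makes room between mac and network header and flips
 * skb->protocol; the program then writes the new IPv6 header itself and
 * repairs the L4 checksum. Offsets, the prebuilt header and the flag usage
 * are assumptions of the example.
 *
 *	struct ipv6hdr ip6h = { ... };	// built by the program beforehand
 *
 *	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
 *		return TC_ACT_SHOT;
 *	// ETH_HLEN == assumed network header offset of a plain Ethernet frame
 *	bpf_skb_store_bytes(skb, ETH_HLEN, &ip6h, sizeof(ip6h), 0);
 *	// the pseudo-header part of the TCP/UDP checksum changed, so it is
 *	// fixed up via bpf_l4_csum_replace(..., BPF_F_PSEUDO_HDR | ...).
 */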
2095 
2096 BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
2097 {
2098 	/* We only allow a restricted subset to be changed for now. */
2099 	if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
2100 		     !skb_pkt_type_ok(pkt_type)))
2101 		return -EINVAL;
2102 
2103 	skb->pkt_type = pkt_type;
2104 	return 0;
2105 }
2106 
2107 static const struct bpf_func_proto bpf_skb_change_type_proto = {
2108 	.func		= bpf_skb_change_type,
2109 	.gpl_only	= false,
2110 	.ret_type	= RET_INTEGER,
2111 	.arg1_type	= ARG_PTR_TO_CTX,
2112 	.arg2_type	= ARG_ANYTHING,
2113 };
2114 
2115 static u32 __bpf_skb_min_len(const struct sk_buff *skb)
2116 {
2117 	u32 min_len = skb_network_offset(skb);
2118 
2119 	if (skb_transport_header_was_set(skb))
2120 		min_len = skb_transport_offset(skb);
2121 	if (skb->ip_summed == CHECKSUM_PARTIAL)
2122 		min_len = skb_checksum_start_offset(skb) +
2123 			  skb->csum_offset + sizeof(__sum16);
2124 	return min_len;
2125 }
2126 
2127 static u32 __bpf_skb_max_len(const struct sk_buff *skb)
2128 {
2129 	return skb->dev->mtu + skb->dev->hard_header_len;
2130 }
2131 
2132 static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
2133 {
2134 	unsigned int old_len = skb->len;
2135 	int ret;
2136 
2137 	ret = __skb_grow_rcsum(skb, new_len);
2138 	if (!ret)
2139 		memset(skb->data + old_len, 0, new_len - old_len);
2140 	return ret;
2141 }
2142 
2143 static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
2144 {
2145 	return __skb_trim_rcsum(skb, new_len);
2146 }
2147 
2148 BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
2149 	   u64, flags)
2150 {
2151 	u32 max_len = __bpf_skb_max_len(skb);
2152 	u32 min_len = __bpf_skb_min_len(skb);
2153 	int ret;
2154 
2155 	if (unlikely(flags || new_len > max_len || new_len < min_len))
2156 		return -EINVAL;
2157 	if (skb->encapsulation)
2158 		return -ENOTSUPP;
2159 
2160 	/* The basic idea of this helper is that it's performing the
2161 	 * needed work to either grow or trim an skb, and eBPF program
2162 	 * rewrites the rest via helpers like bpf_skb_store_bytes(),
2163 	 * bpf_lX_csum_replace() and others rather than passing a raw
2164 	 * buffer here. This one is a slow path helper and intended
2165 	 * for replies with control messages.
2166 	 *
2167 	 * Like in bpf_skb_change_proto(), we want to keep this rather
2168 	 * minimal and without protocol specifics so that we are able
2169 	 * to separate concerns: e.g. bpf_skb_store_bytes() should be the
2170 	 * only one responsible for writing buffers.
2171 	 *
2172 	 * This is really expected to be a slow path operation used for
2173 	 * control message replies, so by doing it we implicitly linearize,
2174 	 * unclone and drop offloads from the skb.
2175 	 */
2176 	ret = __bpf_try_make_writable(skb, skb->len);
2177 	if (!ret) {
2178 		if (new_len > skb->len)
2179 			ret = bpf_skb_grow_rcsum(skb, new_len);
2180 		else if (new_len < skb->len)
2181 			ret = bpf_skb_trim_rcsum(skb, new_len);
2182 		if (!ret && skb_is_gso(skb))
2183 			skb_gso_reset(skb);
2184 	}
2185 
2186 	bpf_compute_data_end(skb);
2187 	return ret;
2188 }
2189 
2190 static const struct bpf_func_proto bpf_skb_change_tail_proto = {
2191 	.func		= bpf_skb_change_tail,
2192 	.gpl_only	= false,
2193 	.ret_type	= RET_INTEGER,
2194 	.arg1_type	= ARG_PTR_TO_CTX,
2195 	.arg2_type	= ARG_ANYTHING,
2196 	.arg3_type	= ARG_ANYTHING,
2197 };
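/* Illustrative sketch (not part of this file): using bpf_skb_change_tail() to
 * shrink a packet down to a minimal control-message reply, which is the slow
 * path use case the comment above describes. The new length must stay within
 * the bounds from __bpf_skb_min_len()/__bpf_skb_max_len(). The assumption
 * here is an egress tc hook where skb->len includes the mac header.
 *
 *	__u32 new_len = ETH_HLEN + sizeof(struct iphdr) + sizeof(struct icmphdr);
 *
 *	if (bpf_skb_change_tail(skb, new_len, 0))
 *		return TC_ACT_SHOT;
 *	// the remaining headers are then rewritten via bpf_skb_store_bytes()
 *	// and the checksum helpers.
 */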
2198 
2199 BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
2200 	   u64, flags)
2201 {
2202 	u32 max_len = __bpf_skb_max_len(skb);
2203 	u32 new_len = skb->len + head_room;
2204 	int ret;
2205 
2206 	if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
2207 		     new_len < skb->len))
2208 		return -EINVAL;
2209 
2210 	ret = skb_cow(skb, head_room);
2211 	if (likely(!ret)) {
2212 		/* Idea for this helper is that we currently only
2213 		 * allow expanding the mac header. This means that
2214 		 * skb->protocol, network header, etc, stay as is.
2215 		 * Compared to bpf_skb_change_tail(), we're more
2216 		 * flexible due to not needing to linearize or
2217 		 * reset GSO. Intention for this helper is to be
2218 		 * used by an L3 skb that needs to push a mac header
2219 		 * for redirection into an L2 device.
2220 		 */
2221 		__skb_push(skb, head_room);
2222 		memset(skb->data, 0, head_room);
2223 		skb_reset_mac_header(skb);
2224 	}
2225 
2226 	bpf_compute_data_end(skb);
2227 	return ret;
2228 }
2229 
2230 static const struct bpf_func_proto bpf_skb_change_head_proto = {
2231 	.func		= bpf_skb_change_head,
2232 	.gpl_only	= false,
2233 	.ret_type	= RET_INTEGER,
2234 	.arg1_type	= ARG_PTR_TO_CTX,
2235 	.arg2_type	= ARG_ANYTHING,
2236 	.arg3_type	= ARG_ANYTHING,
2237 };
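/* Illustrative sketch (not part of this file): the intended use of
 * bpf_skb_change_head() from an lwt xmit program, where an L3 skb gets a
 * zeroed mac header pushed before being redirected into an L2 device. The
 * prebuilt Ethernet header and ifindex are assumptions of the example.
 *
 *	struct ethhdr eth = { ... };	// assumed src/dst MACs and h_proto
 *
 *	if (bpf_skb_change_head(skb, ETH_HLEN, 0))
 *		return BPF_DROP;
 *	// fill in the Ethernet header that was just reserved and zeroed
 *	bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0);
 *	bpf_redirect(ifindex, 0);
 *	return BPF_REDIRECT;
 */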
2238 
2239 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
2240 {
2241 	void *data = xdp->data + offset;
2242 
2243 	if (unlikely(data < xdp->data_hard_start ||
2244 		     data > xdp->data_end - ETH_HLEN))
2245 		return -EINVAL;
2246 
2247 	xdp->data = data;
2248 
2249 	return 0;
2250 }
2251 
2252 static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
2253 	.func		= bpf_xdp_adjust_head,
2254 	.gpl_only	= false,
2255 	.ret_type	= RET_INTEGER,
2256 	.arg1_type	= ARG_PTR_TO_CTX,
2257 	.arg2_type	= ARG_ANYTHING,
2258 };
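/* Illustrative sketch (not part of this file): bpf_xdp_adjust_head() with a
 * positive offset pops bytes in front of the frame, a negative offset would
 * reserve new headroom instead. The 8-byte outer header being stripped here
 * is an assumption of the example.
 *
 *	SEC("xdp")
 *	int decap(struct xdp_md *ctx)
 *	{
 *		// move xdp->data forward by 8 bytes; the check above fails
 *		// if less than ETH_HLEN would remain in the frame
 *		if (bpf_xdp_adjust_head(ctx, 8))
 *			return XDP_DROP;
 *		return XDP_PASS;
 *	}
 */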
2259 
2260 bool bpf_helper_changes_pkt_data(void *func)
2261 {
2262 	if (func == bpf_skb_vlan_push ||
2263 	    func == bpf_skb_vlan_pop ||
2264 	    func == bpf_skb_store_bytes ||
2265 	    func == bpf_skb_change_proto ||
2266 	    func == bpf_skb_change_head ||
2267 	    func == bpf_skb_change_tail ||
2268 	    func == bpf_skb_pull_data ||
2269 	    func == bpf_l3_csum_replace ||
2270 	    func == bpf_l4_csum_replace ||
2271 	    func == bpf_xdp_adjust_head)
2272 		return true;
2273 
2274 	return false;
2275 }
2276 
2277 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
2278 				  unsigned long off, unsigned long len)
2279 {
2280 	void *ptr = skb_header_pointer(skb, off, len, dst_buff);
2281 
2282 	if (unlikely(!ptr))
2283 		return len;
2284 	if (ptr != dst_buff)
2285 		memcpy(dst_buff, ptr, len);
2286 
2287 	return 0;
2288 }
2289 
2290 BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
2291 	   u64, flags, void *, meta, u64, meta_size)
2292 {
2293 	u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2294 
2295 	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2296 		return -EINVAL;
2297 	if (unlikely(skb_size > skb->len))
2298 		return -EFAULT;
2299 
2300 	return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
2301 				bpf_skb_copy);
2302 }
2303 
2304 static const struct bpf_func_proto bpf_skb_event_output_proto = {
2305 	.func		= bpf_skb_event_output,
2306 	.gpl_only	= true,
2307 	.ret_type	= RET_INTEGER,
2308 	.arg1_type	= ARG_PTR_TO_CTX,
2309 	.arg2_type	= ARG_CONST_MAP_PTR,
2310 	.arg3_type	= ARG_ANYTHING,
2311 	.arg4_type	= ARG_PTR_TO_MEM,
2312 	.arg5_type	= ARG_CONST_SIZE,
2313 };
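/* Illustrative sketch (not part of this file): emitting an event that carries
 * both program-defined metadata and the first N packet bytes, using the
 * BPF_F_CTXLEN_MASK encoding that bpf_skb_event_output() parses above. The
 * perf event array map and the metadata struct are assumptions of the example.
 *
 *	struct event_meta meta = { .reason = 1 };
 *	__u64 flags = BPF_F_CURRENT_CPU | ((__u64)64 << 32);	// 64 skb bytes
 *
 *	bpf_perf_event_output(skb, &events_map, flags, &meta, sizeof(meta));
 */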
2314 
2315 static unsigned short bpf_tunnel_key_af(u64 flags)
2316 {
2317 	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
2318 }
2319 
2320 BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
2321 	   u32, size, u64, flags)
2322 {
2323 	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2324 	u8 compat[sizeof(struct bpf_tunnel_key)];
2325 	void *to_orig = to;
2326 	int err;
2327 
2328 	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
2329 		err = -EINVAL;
2330 		goto err_clear;
2331 	}
2332 	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
2333 		err = -EPROTO;
2334 		goto err_clear;
2335 	}
2336 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
2337 		err = -EINVAL;
2338 		switch (size) {
2339 		case offsetof(struct bpf_tunnel_key, tunnel_label):
2340 		case offsetof(struct bpf_tunnel_key, tunnel_ext):
2341 			goto set_compat;
2342 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2343 			/* Fixup deprecated structure layouts here, so we have
2344 			 * a common path later on.
2345 			 */
2346 			if (ip_tunnel_info_af(info) != AF_INET)
2347 				goto err_clear;
2348 set_compat:
2349 			to = (struct bpf_tunnel_key *)compat;
2350 			break;
2351 		default:
2352 			goto err_clear;
2353 		}
2354 	}
2355 
2356 	to->tunnel_id = be64_to_cpu(info->key.tun_id);
2357 	to->tunnel_tos = info->key.tos;
2358 	to->tunnel_ttl = info->key.ttl;
2359 
2360 	if (flags & BPF_F_TUNINFO_IPV6) {
2361 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
2362 		       sizeof(to->remote_ipv6));
2363 		to->tunnel_label = be32_to_cpu(info->key.label);
2364 	} else {
2365 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
2366 	}
2367 
2368 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
2369 		memcpy(to_orig, to, size);
2370 
2371 	return 0;
2372 err_clear:
2373 	memset(to_orig, 0, size);
2374 	return err;
2375 }
2376 
2377 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
2378 	.func		= bpf_skb_get_tunnel_key,
2379 	.gpl_only	= false,
2380 	.ret_type	= RET_INTEGER,
2381 	.arg1_type	= ARG_PTR_TO_CTX,
2382 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
2383 	.arg3_type	= ARG_CONST_SIZE,
2384 	.arg4_type	= ARG_ANYTHING,
2385 };
2386 
2387 BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
2388 {
2389 	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
2390 	int err;
2391 
2392 	if (unlikely(!info ||
2393 		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
2394 		err = -ENOENT;
2395 		goto err_clear;
2396 	}
2397 	if (unlikely(size < info->options_len)) {
2398 		err = -ENOMEM;
2399 		goto err_clear;
2400 	}
2401 
2402 	ip_tunnel_info_opts_get(to, info);
2403 	if (size > info->options_len)
2404 		memset(to + info->options_len, 0, size - info->options_len);
2405 
2406 	return info->options_len;
2407 err_clear:
2408 	memset(to, 0, size);
2409 	return err;
2410 }
2411 
2412 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
2413 	.func		= bpf_skb_get_tunnel_opt,
2414 	.gpl_only	= false,
2415 	.ret_type	= RET_INTEGER,
2416 	.arg1_type	= ARG_PTR_TO_CTX,
2417 	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
2418 	.arg3_type	= ARG_CONST_SIZE,
2419 };
2420 
2421 static struct metadata_dst __percpu *md_dst;
2422 
2423 BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
2424 	   const struct bpf_tunnel_key *, from, u32, size, u64, flags)
2425 {
2426 	struct metadata_dst *md = this_cpu_ptr(md_dst);
2427 	u8 compat[sizeof(struct bpf_tunnel_key)];
2428 	struct ip_tunnel_info *info;
2429 
2430 	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
2431 			       BPF_F_DONT_FRAGMENT)))
2432 		return -EINVAL;
2433 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
2434 		switch (size) {
2435 		case offsetof(struct bpf_tunnel_key, tunnel_label):
2436 		case offsetof(struct bpf_tunnel_key, tunnel_ext):
2437 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
2438 			/* Fixup deprecated structure layouts here, so we have
2439 			 * a common path later on.
2440 			 */
2441 			memcpy(compat, from, size);
2442 			memset(compat + size, 0, sizeof(compat) - size);
2443 			from = (const struct bpf_tunnel_key *) compat;
2444 			break;
2445 		default:
2446 			return -EINVAL;
2447 		}
2448 	}
2449 	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
2450 		     from->tunnel_ext))
2451 		return -EINVAL;
2452 
2453 	skb_dst_drop(skb);
2454 	dst_hold((struct dst_entry *) md);
2455 	skb_dst_set(skb, (struct dst_entry *) md);
2456 
2457 	info = &md->u.tun_info;
2458 	info->mode = IP_TUNNEL_INFO_TX;
2459 
2460 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
2461 	if (flags & BPF_F_DONT_FRAGMENT)
2462 		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
2463 
2464 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
2465 	info->key.tos = from->tunnel_tos;
2466 	info->key.ttl = from->tunnel_ttl;
2467 
2468 	if (flags & BPF_F_TUNINFO_IPV6) {
2469 		info->mode |= IP_TUNNEL_INFO_IPV6;
2470 		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
2471 		       sizeof(from->remote_ipv6));
2472 		info->key.label = cpu_to_be32(from->tunnel_label) &
2473 				  IPV6_FLOWLABEL_MASK;
2474 	} else {
2475 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
2476 		if (flags & BPF_F_ZERO_CSUM_TX)
2477 			info->key.tun_flags &= ~TUNNEL_CSUM;
2478 	}
2479 
2480 	return 0;
2481 }
2482 
2483 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
2484 	.func		= bpf_skb_set_tunnel_key,
2485 	.gpl_only	= false,
2486 	.ret_type	= RET_INTEGER,
2487 	.arg1_type	= ARG_PTR_TO_CTX,
2488 	.arg2_type	= ARG_PTR_TO_MEM,
2489 	.arg3_type	= ARG_CONST_SIZE,
2490 	.arg4_type	= ARG_ANYTHING,
2491 };
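/* Illustrative sketch (not part of this file): setting tunnel metadata on the
 * egress path for a collect_md tunnel device (e.g. VXLAN in external mode),
 * which ends up in the per-CPU metadata dst installed above. Addresses, VNI
 * and ifindex are assumptions of the example.
 *
 *	struct bpf_tunnel_key key = {};
 *
 *	key.tunnel_id	= 42;			// VNI
 *	key.remote_ipv4	= 0xac100164;		// 172.16.1.100, host byte order
 *	key.tunnel_ttl	= 64;
 *	if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX))
 *		return TC_ACT_SHOT;
 *	return bpf_redirect(vxlan_ifindex, 0);
 */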
2492 
2493 BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
2494 	   const u8 *, from, u32, size)
2495 {
2496 	struct ip_tunnel_info *info = skb_tunnel_info(skb);
2497 	const struct metadata_dst *md = this_cpu_ptr(md_dst);
2498 
2499 	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
2500 		return -EINVAL;
2501 	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
2502 		return -ENOMEM;
2503 
2504 	ip_tunnel_info_opts_set(info, from, size);
2505 
2506 	return 0;
2507 }
2508 
2509 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
2510 	.func		= bpf_skb_set_tunnel_opt,
2511 	.gpl_only	= false,
2512 	.ret_type	= RET_INTEGER,
2513 	.arg1_type	= ARG_PTR_TO_CTX,
2514 	.arg2_type	= ARG_PTR_TO_MEM,
2515 	.arg3_type	= ARG_CONST_SIZE,
2516 };
2517 
2518 static const struct bpf_func_proto *
2519 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
2520 {
2521 	if (!md_dst) {
2522 		/* A race is not possible, since this is called from the
2523 		 * verifier, which holds the verifier mutex.
2524 		 */
2525 		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
2526 						   GFP_KERNEL);
2527 		if (!md_dst)
2528 			return NULL;
2529 	}
2530 
2531 	switch (which) {
2532 	case BPF_FUNC_skb_set_tunnel_key:
2533 		return &bpf_skb_set_tunnel_key_proto;
2534 	case BPF_FUNC_skb_set_tunnel_opt:
2535 		return &bpf_skb_set_tunnel_opt_proto;
2536 	default:
2537 		return NULL;
2538 	}
2539 }
2540 
2541 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
2542 	   u32, idx)
2543 {
2544 	struct bpf_array *array = container_of(map, struct bpf_array, map);
2545 	struct cgroup *cgrp;
2546 	struct sock *sk;
2547 
2548 	sk = skb_to_full_sk(skb);
2549 	if (!sk || !sk_fullsock(sk))
2550 		return -ENOENT;
2551 	if (unlikely(idx >= array->map.max_entries))
2552 		return -E2BIG;
2553 
2554 	cgrp = READ_ONCE(array->ptrs[idx]);
2555 	if (unlikely(!cgrp))
2556 		return -EAGAIN;
2557 
2558 	return sk_under_cgroup_hierarchy(sk, cgrp);
2559 }
2560 
2561 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
2562 	.func		= bpf_skb_under_cgroup,
2563 	.gpl_only	= false,
2564 	.ret_type	= RET_INTEGER,
2565 	.arg1_type	= ARG_PTR_TO_CTX,
2566 	.arg2_type	= ARG_CONST_MAP_PTR,
2567 	.arg3_type	= ARG_ANYTHING,
2568 };
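/* Illustrative sketch (not part of this file): gating traffic on cgroup
 * membership with bpf_skb_under_cgroup(). The BPF_MAP_TYPE_CGROUP_ARRAY map
 * is filled from user space with a cgroup fd at index 0; map name and policy
 * are assumptions of the example.
 *
 *	int ret = bpf_skb_under_cgroup(skb, &cgroup_map, 0);
 *
 *	if (ret < 0)		// no full socket, empty slot or bad index
 *		return TC_ACT_OK;
 *	return ret == 1 ? TC_ACT_OK : TC_ACT_SHOT;
 */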
2569 
2570 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
2571 				  unsigned long off, unsigned long len)
2572 {
2573 	memcpy(dst_buff, src_buff + off, len);
2574 	return 0;
2575 }
2576 
2577 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
2578 	   u64, flags, void *, meta, u64, meta_size)
2579 {
2580 	u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
2581 
2582 	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
2583 		return -EINVAL;
2584 	if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
2585 		return -EFAULT;
2586 
2587 	return bpf_event_output(map, flags, meta, meta_size, xdp->data,
2588 				xdp_size, bpf_xdp_copy);
2589 }
2590 
2591 static const struct bpf_func_proto bpf_xdp_event_output_proto = {
2592 	.func		= bpf_xdp_event_output,
2593 	.gpl_only	= true,
2594 	.ret_type	= RET_INTEGER,
2595 	.arg1_type	= ARG_PTR_TO_CTX,
2596 	.arg2_type	= ARG_CONST_MAP_PTR,
2597 	.arg3_type	= ARG_ANYTHING,
2598 	.arg4_type	= ARG_PTR_TO_MEM,
2599 	.arg5_type	= ARG_CONST_SIZE,
2600 };
2601 
2602 static const struct bpf_func_proto *
2603 bpf_base_func_proto(enum bpf_func_id func_id)
2604 {
2605 	switch (func_id) {
2606 	case BPF_FUNC_map_lookup_elem:
2607 		return &bpf_map_lookup_elem_proto;
2608 	case BPF_FUNC_map_update_elem:
2609 		return &bpf_map_update_elem_proto;
2610 	case BPF_FUNC_map_delete_elem:
2611 		return &bpf_map_delete_elem_proto;
2612 	case BPF_FUNC_get_prandom_u32:
2613 		return &bpf_get_prandom_u32_proto;
2614 	case BPF_FUNC_get_smp_processor_id:
2615 		return &bpf_get_raw_smp_processor_id_proto;
2616 	case BPF_FUNC_get_numa_node_id:
2617 		return &bpf_get_numa_node_id_proto;
2618 	case BPF_FUNC_tail_call:
2619 		return &bpf_tail_call_proto;
2620 	case BPF_FUNC_ktime_get_ns:
2621 		return &bpf_ktime_get_ns_proto;
2622 	case BPF_FUNC_trace_printk:
2623 		if (capable(CAP_SYS_ADMIN))
2624 			return bpf_get_trace_printk_proto();
2625 	default:
2626 		return NULL;
2627 	}
2628 }
2629 
2630 static const struct bpf_func_proto *
2631 sk_filter_func_proto(enum bpf_func_id func_id)
2632 {
2633 	switch (func_id) {
2634 	case BPF_FUNC_skb_load_bytes:
2635 		return &bpf_skb_load_bytes_proto;
2636 	default:
2637 		return bpf_base_func_proto(func_id);
2638 	}
2639 }
2640 
2641 static const struct bpf_func_proto *
2642 tc_cls_act_func_proto(enum bpf_func_id func_id)
2643 {
2644 	switch (func_id) {
2645 	case BPF_FUNC_skb_store_bytes:
2646 		return &bpf_skb_store_bytes_proto;
2647 	case BPF_FUNC_skb_load_bytes:
2648 		return &bpf_skb_load_bytes_proto;
2649 	case BPF_FUNC_skb_pull_data:
2650 		return &bpf_skb_pull_data_proto;
2651 	case BPF_FUNC_csum_diff:
2652 		return &bpf_csum_diff_proto;
2653 	case BPF_FUNC_csum_update:
2654 		return &bpf_csum_update_proto;
2655 	case BPF_FUNC_l3_csum_replace:
2656 		return &bpf_l3_csum_replace_proto;
2657 	case BPF_FUNC_l4_csum_replace:
2658 		return &bpf_l4_csum_replace_proto;
2659 	case BPF_FUNC_clone_redirect:
2660 		return &bpf_clone_redirect_proto;
2661 	case BPF_FUNC_get_cgroup_classid:
2662 		return &bpf_get_cgroup_classid_proto;
2663 	case BPF_FUNC_skb_vlan_push:
2664 		return &bpf_skb_vlan_push_proto;
2665 	case BPF_FUNC_skb_vlan_pop:
2666 		return &bpf_skb_vlan_pop_proto;
2667 	case BPF_FUNC_skb_change_proto:
2668 		return &bpf_skb_change_proto_proto;
2669 	case BPF_FUNC_skb_change_type:
2670 		return &bpf_skb_change_type_proto;
2671 	case BPF_FUNC_skb_change_tail:
2672 		return &bpf_skb_change_tail_proto;
2673 	case BPF_FUNC_skb_get_tunnel_key:
2674 		return &bpf_skb_get_tunnel_key_proto;
2675 	case BPF_FUNC_skb_set_tunnel_key:
2676 		return bpf_get_skb_set_tunnel_proto(func_id);
2677 	case BPF_FUNC_skb_get_tunnel_opt:
2678 		return &bpf_skb_get_tunnel_opt_proto;
2679 	case BPF_FUNC_skb_set_tunnel_opt:
2680 		return bpf_get_skb_set_tunnel_proto(func_id);
2681 	case BPF_FUNC_redirect:
2682 		return &bpf_redirect_proto;
2683 	case BPF_FUNC_get_route_realm:
2684 		return &bpf_get_route_realm_proto;
2685 	case BPF_FUNC_get_hash_recalc:
2686 		return &bpf_get_hash_recalc_proto;
2687 	case BPF_FUNC_set_hash_invalid:
2688 		return &bpf_set_hash_invalid_proto;
2689 	case BPF_FUNC_perf_event_output:
2690 		return &bpf_skb_event_output_proto;
2691 	case BPF_FUNC_get_smp_processor_id:
2692 		return &bpf_get_smp_processor_id_proto;
2693 	case BPF_FUNC_skb_under_cgroup:
2694 		return &bpf_skb_under_cgroup_proto;
2695 	default:
2696 		return bpf_base_func_proto(func_id);
2697 	}
2698 }
2699 
2700 static const struct bpf_func_proto *
2701 xdp_func_proto(enum bpf_func_id func_id)
2702 {
2703 	switch (func_id) {
2704 	case BPF_FUNC_perf_event_output:
2705 		return &bpf_xdp_event_output_proto;
2706 	case BPF_FUNC_get_smp_processor_id:
2707 		return &bpf_get_smp_processor_id_proto;
2708 	case BPF_FUNC_xdp_adjust_head:
2709 		return &bpf_xdp_adjust_head_proto;
2710 	default:
2711 		return bpf_base_func_proto(func_id);
2712 	}
2713 }
2714 
2715 static const struct bpf_func_proto *
2716 cg_skb_func_proto(enum bpf_func_id func_id)
2717 {
2718 	switch (func_id) {
2719 	case BPF_FUNC_skb_load_bytes:
2720 		return &bpf_skb_load_bytes_proto;
2721 	default:
2722 		return bpf_base_func_proto(func_id);
2723 	}
2724 }
2725 
2726 static const struct bpf_func_proto *
2727 lwt_inout_func_proto(enum bpf_func_id func_id)
2728 {
2729 	switch (func_id) {
2730 	case BPF_FUNC_skb_load_bytes:
2731 		return &bpf_skb_load_bytes_proto;
2732 	case BPF_FUNC_skb_pull_data:
2733 		return &bpf_skb_pull_data_proto;
2734 	case BPF_FUNC_csum_diff:
2735 		return &bpf_csum_diff_proto;
2736 	case BPF_FUNC_get_cgroup_classid:
2737 		return &bpf_get_cgroup_classid_proto;
2738 	case BPF_FUNC_get_route_realm:
2739 		return &bpf_get_route_realm_proto;
2740 	case BPF_FUNC_get_hash_recalc:
2741 		return &bpf_get_hash_recalc_proto;
2742 	case BPF_FUNC_perf_event_output:
2743 		return &bpf_skb_event_output_proto;
2744 	case BPF_FUNC_get_smp_processor_id:
2745 		return &bpf_get_smp_processor_id_proto;
2746 	case BPF_FUNC_skb_under_cgroup:
2747 		return &bpf_skb_under_cgroup_proto;
2748 	default:
2749 		return bpf_base_func_proto(func_id);
2750 	}
2751 }
2752 
2753 static const struct bpf_func_proto *
2754 lwt_xmit_func_proto(enum bpf_func_id func_id)
2755 {
2756 	switch (func_id) {
2757 	case BPF_FUNC_skb_get_tunnel_key:
2758 		return &bpf_skb_get_tunnel_key_proto;
2759 	case BPF_FUNC_skb_set_tunnel_key:
2760 		return bpf_get_skb_set_tunnel_proto(func_id);
2761 	case BPF_FUNC_skb_get_tunnel_opt:
2762 		return &bpf_skb_get_tunnel_opt_proto;
2763 	case BPF_FUNC_skb_set_tunnel_opt:
2764 		return bpf_get_skb_set_tunnel_proto(func_id);
2765 	case BPF_FUNC_redirect:
2766 		return &bpf_redirect_proto;
2767 	case BPF_FUNC_clone_redirect:
2768 		return &bpf_clone_redirect_proto;
2769 	case BPF_FUNC_skb_change_tail:
2770 		return &bpf_skb_change_tail_proto;
2771 	case BPF_FUNC_skb_change_head:
2772 		return &bpf_skb_change_head_proto;
2773 	case BPF_FUNC_skb_store_bytes:
2774 		return &bpf_skb_store_bytes_proto;
2775 	case BPF_FUNC_csum_update:
2776 		return &bpf_csum_update_proto;
2777 	case BPF_FUNC_l3_csum_replace:
2778 		return &bpf_l3_csum_replace_proto;
2779 	case BPF_FUNC_l4_csum_replace:
2780 		return &bpf_l4_csum_replace_proto;
2781 	case BPF_FUNC_set_hash_invalid:
2782 		return &bpf_set_hash_invalid_proto;
2783 	default:
2784 		return lwt_inout_func_proto(func_id);
2785 	}
2786 }
2787 
2788 static bool __is_valid_access(int off, int size)
2789 {
2790 	if (off < 0 || off >= sizeof(struct __sk_buff))
2791 		return false;
2792 
2793 	/* The verifier guarantees that size > 0. */
2794 	if (off % size != 0)
2795 		return false;
2796 
2797 	switch (off) {
2798 	case offsetof(struct __sk_buff, cb[0]) ...
2799 	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2800 		if (off + size >
2801 		    offsetof(struct __sk_buff, cb[4]) + sizeof(__u32))
2802 			return false;
2803 		break;
2804 	default:
2805 		if (size != sizeof(__u32))
2806 			return false;
2807 	}
2808 
2809 	return true;
2810 }
2811 
2812 static bool sk_filter_is_valid_access(int off, int size,
2813 				      enum bpf_access_type type,
2814 				      enum bpf_reg_type *reg_type)
2815 {
2816 	switch (off) {
2817 	case offsetof(struct __sk_buff, tc_classid):
2818 	case offsetof(struct __sk_buff, data):
2819 	case offsetof(struct __sk_buff, data_end):
2820 		return false;
2821 	}
2822 
2823 	if (type == BPF_WRITE) {
2824 		switch (off) {
2825 		case offsetof(struct __sk_buff, cb[0]) ...
2826 		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2827 			break;
2828 		default:
2829 			return false;
2830 		}
2831 	}
2832 
2833 	return __is_valid_access(off, size);
2834 }
2835 
2836 static bool lwt_is_valid_access(int off, int size,
2837 				enum bpf_access_type type,
2838 				enum bpf_reg_type *reg_type)
2839 {
2840 	switch (off) {
2841 	case offsetof(struct __sk_buff, tc_classid):
2842 		return false;
2843 	}
2844 
2845 	if (type == BPF_WRITE) {
2846 		switch (off) {
2847 		case offsetof(struct __sk_buff, mark):
2848 		case offsetof(struct __sk_buff, priority):
2849 		case offsetof(struct __sk_buff, cb[0]) ...
2850 		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2851 			break;
2852 		default:
2853 			return false;
2854 		}
2855 	}
2856 
2857 	switch (off) {
2858 	case offsetof(struct __sk_buff, data):
2859 		*reg_type = PTR_TO_PACKET;
2860 		break;
2861 	case offsetof(struct __sk_buff, data_end):
2862 		*reg_type = PTR_TO_PACKET_END;
2863 		break;
2864 	}
2865 
2866 	return __is_valid_access(off, size);
2867 }
2868 
2869 static bool sock_filter_is_valid_access(int off, int size,
2870 					enum bpf_access_type type,
2871 					enum bpf_reg_type *reg_type)
2872 {
2873 	if (type == BPF_WRITE) {
2874 		switch (off) {
2875 		case offsetof(struct bpf_sock, bound_dev_if):
2876 			break;
2877 		default:
2878 			return false;
2879 		}
2880 	}
2881 
2882 	if (off < 0 || off + size > sizeof(struct bpf_sock))
2883 		return false;
2884 	/* The verifier guarantees that size > 0. */
2885 	if (off % size != 0)
2886 		return false;
2887 	if (size != sizeof(__u32))
2888 		return false;
2889 
2890 	return true;
2891 }
2892 
2893 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
2894 			       const struct bpf_prog *prog)
2895 {
2896 	struct bpf_insn *insn = insn_buf;
2897 
2898 	if (!direct_write)
2899 		return 0;
2900 
2901 	/* if (!skb->cloned)
2902 	 *       goto start;
2903 	 *
2904 	 * (Fast path; otherwise we approximate that we might be
2905 	 *  a clone and do the rest in the helper.)
2906 	 */
2907 	*insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
2908 	*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
2909 	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
2910 
2911 	/* ret = bpf_skb_pull_data(skb, 0); */
2912 	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
2913 	*insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
2914 	*insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
2915 			       BPF_FUNC_skb_pull_data);
2916 	/* if (!ret)
2917 	 *      goto restore;
2918 	 * return TC_ACT_SHOT;
2919 	 */
2920 	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
2921 	*insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT);
2922 	*insn++ = BPF_EXIT_INSN();
2923 
2924 	/* restore: */
2925 	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
2926 	/* start: */
2927 	*insn++ = prog->insnsi[0];
2928 
2929 	return insn - insn_buf;
2930 }
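/* The prologue generated above is roughly equivalent to running the following
 * C sketch before the program's first instruction, so that direct packet
 * writes never touch data shared with a clone:
 *
 *	if (skb->cloned) {
 *		if (bpf_skb_pull_data(skb, 0))
 *			return TC_ACT_SHOT;
 *	}
 *	// fall through into the original program
 */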
2931 
2932 static bool tc_cls_act_is_valid_access(int off, int size,
2933 				       enum bpf_access_type type,
2934 				       enum bpf_reg_type *reg_type)
2935 {
2936 	if (type == BPF_WRITE) {
2937 		switch (off) {
2938 		case offsetof(struct __sk_buff, mark):
2939 		case offsetof(struct __sk_buff, tc_index):
2940 		case offsetof(struct __sk_buff, priority):
2941 		case offsetof(struct __sk_buff, cb[0]) ...
2942 		     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
2943 		case offsetof(struct __sk_buff, tc_classid):
2944 			break;
2945 		default:
2946 			return false;
2947 		}
2948 	}
2949 
2950 	switch (off) {
2951 	case offsetof(struct __sk_buff, data):
2952 		*reg_type = PTR_TO_PACKET;
2953 		break;
2954 	case offsetof(struct __sk_buff, data_end):
2955 		*reg_type = PTR_TO_PACKET_END;
2956 		break;
2957 	}
2958 
2959 	return __is_valid_access(off, size);
2960 }
2961 
2962 static bool __is_valid_xdp_access(int off, int size)
2963 {
2964 	if (off < 0 || off >= sizeof(struct xdp_md))
2965 		return false;
2966 	if (off % size != 0)
2967 		return false;
2968 	if (size != sizeof(__u32))
2969 		return false;
2970 
2971 	return true;
2972 }
2973 
2974 static bool xdp_is_valid_access(int off, int size,
2975 				enum bpf_access_type type,
2976 				enum bpf_reg_type *reg_type)
2977 {
2978 	if (type == BPF_WRITE)
2979 		return false;
2980 
2981 	switch (off) {
2982 	case offsetof(struct xdp_md, data):
2983 		*reg_type = PTR_TO_PACKET;
2984 		break;
2985 	case offsetof(struct xdp_md, data_end):
2986 		*reg_type = PTR_TO_PACKET_END;
2987 		break;
2988 	}
2989 
2990 	return __is_valid_xdp_access(off, size);
2991 }
2992 
2993 void bpf_warn_invalid_xdp_action(u32 act)
2994 {
2995 	WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act);
2996 }
2997 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
2998 
2999 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
3000 				  const struct bpf_insn *si,
3001 				  struct bpf_insn *insn_buf,
3002 				  struct bpf_prog *prog)
3003 {
3004 	struct bpf_insn *insn = insn_buf;
3005 	int off;
3006 
3007 	switch (si->off) {
3008 	case offsetof(struct __sk_buff, len):
3009 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
3010 
3011 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3012 				      offsetof(struct sk_buff, len));
3013 		break;
3014 
3015 	case offsetof(struct __sk_buff, protocol):
3016 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
3017 
3018 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3019 				      offsetof(struct sk_buff, protocol));
3020 		break;
3021 
3022 	case offsetof(struct __sk_buff, vlan_proto):
3023 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
3024 
3025 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3026 				      offsetof(struct sk_buff, vlan_proto));
3027 		break;
3028 
3029 	case offsetof(struct __sk_buff, priority):
3030 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);
3031 
3032 		if (type == BPF_WRITE)
3033 			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
3034 					      offsetof(struct sk_buff, priority));
3035 		else
3036 			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3037 					      offsetof(struct sk_buff, priority));
3038 		break;
3039 
3040 	case offsetof(struct __sk_buff, ingress_ifindex):
3041 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);
3042 
3043 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3044 				      offsetof(struct sk_buff, skb_iif));
3045 		break;
3046 
3047 	case offsetof(struct __sk_buff, ifindex):
3048 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
3049 
3050 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
3051 				      si->dst_reg, si->src_reg,
3052 				      offsetof(struct sk_buff, dev));
3053 		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
3054 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
3055 				      offsetof(struct net_device, ifindex));
3056 		break;
3057 
3058 	case offsetof(struct __sk_buff, hash):
3059 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
3060 
3061 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3062 				      offsetof(struct sk_buff, hash));
3063 		break;
3064 
3065 	case offsetof(struct __sk_buff, mark):
3066 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
3067 
3068 		if (type == BPF_WRITE)
3069 			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
3070 					      offsetof(struct sk_buff, mark));
3071 		else
3072 			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3073 					      offsetof(struct sk_buff, mark));
3074 		break;
3075 
3076 	case offsetof(struct __sk_buff, pkt_type):
3077 		return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg,
3078 					  si->src_reg, insn);
3079 
3080 	case offsetof(struct __sk_buff, queue_mapping):
3081 		return convert_skb_access(SKF_AD_QUEUE, si->dst_reg,
3082 					  si->src_reg, insn);
3083 
3084 	case offsetof(struct __sk_buff, vlan_present):
3085 		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
3086 					  si->dst_reg, si->src_reg, insn);
3087 
3088 	case offsetof(struct __sk_buff, vlan_tci):
3089 		return convert_skb_access(SKF_AD_VLAN_TAG,
3090 					  si->dst_reg, si->src_reg, insn);
3091 
3092 	case offsetof(struct __sk_buff, cb[0]) ...
3093 	     offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1:
3094 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
3095 		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
3096 			      offsetof(struct qdisc_skb_cb, data)) %
3097 			     sizeof(__u64));
3098 
3099 		prog->cb_access = 1;
3100 		off  = si->off;
3101 		off -= offsetof(struct __sk_buff, cb[0]);
3102 		off += offsetof(struct sk_buff, cb);
3103 		off += offsetof(struct qdisc_skb_cb, data);
3104 		if (type == BPF_WRITE)
3105 			*insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
3106 					      si->src_reg, off);
3107 		else
3108 			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
3109 					      si->src_reg, off);
3110 		break;
3111 
3112 	case offsetof(struct __sk_buff, tc_classid):
3113 		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
3114 
3115 		off  = si->off;
3116 		off -= offsetof(struct __sk_buff, tc_classid);
3117 		off += offsetof(struct sk_buff, cb);
3118 		off += offsetof(struct qdisc_skb_cb, tc_classid);
3119 		if (type == BPF_WRITE)
3120 			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
3121 					      si->src_reg, off);
3122 		else
3123 			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
3124 					      si->src_reg, off);
3125 		break;
3126 
3127 	case offsetof(struct __sk_buff, data):
3128 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
3129 				      si->dst_reg, si->src_reg,
3130 				      offsetof(struct sk_buff, data));
3131 		break;
3132 
3133 	case offsetof(struct __sk_buff, data_end):
3134 		off  = si->off;
3135 		off -= offsetof(struct __sk_buff, data_end);
3136 		off += offsetof(struct sk_buff, cb);
3137 		off += offsetof(struct bpf_skb_data_end, data_end);
3138 		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
3139 				      si->src_reg, off);
3140 		break;
3141 
3142 	case offsetof(struct __sk_buff, tc_index):
3143 #ifdef CONFIG_NET_SCHED
3144 		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);
3145 
3146 		if (type == BPF_WRITE)
3147 			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
3148 					      offsetof(struct sk_buff, tc_index));
3149 		else
3150 			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3151 					      offsetof(struct sk_buff, tc_index));
3152 #else
3153 		if (type == BPF_WRITE)
3154 			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
3155 		else
3156 			*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
3157 #endif
3158 		break;
3159 	}
3160 
3161 	return insn - insn_buf;
3162 }
3163 
3164 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
3165 					  const struct bpf_insn *si,
3166 					  struct bpf_insn *insn_buf,
3167 					  struct bpf_prog *prog)
3168 {
3169 	struct bpf_insn *insn = insn_buf;
3170 
3171 	switch (si->off) {
3172 	case offsetof(struct bpf_sock, bound_dev_if):
3173 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
3174 
3175 		if (type == BPF_WRITE)
3176 			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
3177 					offsetof(struct sock, sk_bound_dev_if));
3178 		else
3179 			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3180 				      offsetof(struct sock, sk_bound_dev_if));
3181 		break;
3182 
3183 	case offsetof(struct bpf_sock, family):
3184 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
3185 
3186 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
3187 				      offsetof(struct sock, sk_family));
3188 		break;
3189 
3190 	case offsetof(struct bpf_sock, type):
3191 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3192 				      offsetof(struct sock, __sk_flags_offset));
3193 		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
3194 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
3195 		break;
3196 
3197 	case offsetof(struct bpf_sock, protocol):
3198 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
3199 				      offsetof(struct sock, __sk_flags_offset));
3200 		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
3201 		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
3202 		break;
3203 	}
3204 
3205 	return insn - insn_buf;
3206 }
3207 
3208 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
3209 					 const struct bpf_insn *si,
3210 					 struct bpf_insn *insn_buf,
3211 					 struct bpf_prog *prog)
3212 {
3213 	struct bpf_insn *insn = insn_buf;
3214 
3215 	switch (si->off) {
3216 	case offsetof(struct __sk_buff, ifindex):
3217 		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
3218 
3219 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
3220 				      si->dst_reg, si->src_reg,
3221 				      offsetof(struct sk_buff, dev));
3222 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
3223 				      offsetof(struct net_device, ifindex));
3224 		break;
3225 	default:
3226 		return bpf_convert_ctx_access(type, si, insn_buf, prog);
3227 	}
3228 
3229 	return insn - insn_buf;
3230 }
3231 
3232 static u32 xdp_convert_ctx_access(enum bpf_access_type type,
3233 				  const struct bpf_insn *si,
3234 				  struct bpf_insn *insn_buf,
3235 				  struct bpf_prog *prog)
3236 {
3237 	struct bpf_insn *insn = insn_buf;
3238 
3239 	switch (si->off) {
3240 	case offsetof(struct xdp_md, data):
3241 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
3242 				      si->dst_reg, si->src_reg,
3243 				      offsetof(struct xdp_buff, data));
3244 		break;
3245 	case offsetof(struct xdp_md, data_end):
3246 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
3247 				      si->dst_reg, si->src_reg,
3248 				      offsetof(struct xdp_buff, data_end));
3249 		break;
3250 	}
3251 
3252 	return insn - insn_buf;
3253 }
3254 
3255 static const struct bpf_verifier_ops sk_filter_ops = {
3256 	.get_func_proto		= sk_filter_func_proto,
3257 	.is_valid_access	= sk_filter_is_valid_access,
3258 	.convert_ctx_access	= bpf_convert_ctx_access,
3259 };
3260 
3261 static const struct bpf_verifier_ops tc_cls_act_ops = {
3262 	.get_func_proto		= tc_cls_act_func_proto,
3263 	.is_valid_access	= tc_cls_act_is_valid_access,
3264 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
3265 	.gen_prologue		= tc_cls_act_prologue,
3266 };
3267 
3268 static const struct bpf_verifier_ops xdp_ops = {
3269 	.get_func_proto		= xdp_func_proto,
3270 	.is_valid_access	= xdp_is_valid_access,
3271 	.convert_ctx_access	= xdp_convert_ctx_access,
3272 };
3273 
3274 static const struct bpf_verifier_ops cg_skb_ops = {
3275 	.get_func_proto		= cg_skb_func_proto,
3276 	.is_valid_access	= sk_filter_is_valid_access,
3277 	.convert_ctx_access	= bpf_convert_ctx_access,
3278 };
3279 
3280 static const struct bpf_verifier_ops lwt_inout_ops = {
3281 	.get_func_proto		= lwt_inout_func_proto,
3282 	.is_valid_access	= lwt_is_valid_access,
3283 	.convert_ctx_access	= bpf_convert_ctx_access,
3284 };
3285 
3286 static const struct bpf_verifier_ops lwt_xmit_ops = {
3287 	.get_func_proto		= lwt_xmit_func_proto,
3288 	.is_valid_access	= lwt_is_valid_access,
3289 	.convert_ctx_access	= bpf_convert_ctx_access,
3290 	.gen_prologue		= tc_cls_act_prologue,
3291 };
3292 
3293 static const struct bpf_verifier_ops cg_sock_ops = {
3294 	.get_func_proto		= bpf_base_func_proto,
3295 	.is_valid_access	= sock_filter_is_valid_access,
3296 	.convert_ctx_access	= sock_filter_convert_ctx_access,
3297 };
3298 
3299 static struct bpf_prog_type_list sk_filter_type __ro_after_init = {
3300 	.ops	= &sk_filter_ops,
3301 	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
3302 };
3303 
3304 static struct bpf_prog_type_list sched_cls_type __ro_after_init = {
3305 	.ops	= &tc_cls_act_ops,
3306 	.type	= BPF_PROG_TYPE_SCHED_CLS,
3307 };
3308 
3309 static struct bpf_prog_type_list sched_act_type __ro_after_init = {
3310 	.ops	= &tc_cls_act_ops,
3311 	.type	= BPF_PROG_TYPE_SCHED_ACT,
3312 };
3313 
3314 static struct bpf_prog_type_list xdp_type __ro_after_init = {
3315 	.ops	= &xdp_ops,
3316 	.type	= BPF_PROG_TYPE_XDP,
3317 };
3318 
3319 static struct bpf_prog_type_list cg_skb_type __ro_after_init = {
3320 	.ops	= &cg_skb_ops,
3321 	.type	= BPF_PROG_TYPE_CGROUP_SKB,
3322 };
3323 
3324 static struct bpf_prog_type_list lwt_in_type __ro_after_init = {
3325 	.ops	= &lwt_inout_ops,
3326 	.type	= BPF_PROG_TYPE_LWT_IN,
3327 };
3328 
3329 static struct bpf_prog_type_list lwt_out_type __ro_after_init = {
3330 	.ops	= &lwt_inout_ops,
3331 	.type	= BPF_PROG_TYPE_LWT_OUT,
3332 };
3333 
3334 static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = {
3335 	.ops	= &lwt_xmit_ops,
3336 	.type	= BPF_PROG_TYPE_LWT_XMIT,
3337 };
3338 
3339 static struct bpf_prog_type_list cg_sock_type __ro_after_init = {
3340 	.ops	= &cg_sock_ops,
3341 	.type	= BPF_PROG_TYPE_CGROUP_SOCK
3342 };
3343 
3344 static int __init register_sk_filter_ops(void)
3345 {
3346 	bpf_register_prog_type(&sk_filter_type);
3347 	bpf_register_prog_type(&sched_cls_type);
3348 	bpf_register_prog_type(&sched_act_type);
3349 	bpf_register_prog_type(&xdp_type);
3350 	bpf_register_prog_type(&cg_skb_type);
3351 	bpf_register_prog_type(&cg_sock_type);
3352 	bpf_register_prog_type(&lwt_in_type);
3353 	bpf_register_prog_type(&lwt_out_type);
3354 	bpf_register_prog_type(&lwt_xmit_type);
3355 
3356 	return 0;
3357 }
3358 late_initcall(register_sk_filter_ops);
3359 
3360 int sk_detach_filter(struct sock *sk)
3361 {
3362 	int ret = -ENOENT;
3363 	struct sk_filter *filter;
3364 
3365 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
3366 		return -EPERM;
3367 
3368 	filter = rcu_dereference_protected(sk->sk_filter,
3369 					   lockdep_sock_is_held(sk));
3370 	if (filter) {
3371 		RCU_INIT_POINTER(sk->sk_filter, NULL);
3372 		sk_filter_uncharge(sk, filter);
3373 		ret = 0;
3374 	}
3375 
3376 	return ret;
3377 }
3378 EXPORT_SYMBOL_GPL(sk_detach_filter);
3379 
3380 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
3381 		  unsigned int len)
3382 {
3383 	struct sock_fprog_kern *fprog;
3384 	struct sk_filter *filter;
3385 	int ret = 0;
3386 
3387 	lock_sock(sk);
3388 	filter = rcu_dereference_protected(sk->sk_filter,
3389 					   lockdep_sock_is_held(sk));
3390 	if (!filter)
3391 		goto out;
3392 
3393 	/* We're copying the filter that was originally attached,
3394 	 * so no conversion/decoding is needed anymore. eBPF programs
3395 	 * that have no original program cannot be dumped through this.
3396 	 */
3397 	ret = -EACCES;
3398 	fprog = filter->prog->orig_prog;
3399 	if (!fprog)
3400 		goto out;
3401 
3402 	ret = fprog->len;
3403 	if (!len)
3404 		/* User space only enquires number of filter blocks. */
3405 		goto out;
3406 
3407 	ret = -EINVAL;
3408 	if (len < fprog->len)
3409 		goto out;
3410 
3411 	ret = -EFAULT;
3412 	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
3413 		goto out;
3414 
3415 	/* Instead of bytes, the API requires us to return the number
3416 	 * of filter blocks.
3417 	 */
3418 	ret = fprog->len;
3419 out:
3420 	release_sock(sk);
3421 	return ret;
3422 }
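/* Illustrative sketch (not part of this file): the two-step user space dump
 * that sk_get_filter() serves via getsockopt(SO_GET_FILTER). The first call
 * with an optlen of 0 reports the number of classic BPF blocks, the second
 * call copies them out; note that optlen is given in filter blocks, not
 * bytes. Error handling is omitted in this sketch.
 *
 *	socklen_t cnt = 0;
 *	struct sock_filter *insns;
 *
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &cnt);
 *	insns = calloc(cnt, sizeof(*insns));
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, insns, &cnt);
 */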
3423