1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Linux Socket Filter - Kernel level socket filtering
4 *
5 * Based on the design of the Berkeley Packet Filter. The new
6 * internal format has been designed by PLUMgrid:
7 *
8 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
9 *
10 * Authors:
11 *
12 * Jay Schulist <jschlst@samba.org>
13 * Alexei Starovoitov <ast@plumgrid.com>
14 * Daniel Borkmann <dborkman@redhat.com>
15 *
16 * Andi Kleen - Fix a few bad bugs and races.
17 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
18 */
19
20 #include <linux/atomic.h>
21 #include <linux/bpf_verifier.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/mm.h>
25 #include <linux/fcntl.h>
26 #include <linux/socket.h>
27 #include <linux/sock_diag.h>
28 #include <linux/in.h>
29 #include <linux/inet.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_packet.h>
32 #include <linux/if_arp.h>
33 #include <linux/gfp.h>
34 #include <net/inet_common.h>
35 #include <net/ip.h>
36 #include <net/protocol.h>
37 #include <net/netlink.h>
38 #include <linux/skbuff.h>
39 #include <linux/skmsg.h>
40 #include <net/sock.h>
41 #include <net/flow_dissector.h>
42 #include <linux/errno.h>
43 #include <linux/timer.h>
44 #include <linux/uaccess.h>
45 #include <asm/unaligned.h>
46 #include <linux/filter.h>
47 #include <linux/ratelimit.h>
48 #include <linux/seccomp.h>
49 #include <linux/if_vlan.h>
50 #include <linux/bpf.h>
51 #include <linux/btf.h>
52 #include <net/sch_generic.h>
53 #include <net/cls_cgroup.h>
54 #include <net/dst_metadata.h>
55 #include <net/dst.h>
56 #include <net/sock_reuseport.h>
57 #include <net/busy_poll.h>
58 #include <net/tcp.h>
59 #include <net/xfrm.h>
60 #include <net/udp.h>
61 #include <linux/bpf_trace.h>
62 #include <net/xdp_sock.h>
63 #include <linux/inetdevice.h>
64 #include <net/inet_hashtables.h>
65 #include <net/inet6_hashtables.h>
66 #include <net/ip_fib.h>
67 #include <net/nexthop.h>
68 #include <net/flow.h>
69 #include <net/arp.h>
70 #include <net/ipv6.h>
71 #include <net/net_namespace.h>
72 #include <linux/seg6_local.h>
73 #include <net/seg6.h>
74 #include <net/seg6_local.h>
75 #include <net/lwtunnel.h>
76 #include <net/ipv6_stubs.h>
77 #include <net/bpf_sk_storage.h>
78 #include <net/transp_v6.h>
79 #include <linux/btf_ids.h>
80 #include <net/tls.h>
81 #include <net/xdp.h>
82 #include <net/mptcp.h>
83 #include <net/netfilter/nf_conntrack_bpf.h>
84 #include <linux/un.h>
85 #include <net/xdp_sock_drv.h>
86
87 static const struct bpf_func_proto *
88 bpf_sk_base_func_proto(enum bpf_func_id func_id);
89
90 int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
91 {
92 if (in_compat_syscall()) {
93 struct compat_sock_fprog f32;
94
95 if (len != sizeof(f32))
96 return -EINVAL;
97 if (copy_from_sockptr(&f32, src, sizeof(f32)))
98 return -EFAULT;
99 memset(dst, 0, sizeof(*dst));
100 dst->len = f32.len;
101 dst->filter = compat_ptr(f32.filter);
102 } else {
103 if (len != sizeof(*dst))
104 return -EINVAL;
105 if (copy_from_sockptr(dst, src, sizeof(*dst)))
106 return -EFAULT;
107 }
108
109 return 0;
110 }
111 EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
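/* Example (illustrative sketch only, not part of this file): a typical
 * caller is the SO_ATTACH_FILTER handling in sock_setsockopt(), which
 * copies the fprog from user space and then attaches it; exact code
 * differs between kernel versions:
 *
 *	struct sock_fprog fprog;
 *	int ret;
 *
 *	ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
 *	if (!ret)
 *		ret = sk_attach_filter(&fprog, sk);
 */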
112
113 /**
114 * sk_filter_trim_cap - run a packet through a socket filter
115 * @sk: sock associated with &sk_buff
116 * @skb: buffer to filter
117 * @cap: limit on how short the eBPF program may trim the packet
118 *
119 * Run the eBPF program and then trim skb->data to the size returned by
120 * the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
121 * than pkt_len we keep the whole skb->data. This is the socket level
122 * wrapper to bpf_prog_run. It returns 0 if the packet should
123 * be accepted or -EPERM if the packet should be tossed.
124 *
125 */
126 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
127 {
128 int err;
129 struct sk_filter *filter;
130
131 /*
132 * If the skb was allocated from pfmemalloc reserves, only
133 * allow SOCK_MEMALLOC sockets to use it as this socket is
134 * helping free memory
135 */
136 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
137 NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
138 return -ENOMEM;
139 }
140 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
141 if (err)
142 return err;
143
144 err = security_sock_rcv_skb(sk, skb);
145 if (err)
146 return err;
147
148 rcu_read_lock();
149 filter = rcu_dereference(sk->sk_filter);
150 if (filter) {
151 struct sock *save_sk = skb->sk;
152 unsigned int pkt_len;
153
154 skb->sk = sk;
155 pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
156 skb->sk = save_sk;
157 err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
158 }
159 rcu_read_unlock();
160
161 return err;
162 }
163 EXPORT_SYMBOL(sk_filter_trim_cap);
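/* Note: the common entry point is the sk_filter() wrapper from
 * <linux/filter.h>, which simply caps the trim at one byte (sketch):
 *
 *	static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 *	{
 *		return sk_filter_trim_cap(sk, skb, 1);
 *	}
 */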
164
165 BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
166 {
167 return skb_get_poff(skb);
168 }
169
170 BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
171 {
172 struct nlattr *nla;
173
174 if (skb_is_nonlinear(skb))
175 return 0;
176
177 if (skb->len < sizeof(struct nlattr))
178 return 0;
179
180 if (a > skb->len - sizeof(struct nlattr))
181 return 0;
182
183 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
184 if (nla)
185 return (void *) nla - (void *) skb->data;
186
187 return 0;
188 }
189
190 BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
191 {
192 struct nlattr *nla;
193
194 if (skb_is_nonlinear(skb))
195 return 0;
196
197 if (skb->len < sizeof(struct nlattr))
198 return 0;
199
200 if (a > skb->len - sizeof(struct nlattr))
201 return 0;
202
203 nla = (struct nlattr *) &skb->data[a];
204 if (nla->nla_len > skb->len - a)
205 return 0;
206
207 nla = nla_find_nested(nla, x);
208 if (nla)
209 return (void *) nla - (void *) skb->data;
210
211 return 0;
212 }
213
214 static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
215 {
216 if (likely(offset >= 0))
217 return offset;
218
219 if (offset >= SKF_NET_OFF)
220 return offset - SKF_NET_OFF + skb_network_offset(skb);
221
222 if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
223 return offset - SKF_LL_OFF + skb_mac_offset(skb);
224
225 return INT_MIN;
226 }
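/* Worked example (values from <uapi/linux/filter.h>, shown for
 * illustration): a classic BPF "ld [SKF_NET_OFF + 9]" arrives here as
 * offset = -0x100000 + 9. Since SKF_NET_OFF == -0x100000, the helper
 * returns 9 + skb_network_offset(skb), i.e. the IPv4 protocol field
 * relative to skb->data. SKF_LL_OFF (-0x200000) works the same way but
 * relative to the MAC header, and only if one was set.
 */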
227
228 BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
229 data, int, headlen, int, offset)
230 {
231 u8 tmp;
232 const int len = sizeof(tmp);
233
234 offset = bpf_skb_load_helper_convert_offset(skb, offset);
235 if (offset == INT_MIN)
236 return -EFAULT;
237
238 if (headlen - offset >= len)
239 return *(u8 *)(data + offset);
240 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
241 return tmp;
242 else
243 return -EFAULT;
244 }
245
246 BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
247 int, offset)
248 {
249 return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len,
250 offset);
251 }
252
253 BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
254 data, int, headlen, int, offset)
255 {
256 __be16 tmp;
257 const int len = sizeof(tmp);
258
259 offset = bpf_skb_load_helper_convert_offset(skb, offset);
260 if (offset == INT_MIN)
261 return -EFAULT;
262
263 if (headlen - offset >= len)
264 return get_unaligned_be16(data + offset);
265 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
266 return be16_to_cpu(tmp);
267 else
268 return -EFAULT;
269 }
270
271 BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
272 int, offset)
273 {
274 return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len,
275 offset);
276 }
277
278 BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
279 data, int, headlen, int, offset)
280 {
281 __be32 tmp;
282 const int len = sizeof(tmp);
283
284 offset = bpf_skb_load_helper_convert_offset(skb, offset);
285 if (offset == INT_MIN)
286 return -EFAULT;
287
288 if (headlen - offset >= len)
289 return get_unaligned_be32(data + offset);
290 if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
291 return be32_to_cpu(tmp);
292 else
293 return -EFAULT;
294 }
295
296 BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
297 int, offset)
298 {
299 return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len,
300 offset);
301 }
302
303 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
304 struct bpf_insn *insn_buf)
305 {
306 struct bpf_insn *insn = insn_buf;
307
308 switch (skb_field) {
309 case SKF_AD_MARK:
310 BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);
311
312 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
313 offsetof(struct sk_buff, mark));
314 break;
315
316 case SKF_AD_PKTTYPE:
317 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
318 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
319 #ifdef __BIG_ENDIAN_BITFIELD
320 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
321 #endif
322 break;
323
324 case SKF_AD_QUEUE:
325 BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);
326
327 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
328 offsetof(struct sk_buff, queue_mapping));
329 break;
330
331 case SKF_AD_VLAN_TAG:
332 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);
333
334 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
335 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
336 offsetof(struct sk_buff, vlan_tci));
337 break;
338 case SKF_AD_VLAN_TAG_PRESENT:
339 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);
340 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
341 offsetof(struct sk_buff, vlan_all));
342 *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
343 *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
344 break;
345 }
346
347 return insn - insn_buf;
348 }
349
350 static bool convert_bpf_extensions(struct sock_filter *fp,
351 struct bpf_insn **insnp)
352 {
353 struct bpf_insn *insn = *insnp;
354 u32 cnt;
355
356 switch (fp->k) {
357 case SKF_AD_OFF + SKF_AD_PROTOCOL:
358 BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);
359
360 /* A = *(u16 *) (CTX + offsetof(protocol)) */
361 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
362 offsetof(struct sk_buff, protocol));
363 /* A = ntohs(A) [emitting a nop or swap16] */
364 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
365 break;
366
367 case SKF_AD_OFF + SKF_AD_PKTTYPE:
368 cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
369 insn += cnt - 1;
370 break;
371
372 case SKF_AD_OFF + SKF_AD_IFINDEX:
373 case SKF_AD_OFF + SKF_AD_HATYPE:
374 BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
375 BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);
376
377 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
378 BPF_REG_TMP, BPF_REG_CTX,
379 offsetof(struct sk_buff, dev));
380 /* if (tmp != 0) goto pc + 1 */
381 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
382 *insn++ = BPF_EXIT_INSN();
383 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
384 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
385 offsetof(struct net_device, ifindex));
386 else
387 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
388 offsetof(struct net_device, type));
389 break;
390
391 case SKF_AD_OFF + SKF_AD_MARK:
392 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
393 insn += cnt - 1;
394 break;
395
396 case SKF_AD_OFF + SKF_AD_RXHASH:
397 BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);
398
399 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
400 offsetof(struct sk_buff, hash));
401 break;
402
403 case SKF_AD_OFF + SKF_AD_QUEUE:
404 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
405 insn += cnt - 1;
406 break;
407
408 case SKF_AD_OFF + SKF_AD_VLAN_TAG:
409 cnt = convert_skb_access(SKF_AD_VLAN_TAG,
410 BPF_REG_A, BPF_REG_CTX, insn);
411 insn += cnt - 1;
412 break;
413
414 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
415 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
416 BPF_REG_A, BPF_REG_CTX, insn);
417 insn += cnt - 1;
418 break;
419
420 case SKF_AD_OFF + SKF_AD_VLAN_TPID:
421 BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);
422
423 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
424 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
425 offsetof(struct sk_buff, vlan_proto));
426 /* A = ntohs(A) [emitting a nop or swap16] */
427 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
428 break;
429
430 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
431 case SKF_AD_OFF + SKF_AD_NLATTR:
432 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
433 case SKF_AD_OFF + SKF_AD_CPU:
434 case SKF_AD_OFF + SKF_AD_RANDOM:
435 /* arg1 = CTX */
436 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
437 /* arg2 = A */
438 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
439 /* arg3 = X */
440 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
441 /* Emit call(arg1=CTX, arg2=A, arg3=X) */
442 switch (fp->k) {
443 case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
444 *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
445 break;
446 case SKF_AD_OFF + SKF_AD_NLATTR:
447 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
448 break;
449 case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
450 *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
451 break;
452 case SKF_AD_OFF + SKF_AD_CPU:
453 *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
454 break;
455 case SKF_AD_OFF + SKF_AD_RANDOM:
456 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
457 bpf_user_rnd_init_once();
458 break;
459 }
460 break;
461
462 case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
463 /* A ^= X */
464 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
465 break;
466
467 default:
468 /* This is just a dummy call to avoid letting the compiler
469 * evict __bpf_call_base() as an optimization. Placed here
470 * where no-one bothers.
471 */
472 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
473 return false;
474 }
475
476 *insnp = insn;
477 return true;
478 }
479
480 static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
481 {
482 const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
483 int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
484 bool endian = BPF_SIZE(fp->code) == BPF_H ||
485 BPF_SIZE(fp->code) == BPF_W;
486 bool indirect = BPF_MODE(fp->code) == BPF_IND;
487 const int ip_align = NET_IP_ALIGN;
488 struct bpf_insn *insn = *insnp;
489 int offset = fp->k;
490
491 if (!indirect &&
492 ((unaligned_ok && offset >= 0) ||
493 (!unaligned_ok && offset >= 0 &&
494 offset + ip_align >= 0 &&
495 offset + ip_align % size == 0))) {
496 bool ldx_off_ok = offset <= S16_MAX;
497
498 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
499 if (offset)
500 *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
501 *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
502 size, 2 + endian + (!ldx_off_ok * 2));
503 if (ldx_off_ok) {
504 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
505 BPF_REG_D, offset);
506 } else {
507 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
508 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
509 *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
510 BPF_REG_TMP, 0);
511 }
512 if (endian)
513 *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
514 *insn++ = BPF_JMP_A(8);
515 }
516
517 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
518 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
519 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
520 if (!indirect) {
521 *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
522 } else {
523 *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
524 if (fp->k)
525 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
526 }
527
528 switch (BPF_SIZE(fp->code)) {
529 case BPF_B:
530 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
531 break;
532 case BPF_H:
533 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
534 break;
535 case BPF_W:
536 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
537 break;
538 default:
539 return false;
540 }
541
542 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
543 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
544 *insn = BPF_EXIT_INSN();
545
546 *insnp = insn;
547 return true;
548 }
549
550 /**
551 * bpf_convert_filter - convert filter program
552 * @prog: the user passed filter program
553 * @len: the length of the user passed filter program
554 * @new_prog: allocated 'struct bpf_prog' or NULL
555 * @new_len: pointer to store length of converted program
556 * @seen_ld_abs: bool whether we've seen ld_abs/ind
557 *
558 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
559 * style extended BPF (eBPF).
560 * Conversion workflow:
561 *
562 * 1) First pass for calculating the new program length:
563 * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
564 *
565 * 2) 2nd pass to remap in two passes: 1st pass finds new
566 * jump offsets, 2nd pass remapping:
567 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
568 */
569 static int bpf_convert_filter(struct sock_filter *prog, int len,
570 struct bpf_prog *new_prog, int *new_len,
571 bool *seen_ld_abs)
572 {
573 int new_flen = 0, pass = 0, target, i, stack_off;
574 struct bpf_insn *new_insn, *first_insn = NULL;
575 struct sock_filter *fp;
576 int *addrs = NULL;
577 u8 bpf_src;
578
579 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
580 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
581
582 if (len <= 0 || len > BPF_MAXINSNS)
583 return -EINVAL;
584
585 if (new_prog) {
586 first_insn = new_prog->insnsi;
587 addrs = kcalloc(len, sizeof(*addrs),
588 GFP_KERNEL | __GFP_NOWARN);
589 if (!addrs)
590 return -ENOMEM;
591 }
592
593 do_pass:
594 new_insn = first_insn;
595 fp = prog;
596
597 /* Classic BPF related prologue emission. */
598 if (new_prog) {
599 /* Classic BPF expects A and X to be reset first. These need
600 * to be guaranteed to be the first two instructions.
601 */
602 *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
603 *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
604
605 /* All programs must keep CTX in callee saved BPF_REG_CTX.
606 * In eBPF case it's done by the compiler, here we need to
607 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
608 */
609 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
610 if (*seen_ld_abs) {
611 /* For packet access in classic BPF, cache skb->data
612 * in callee-saved BPF R8 and skb->len - skb->data_len
613 * (headlen) in BPF R9. Since classic BPF is read-only
614 * on CTX, we only need to cache it once.
615 */
616 *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
617 BPF_REG_D, BPF_REG_CTX,
618 offsetof(struct sk_buff, data));
619 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
620 offsetof(struct sk_buff, len));
621 *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
622 offsetof(struct sk_buff, data_len));
623 *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
624 }
625 } else {
626 new_insn += 3;
627 }
628
629 for (i = 0; i < len; fp++, i++) {
630 struct bpf_insn tmp_insns[32] = { };
631 struct bpf_insn *insn = tmp_insns;
632
633 if (addrs)
634 addrs[i] = new_insn - first_insn;
635
636 switch (fp->code) {
637 /* All arithmetic insns and skb loads map as-is. */
638 case BPF_ALU | BPF_ADD | BPF_X:
639 case BPF_ALU | BPF_ADD | BPF_K:
640 case BPF_ALU | BPF_SUB | BPF_X:
641 case BPF_ALU | BPF_SUB | BPF_K:
642 case BPF_ALU | BPF_AND | BPF_X:
643 case BPF_ALU | BPF_AND | BPF_K:
644 case BPF_ALU | BPF_OR | BPF_X:
645 case BPF_ALU | BPF_OR | BPF_K:
646 case BPF_ALU | BPF_LSH | BPF_X:
647 case BPF_ALU | BPF_LSH | BPF_K:
648 case BPF_ALU | BPF_RSH | BPF_X:
649 case BPF_ALU | BPF_RSH | BPF_K:
650 case BPF_ALU | BPF_XOR | BPF_X:
651 case BPF_ALU | BPF_XOR | BPF_K:
652 case BPF_ALU | BPF_MUL | BPF_X:
653 case BPF_ALU | BPF_MUL | BPF_K:
654 case BPF_ALU | BPF_DIV | BPF_X:
655 case BPF_ALU | BPF_DIV | BPF_K:
656 case BPF_ALU | BPF_MOD | BPF_X:
657 case BPF_ALU | BPF_MOD | BPF_K:
658 case BPF_ALU | BPF_NEG:
659 case BPF_LD | BPF_ABS | BPF_W:
660 case BPF_LD | BPF_ABS | BPF_H:
661 case BPF_LD | BPF_ABS | BPF_B:
662 case BPF_LD | BPF_IND | BPF_W:
663 case BPF_LD | BPF_IND | BPF_H:
664 case BPF_LD | BPF_IND | BPF_B:
665 /* Check for overloaded BPF extension and
666 * directly convert it if found, otherwise
667 * just move on with mapping.
668 */
669 if (BPF_CLASS(fp->code) == BPF_LD &&
670 BPF_MODE(fp->code) == BPF_ABS &&
671 convert_bpf_extensions(fp, &insn))
672 break;
673 if (BPF_CLASS(fp->code) == BPF_LD &&
674 convert_bpf_ld_abs(fp, &insn)) {
675 *seen_ld_abs = true;
676 break;
677 }
678
679 if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
680 fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
681 *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
682 /* Error with exception code on div/mod by 0.
683 * For cBPF programs, this always meant returning 0.
684 */
685 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
686 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
687 *insn++ = BPF_EXIT_INSN();
688 }
689
690 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
691 break;
692
693 /* Jump transformation cannot use BPF block macros
694 * everywhere as offset calculation and target updates
695 * require a bit more work than the rest, i.e. jump
696 * opcodes map as-is, but offsets need adjustment.
697 */
698
699 #define BPF_EMIT_JMP \
700 do { \
701 const s32 off_min = S16_MIN, off_max = S16_MAX; \
702 s32 off; \
703 \
704 if (target >= len || target < 0) \
705 goto err; \
706 off = addrs ? addrs[target] - addrs[i] - 1 : 0; \
707 /* Adjust pc relative offset for 2nd or 3rd insn. */ \
708 off -= insn - tmp_insns; \
709 /* Reject anything not fitting into insn->off. */ \
710 if (off < off_min || off > off_max) \
711 goto err; \
712 insn->off = off; \
713 } while (0)
714
715 case BPF_JMP | BPF_JA:
716 target = i + fp->k + 1;
717 insn->code = fp->code;
718 BPF_EMIT_JMP;
719 break;
720
721 case BPF_JMP | BPF_JEQ | BPF_K:
722 case BPF_JMP | BPF_JEQ | BPF_X:
723 case BPF_JMP | BPF_JSET | BPF_K:
724 case BPF_JMP | BPF_JSET | BPF_X:
725 case BPF_JMP | BPF_JGT | BPF_K:
726 case BPF_JMP | BPF_JGT | BPF_X:
727 case BPF_JMP | BPF_JGE | BPF_K:
728 case BPF_JMP | BPF_JGE | BPF_X:
729 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
730 /* BPF immediates are signed, zero extend
731 * immediate into tmp register and use it
732 * in compare insn.
733 */
734 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);
735
736 insn->dst_reg = BPF_REG_A;
737 insn->src_reg = BPF_REG_TMP;
738 bpf_src = BPF_X;
739 } else {
740 insn->dst_reg = BPF_REG_A;
741 insn->imm = fp->k;
742 bpf_src = BPF_SRC(fp->code);
743 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
744 }
745
746 /* Common case where 'jump_false' is next insn. */
747 if (fp->jf == 0) {
748 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
749 target = i + fp->jt + 1;
750 BPF_EMIT_JMP;
751 break;
752 }
753
754 /* Convert some jumps when 'jump_true' is next insn. */
755 if (fp->jt == 0) {
756 switch (BPF_OP(fp->code)) {
757 case BPF_JEQ:
758 insn->code = BPF_JMP | BPF_JNE | bpf_src;
759 break;
760 case BPF_JGT:
761 insn->code = BPF_JMP | BPF_JLE | bpf_src;
762 break;
763 case BPF_JGE:
764 insn->code = BPF_JMP | BPF_JLT | bpf_src;
765 break;
766 default:
767 goto jmp_rest;
768 }
769
770 target = i + fp->jf + 1;
771 BPF_EMIT_JMP;
772 break;
773 }
774 jmp_rest:
775 /* Other jumps are mapped into two insns: Jxx and JA. */
776 target = i + fp->jt + 1;
777 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
778 BPF_EMIT_JMP;
779 insn++;
780
781 insn->code = BPF_JMP | BPF_JA;
782 target = i + fp->jf + 1;
783 BPF_EMIT_JMP;
784 break;
785
786 /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
787 case BPF_LDX | BPF_MSH | BPF_B: {
788 struct sock_filter tmp = {
789 .code = BPF_LD | BPF_ABS | BPF_B,
790 .k = fp->k,
791 };
792
793 *seen_ld_abs = true;
794
795 /* X = A */
796 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
797 /* A = BPF_R0 = *(u8 *) (skb->data + K) */
798 convert_bpf_ld_abs(&tmp, &insn);
799 insn++;
800 /* A &= 0xf */
801 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
802 /* A <<= 2 */
803 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
804 /* tmp = X */
805 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
806 /* X = A */
807 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
808 /* A = tmp */
809 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
810 break;
811 }
812 /* RET_K is remapped into 2 insns. RET_A case doesn't need an
813 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
814 */
815 case BPF_RET | BPF_A:
816 case BPF_RET | BPF_K:
817 if (BPF_RVAL(fp->code) == BPF_K)
818 *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
819 0, fp->k);
820 *insn = BPF_EXIT_INSN();
821 break;
822
823 /* Store to stack. */
824 case BPF_ST:
825 case BPF_STX:
826 stack_off = fp->k * 4 + 4;
827 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
828 BPF_ST ? BPF_REG_A : BPF_REG_X,
829 -stack_off);
830 /* check_load_and_stores() verifies that classic BPF can
831 * load from stack only after write, so tracking
832 * stack_depth for ST|STX insns is enough
833 */
834 if (new_prog && new_prog->aux->stack_depth < stack_off)
835 new_prog->aux->stack_depth = stack_off;
836 break;
837
838 /* Load from stack. */
839 case BPF_LD | BPF_MEM:
840 case BPF_LDX | BPF_MEM:
841 stack_off = fp->k * 4 + 4;
842 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
843 BPF_REG_A : BPF_REG_X, BPF_REG_FP,
844 -stack_off);
845 break;
846
847 /* A = K or X = K */
848 case BPF_LD | BPF_IMM:
849 case BPF_LDX | BPF_IMM:
850 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
851 BPF_REG_A : BPF_REG_X, fp->k);
852 break;
853
854 /* X = A */
855 case BPF_MISC | BPF_TAX:
856 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
857 break;
858
859 /* A = X */
860 case BPF_MISC | BPF_TXA:
861 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
862 break;
863
864 /* A = skb->len or X = skb->len */
865 case BPF_LD | BPF_W | BPF_LEN:
866 case BPF_LDX | BPF_W | BPF_LEN:
867 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
868 BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
869 offsetof(struct sk_buff, len));
870 break;
871
872 /* Access seccomp_data fields. */
873 case BPF_LDX | BPF_ABS | BPF_W:
874 /* A = *(u32 *) (ctx + K) */
875 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
876 break;
877
878 /* Unknown instruction. */
879 default:
880 goto err;
881 }
882
883 insn++;
884 if (new_prog)
885 memcpy(new_insn, tmp_insns,
886 sizeof(*insn) * (insn - tmp_insns));
887 new_insn += insn - tmp_insns;
888 }
889
890 if (!new_prog) {
891 /* Only calculating new length. */
892 *new_len = new_insn - first_insn;
893 if (*seen_ld_abs)
894 *new_len += 4; /* Prologue bits. */
895 return 0;
896 }
897
898 pass++;
899 if (new_flen != new_insn - first_insn) {
900 new_flen = new_insn - first_insn;
901 if (pass > 2)
902 goto err;
903 goto do_pass;
904 }
905
906 kfree(addrs);
907 BUG_ON(*new_len != new_flen);
908 return 0;
909 err:
910 kfree(addrs);
911 return -EINVAL;
912 }
913
914 /* Security:
915 *
916 * As we don't want to clear the mem[] array for each packet going through
917 * __bpf_prog_run(), we check that a filter loaded by the user never tries
918 * to read a cell that was not previously written, and we check all branches
919 * to be sure a malicious user doesn't try to abuse us.
920 */
921 static int check_load_and_stores(const struct sock_filter *filter, int flen)
922 {
923 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
924 int pc, ret = 0;
925
926 BUILD_BUG_ON(BPF_MEMWORDS > 16);
927
928 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
929 if (!masks)
930 return -ENOMEM;
931
932 memset(masks, 0xff, flen * sizeof(*masks));
933
934 for (pc = 0; pc < flen; pc++) {
935 memvalid &= masks[pc];
936
937 switch (filter[pc].code) {
938 case BPF_ST:
939 case BPF_STX:
940 memvalid |= (1 << filter[pc].k);
941 break;
942 case BPF_LD | BPF_MEM:
943 case BPF_LDX | BPF_MEM:
944 if (!(memvalid & (1 << filter[pc].k))) {
945 ret = -EINVAL;
946 goto error;
947 }
948 break;
949 case BPF_JMP | BPF_JA:
950 /* A jump must set masks on target */
951 masks[pc + 1 + filter[pc].k] &= memvalid;
952 memvalid = ~0;
953 break;
954 case BPF_JMP | BPF_JEQ | BPF_K:
955 case BPF_JMP | BPF_JEQ | BPF_X:
956 case BPF_JMP | BPF_JGE | BPF_K:
957 case BPF_JMP | BPF_JGE | BPF_X:
958 case BPF_JMP | BPF_JGT | BPF_K:
959 case BPF_JMP | BPF_JGT | BPF_X:
960 case BPF_JMP | BPF_JSET | BPF_K:
961 case BPF_JMP | BPF_JSET | BPF_X:
962 /* A jump must set masks on targets */
963 masks[pc + 1 + filter[pc].jt] &= memvalid;
964 masks[pc + 1 + filter[pc].jf] &= memvalid;
965 memvalid = ~0;
966 break;
967 }
968 }
969 error:
970 kfree(masks);
971 return ret;
972 }
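/* Example of a program this check rejects (illustrative sketch): M[0]
 * is read before any ST/STX ever wrote it, so the masks never mark the
 * cell valid and check_load_and_stores() returns -EINVAL:
 *
 *	struct sock_filter bad[] = {
 *		BPF_STMT(BPF_LD | BPF_MEM, 0),	// A = mem[0], never written
 *		BPF_STMT(BPF_RET | BPF_A, 0),
 *	};
 */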
973
974 static bool chk_code_allowed(u16 code_to_probe)
975 {
976 static const bool codes[] = {
977 /* 32 bit ALU operations */
978 [BPF_ALU | BPF_ADD | BPF_K] = true,
979 [BPF_ALU | BPF_ADD | BPF_X] = true,
980 [BPF_ALU | BPF_SUB | BPF_K] = true,
981 [BPF_ALU | BPF_SUB | BPF_X] = true,
982 [BPF_ALU | BPF_MUL | BPF_K] = true,
983 [BPF_ALU | BPF_MUL | BPF_X] = true,
984 [BPF_ALU | BPF_DIV | BPF_K] = true,
985 [BPF_ALU | BPF_DIV | BPF_X] = true,
986 [BPF_ALU | BPF_MOD | BPF_K] = true,
987 [BPF_ALU | BPF_MOD | BPF_X] = true,
988 [BPF_ALU | BPF_AND | BPF_K] = true,
989 [BPF_ALU | BPF_AND | BPF_X] = true,
990 [BPF_ALU | BPF_OR | BPF_K] = true,
991 [BPF_ALU | BPF_OR | BPF_X] = true,
992 [BPF_ALU | BPF_XOR | BPF_K] = true,
993 [BPF_ALU | BPF_XOR | BPF_X] = true,
994 [BPF_ALU | BPF_LSH | BPF_K] = true,
995 [BPF_ALU | BPF_LSH | BPF_X] = true,
996 [BPF_ALU | BPF_RSH | BPF_K] = true,
997 [BPF_ALU | BPF_RSH | BPF_X] = true,
998 [BPF_ALU | BPF_NEG] = true,
999 /* Load instructions */
1000 [BPF_LD | BPF_W | BPF_ABS] = true,
1001 [BPF_LD | BPF_H | BPF_ABS] = true,
1002 [BPF_LD | BPF_B | BPF_ABS] = true,
1003 [BPF_LD | BPF_W | BPF_LEN] = true,
1004 [BPF_LD | BPF_W | BPF_IND] = true,
1005 [BPF_LD | BPF_H | BPF_IND] = true,
1006 [BPF_LD | BPF_B | BPF_IND] = true,
1007 [BPF_LD | BPF_IMM] = true,
1008 [BPF_LD | BPF_MEM] = true,
1009 [BPF_LDX | BPF_W | BPF_LEN] = true,
1010 [BPF_LDX | BPF_B | BPF_MSH] = true,
1011 [BPF_LDX | BPF_IMM] = true,
1012 [BPF_LDX | BPF_MEM] = true,
1013 /* Store instructions */
1014 [BPF_ST] = true,
1015 [BPF_STX] = true,
1016 /* Misc instructions */
1017 [BPF_MISC | BPF_TAX] = true,
1018 [BPF_MISC | BPF_TXA] = true,
1019 /* Return instructions */
1020 [BPF_RET | BPF_K] = true,
1021 [BPF_RET | BPF_A] = true,
1022 /* Jump instructions */
1023 [BPF_JMP | BPF_JA] = true,
1024 [BPF_JMP | BPF_JEQ | BPF_K] = true,
1025 [BPF_JMP | BPF_JEQ | BPF_X] = true,
1026 [BPF_JMP | BPF_JGE | BPF_K] = true,
1027 [BPF_JMP | BPF_JGE | BPF_X] = true,
1028 [BPF_JMP | BPF_JGT | BPF_K] = true,
1029 [BPF_JMP | BPF_JGT | BPF_X] = true,
1030 [BPF_JMP | BPF_JSET | BPF_K] = true,
1031 [BPF_JMP | BPF_JSET | BPF_X] = true,
1032 };
1033
1034 if (code_to_probe >= ARRAY_SIZE(codes))
1035 return false;
1036
1037 return codes[code_to_probe];
1038 }
1039
1040 static bool bpf_check_basics_ok(const struct sock_filter *filter,
1041 unsigned int flen)
1042 {
1043 if (filter == NULL)
1044 return false;
1045 if (flen == 0 || flen > BPF_MAXINSNS)
1046 return false;
1047
1048 return true;
1049 }
1050
1051 /**
1052 * bpf_check_classic - verify socket filter code
1053 * @filter: filter to verify
1054 * @flen: length of filter
1055 *
1056 * Check the user's filter code. If we let some ugly
1057 * filter code slip through kaboom! The filter must contain
1058 * no references or jumps that are out of range, no illegal
1059 * instructions, and must end with a RET instruction.
1060 *
1061 * All jumps are forward as they are not signed.
1062 *
1063 * Returns 0 if the rule set is legal or -EINVAL if not.
1064 */
1065 static int bpf_check_classic(const struct sock_filter *filter,
1066 unsigned int flen)
1067 {
1068 bool anc_found;
1069 int pc;
1070
1071 /* Check the filter code now */
1072 for (pc = 0; pc < flen; pc++) {
1073 const struct sock_filter *ftest = &filter[pc];
1074
1075 /* May we actually operate on this code? */
1076 if (!chk_code_allowed(ftest->code))
1077 return -EINVAL;
1078
1079 /* Some instructions need special checks */
1080 switch (ftest->code) {
1081 case BPF_ALU | BPF_DIV | BPF_K:
1082 case BPF_ALU | BPF_MOD | BPF_K:
1083 /* Check for division by zero */
1084 if (ftest->k == 0)
1085 return -EINVAL;
1086 break;
1087 case BPF_ALU | BPF_LSH | BPF_K:
1088 case BPF_ALU | BPF_RSH | BPF_K:
1089 if (ftest->k >= 32)
1090 return -EINVAL;
1091 break;
1092 case BPF_LD | BPF_MEM:
1093 case BPF_LDX | BPF_MEM:
1094 case BPF_ST:
1095 case BPF_STX:
1096 /* Check for invalid memory addresses */
1097 if (ftest->k >= BPF_MEMWORDS)
1098 return -EINVAL;
1099 break;
1100 case BPF_JMP | BPF_JA:
1101 /* Note, the large ftest->k might cause loops.
1102 * Compare this with conditional jumps below,
1103 * where offsets are limited. --ANK (981016)
1104 */
1105 if (ftest->k >= (unsigned int)(flen - pc - 1))
1106 return -EINVAL;
1107 break;
1108 case BPF_JMP | BPF_JEQ | BPF_K:
1109 case BPF_JMP | BPF_JEQ | BPF_X:
1110 case BPF_JMP | BPF_JGE | BPF_K:
1111 case BPF_JMP | BPF_JGE | BPF_X:
1112 case BPF_JMP | BPF_JGT | BPF_K:
1113 case BPF_JMP | BPF_JGT | BPF_X:
1114 case BPF_JMP | BPF_JSET | BPF_K:
1115 case BPF_JMP | BPF_JSET | BPF_X:
1116 /* Both conditionals must be safe */
1117 if (pc + ftest->jt + 1 >= flen ||
1118 pc + ftest->jf + 1 >= flen)
1119 return -EINVAL;
1120 break;
1121 case BPF_LD | BPF_W | BPF_ABS:
1122 case BPF_LD | BPF_H | BPF_ABS:
1123 case BPF_LD | BPF_B | BPF_ABS:
1124 anc_found = false;
1125 if (bpf_anc_helper(ftest) & BPF_ANC)
1126 anc_found = true;
1127 /* Ancillary operation unknown or unsupported */
1128 if (anc_found == false && ftest->k >= SKF_AD_OFF)
1129 return -EINVAL;
1130 }
1131 }
1132
1133 /* Last instruction must be a RET code */
1134 switch (filter[flen - 1].code) {
1135 case BPF_RET | BPF_K:
1136 case BPF_RET | BPF_A:
1137 return check_load_and_stores(filter, flen);
1138 }
1139
1140 return -EINVAL;
1141 }
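/* Example of a minimal program that passes bpf_check_classic()
 * (illustrative sketch): a single "accept everything" return, the same
 * filter a tcpdump-style "ret #-1" would produce:
 *
 *	struct sock_filter accept_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *
 * bpf_check_classic(accept_all, ARRAY_SIZE(accept_all)) returns 0.
 */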
1142
1143 static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
1144 const struct sock_fprog *fprog)
1145 {
1146 unsigned int fsize = bpf_classic_proglen(fprog);
1147 struct sock_fprog_kern *fkprog;
1148
1149 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
1150 if (!fp->orig_prog)
1151 return -ENOMEM;
1152
1153 fkprog = fp->orig_prog;
1154 fkprog->len = fprog->len;
1155
1156 fkprog->filter = kmemdup(fp->insns, fsize,
1157 GFP_KERNEL | __GFP_NOWARN);
1158 if (!fkprog->filter) {
1159 kfree(fp->orig_prog);
1160 return -ENOMEM;
1161 }
1162
1163 return 0;
1164 }
1165
1166 static void bpf_release_orig_filter(struct bpf_prog *fp)
1167 {
1168 struct sock_fprog_kern *fprog = fp->orig_prog;
1169
1170 if (fprog) {
1171 kfree(fprog->filter);
1172 kfree(fprog);
1173 }
1174 }
1175
1176 static void __bpf_prog_release(struct bpf_prog *prog)
1177 {
1178 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
1179 bpf_prog_put(prog);
1180 } else {
1181 bpf_release_orig_filter(prog);
1182 bpf_prog_free(prog);
1183 }
1184 }
1185
1186 static void __sk_filter_release(struct sk_filter *fp)
1187 {
1188 __bpf_prog_release(fp->prog);
1189 kfree(fp);
1190 }
1191
1192 /**
1193 * sk_filter_release_rcu - Release a socket filter by rcu_head
1194 * @rcu: rcu_head that contains the sk_filter to free
1195 */
1196 static void sk_filter_release_rcu(struct rcu_head *rcu)
1197 {
1198 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
1199
1200 __sk_filter_release(fp);
1201 }
1202
1203 /**
1204 * sk_filter_release - release a socket filter
1205 * @fp: filter to remove
1206 *
1207 * Remove a filter from a socket and release its resources.
1208 */
1209 static void sk_filter_release(struct sk_filter *fp)
1210 {
1211 if (refcount_dec_and_test(&fp->refcnt))
1212 call_rcu(&fp->rcu, sk_filter_release_rcu);
1213 }
1214
1215 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
1216 {
1217 u32 filter_size = bpf_prog_size(fp->prog->len);
1218
1219 atomic_sub(filter_size, &sk->sk_omem_alloc);
1220 sk_filter_release(fp);
1221 }
1222
1223 /* try to charge the socket memory if there is space available
1224 * return true on success
1225 */
1226 static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1227 {
1228 u32 filter_size = bpf_prog_size(fp->prog->len);
1229 int optmem_max = READ_ONCE(sysctl_optmem_max);
1230
1231 /* same check as in sock_kmalloc() */
1232 if (filter_size <= optmem_max &&
1233 atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) {
1234 atomic_add(filter_size, &sk->sk_omem_alloc);
1235 return true;
1236 }
1237 return false;
1238 }
1239
1240 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1241 {
1242 if (!refcount_inc_not_zero(&fp->refcnt))
1243 return false;
1244
1245 if (!__sk_filter_charge(sk, fp)) {
1246 sk_filter_release(fp);
1247 return false;
1248 }
1249 return true;
1250 }
1251
1252 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
1253 {
1254 struct sock_filter *old_prog;
1255 struct bpf_prog *old_fp;
1256 int err, new_len, old_len = fp->len;
1257 bool seen_ld_abs = false;
1258
1259 /* We are free to overwrite insns et al right here as they won't be used
1260 * anymore internally after the migration to the eBPF instruction
1261 * representation.
1262 */
1263 BUILD_BUG_ON(sizeof(struct sock_filter) !=
1264 sizeof(struct bpf_insn));
1265
1266 /* Conversion cannot happen on overlapping memory areas,
1267 * so we need to keep the user BPF around until the 2nd
1268 * pass. At this time, the user BPF is stored in fp->insns.
1269 */
1270 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
1271 GFP_KERNEL | __GFP_NOWARN);
1272 if (!old_prog) {
1273 err = -ENOMEM;
1274 goto out_err;
1275 }
1276
1277 /* 1st pass: calculate the new program length. */
1278 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
1279 &seen_ld_abs);
1280 if (err)
1281 goto out_err_free;
1282
1283 /* Expand fp for appending the new filter representation. */
1284 old_fp = fp;
1285 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
1286 if (!fp) {
1287 /* The old_fp is still around in case we couldn't
1288 * allocate new memory, so uncharge on that one.
1289 */
1290 fp = old_fp;
1291 err = -ENOMEM;
1292 goto out_err_free;
1293 }
1294
1295 fp->len = new_len;
1296
1297 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
1298 err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
1299 &seen_ld_abs);
1300 if (err)
1301 /* 2nd bpf_convert_filter() can fail only if it fails
1302 * to allocate memory, remapping must succeed. Note,
1303 * that at this time old_fp has already been released
1304 * by krealloc().
1305 */
1306 goto out_err_free;
1307
1308 fp = bpf_prog_select_runtime(fp, &err);
1309 if (err)
1310 goto out_err_free;
1311
1312 kfree(old_prog);
1313 return fp;
1314
1315 out_err_free:
1316 kfree(old_prog);
1317 out_err:
1318 __bpf_prog_release(fp);
1319 return ERR_PTR(err);
1320 }
1321
1322 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
1323 bpf_aux_classic_check_t trans)
1324 {
1325 int err;
1326
1327 fp->bpf_func = NULL;
1328 fp->jited = 0;
1329
1330 err = bpf_check_classic(fp->insns, fp->len);
1331 if (err) {
1332 __bpf_prog_release(fp);
1333 return ERR_PTR(err);
1334 }
1335
1336 /* There might be additional checks and transformations
1337 * needed on classic filters, f.e. in case of seccomp.
1338 */
1339 if (trans) {
1340 err = trans(fp->insns, fp->len);
1341 if (err) {
1342 __bpf_prog_release(fp);
1343 return ERR_PTR(err);
1344 }
1345 }
1346
1347 /* Probe if we can JIT compile the filter and if so, do
1348 * the compilation of the filter.
1349 */
1350 bpf_jit_compile(fp);
1351
1352 /* JIT compiler couldn't process this filter, so do the eBPF translation
1353 * for the optimized interpreter.
1354 */
1355 if (!fp->jited)
1356 fp = bpf_migrate_filter(fp);
1357
1358 return fp;
1359 }
1360
1361 /**
1362 * bpf_prog_create - create an unattached filter
1363 * @pfp: the unattached filter that is created
1364 * @fprog: the filter program
1365 *
1366 * Create a filter independent of any socket. We first run some
1367 * sanity checks on it to make sure it does not explode on us later.
1368 * If an error occurs or there is insufficient memory for the filter
1369 * a negative errno code is returned. On success the return is zero.
1370 */
1371 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
1372 {
1373 unsigned int fsize = bpf_classic_proglen(fprog);
1374 struct bpf_prog *fp;
1375
1376 /* Make sure new filter is there and in the right amounts. */
1377 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1378 return -EINVAL;
1379
1380 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1381 if (!fp)
1382 return -ENOMEM;
1383
1384 memcpy(fp->insns, fprog->filter, fsize);
1385
1386 fp->len = fprog->len;
1387 /* Since unattached filters are not copied back to user
1388 * space through sk_get_filter(), we do not need to hold
1389 * a copy here, and can spare us the work.
1390 */
1391 fp->orig_prog = NULL;
1392
1393 /* bpf_prepare_filter() already takes care of freeing
1394 * memory in case something goes wrong.
1395 */
1396 fp = bpf_prepare_filter(fp, NULL);
1397 if (IS_ERR(fp))
1398 return PTR_ERR(fp);
1399
1400 *pfp = fp;
1401 return 0;
1402 }
1403 EXPORT_SYMBOL_GPL(bpf_prog_create);
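/* Usage sketch (illustrative only) for an in-kernel caller that wants an
 * unattached classic filter, e.g. an accept-all program:
 *
 *	static struct sock_filter accept_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *	static struct sock_fprog_kern accept_fprog = {
 *		.len	= ARRAY_SIZE(accept_all),
 *		.filter	= accept_all,
 *	};
 *	struct bpf_prog *prog;
 *	int err;
 *
 *	err = bpf_prog_create(&prog, &accept_fprog);
 *	if (!err) {
 *		// ... run via bpf_prog_run(prog, skb) ...
 *		bpf_prog_destroy(prog);
 *	}
 */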
1404
1405 /**
1406 * bpf_prog_create_from_user - create an unattached filter from user buffer
1407 * @pfp: the unattached filter that is created
1408 * @fprog: the filter program
1409 * @trans: post-classic verifier transformation handler
1410 * @save_orig: save classic BPF program
1411 *
1412 * This function effectively does the same as bpf_prog_create(), only
1413 * that it builds up its insns buffer from user space provided buffer.
1414 * It also allows for passing a bpf_aux_classic_check_t handler.
1415 */
1416 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
1417 bpf_aux_classic_check_t trans, bool save_orig)
1418 {
1419 unsigned int fsize = bpf_classic_proglen(fprog);
1420 struct bpf_prog *fp;
1421 int err;
1422
1423 /* Make sure new filter is there and in the right amounts. */
1424 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1425 return -EINVAL;
1426
1427 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1428 if (!fp)
1429 return -ENOMEM;
1430
1431 if (copy_from_user(fp->insns, fprog->filter, fsize)) {
1432 __bpf_prog_free(fp);
1433 return -EFAULT;
1434 }
1435
1436 fp->len = fprog->len;
1437 fp->orig_prog = NULL;
1438
1439 if (save_orig) {
1440 err = bpf_prog_store_orig_filter(fp, fprog);
1441 if (err) {
1442 __bpf_prog_free(fp);
1443 return -ENOMEM;
1444 }
1445 }
1446
1447 /* bpf_prepare_filter() already takes care of freeing
1448 * memory in case something goes wrong.
1449 */
1450 fp = bpf_prepare_filter(fp, trans);
1451 if (IS_ERR(fp))
1452 return PTR_ERR(fp);
1453
1454 *pfp = fp;
1455 return 0;
1456 }
1457 EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);
1458
1459 void bpf_prog_destroy(struct bpf_prog *fp)
1460 {
1461 __bpf_prog_release(fp);
1462 }
1463 EXPORT_SYMBOL_GPL(bpf_prog_destroy);
1464
1465 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
1466 {
1467 struct sk_filter *fp, *old_fp;
1468
1469 fp = kmalloc(sizeof(*fp), GFP_KERNEL);
1470 if (!fp)
1471 return -ENOMEM;
1472
1473 fp->prog = prog;
1474
1475 if (!__sk_filter_charge(sk, fp)) {
1476 kfree(fp);
1477 return -ENOMEM;
1478 }
1479 refcount_set(&fp->refcnt, 1);
1480
1481 old_fp = rcu_dereference_protected(sk->sk_filter,
1482 lockdep_sock_is_held(sk));
1483 rcu_assign_pointer(sk->sk_filter, fp);
1484
1485 if (old_fp)
1486 sk_filter_uncharge(sk, old_fp);
1487
1488 return 0;
1489 }
1490
1491 static
1492 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
1493 {
1494 unsigned int fsize = bpf_classic_proglen(fprog);
1495 struct bpf_prog *prog;
1496 int err;
1497
1498 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1499 return ERR_PTR(-EPERM);
1500
1501 /* Make sure new filter is there and in the right amounts. */
1502 if (!bpf_check_basics_ok(fprog->filter, fprog->len))
1503 return ERR_PTR(-EINVAL);
1504
1505 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
1506 if (!prog)
1507 return ERR_PTR(-ENOMEM);
1508
1509 if (copy_from_user(prog->insns, fprog->filter, fsize)) {
1510 __bpf_prog_free(prog);
1511 return ERR_PTR(-EFAULT);
1512 }
1513
1514 prog->len = fprog->len;
1515
1516 err = bpf_prog_store_orig_filter(prog, fprog);
1517 if (err) {
1518 __bpf_prog_free(prog);
1519 return ERR_PTR(-ENOMEM);
1520 }
1521
1522 /* bpf_prepare_filter() already takes care of freeing
1523 * memory in case something goes wrong.
1524 */
1525 return bpf_prepare_filter(prog, NULL);
1526 }
1527
1528 /**
1529 * sk_attach_filter - attach a socket filter
1530 * @fprog: the filter program
1531 * @sk: the socket to use
1532 *
1533 * Attach the user's filter code. We first run some sanity checks on
1534 * it to make sure it does not explode on us later. If an error
1535 * occurs or there is insufficient memory for the filter a negative
1536 * errno code is returned. On success the return is zero.
1537 */
1538 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1539 {
1540 struct bpf_prog *prog = __get_filter(fprog, sk);
1541 int err;
1542
1543 if (IS_ERR(prog))
1544 return PTR_ERR(prog);
1545
1546 err = __sk_attach_prog(prog, sk);
1547 if (err < 0) {
1548 __bpf_prog_release(prog);
1549 return err;
1550 }
1551
1552 return 0;
1553 }
1554 EXPORT_SYMBOL_GPL(sk_attach_filter);
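/* User space reaches sk_attach_filter() through setsockopt(). A minimal
 * sketch (illustrative only, not part of this file):
 *
 *	struct sock_filter code[] = {
 *		{ 0x06, 0, 0, 0xffffffff },	// BPF_RET | BPF_K: accept all
 *	};
 *	struct sock_fprog bpf = {
 *		.len	= 1,
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
 */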
1555
1556 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
1557 {
1558 struct bpf_prog *prog = __get_filter(fprog, sk);
1559 int err;
1560
1561 if (IS_ERR(prog))
1562 return PTR_ERR(prog);
1563
1564 if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max))
1565 err = -ENOMEM;
1566 else
1567 err = reuseport_attach_prog(sk, prog);
1568
1569 if (err)
1570 __bpf_prog_release(prog);
1571
1572 return err;
1573 }
1574
1575 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
1576 {
1577 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1578 return ERR_PTR(-EPERM);
1579
1580 return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1581 }
1582
1583 int sk_attach_bpf(u32 ufd, struct sock *sk)
1584 {
1585 struct bpf_prog *prog = __get_bpf(ufd, sk);
1586 int err;
1587
1588 if (IS_ERR(prog))
1589 return PTR_ERR(prog);
1590
1591 err = __sk_attach_prog(prog, sk);
1592 if (err < 0) {
1593 bpf_prog_put(prog);
1594 return err;
1595 }
1596
1597 return 0;
1598 }
1599
1600 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
1601 {
1602 struct bpf_prog *prog;
1603 int err;
1604
1605 if (sock_flag(sk, SOCK_FILTER_LOCKED))
1606 return -EPERM;
1607
1608 prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
1609 if (PTR_ERR(prog) == -EINVAL)
1610 prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
1611 if (IS_ERR(prog))
1612 return PTR_ERR(prog);
1613
1614 if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
1615 /* Like other non-BPF_PROG_TYPE_SOCKET_FILTER
1616 * bpf progs (e.g. sockmap), it relies on the
1617 * limit imposed by bpf_prog_load().
1618 * Hence, sysctl_optmem_max is not checked.
1619 */
1620 if ((sk->sk_type != SOCK_STREAM &&
1621 sk->sk_type != SOCK_DGRAM) ||
1622 (sk->sk_protocol != IPPROTO_UDP &&
1623 sk->sk_protocol != IPPROTO_TCP) ||
1624 (sk->sk_family != AF_INET &&
1625 sk->sk_family != AF_INET6)) {
1626 err = -ENOTSUPP;
1627 goto err_prog_put;
1628 }
1629 } else {
1630 /* BPF_PROG_TYPE_SOCKET_FILTER */
1631 if (bpf_prog_size(prog->len) > READ_ONCE(sysctl_optmem_max)) {
1632 err = -ENOMEM;
1633 goto err_prog_put;
1634 }
1635 }
1636
1637 err = reuseport_attach_prog(sk, prog);
1638 err_prog_put:
1639 if (err)
1640 bpf_prog_put(prog);
1641
1642 return err;
1643 }
1644
1645 void sk_reuseport_prog_free(struct bpf_prog *prog)
1646 {
1647 if (!prog)
1648 return;
1649
1650 if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
1651 bpf_prog_put(prog);
1652 else
1653 bpf_prog_destroy(prog);
1654 }
1655
1656 struct bpf_scratchpad {
1657 union {
1658 __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
1659 u8 buff[MAX_BPF_STACK];
1660 };
1661 };
1662
1663 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
1664
1665 static inline int __bpf_try_make_writable(struct sk_buff *skb,
1666 unsigned int write_len)
1667 {
1668 #ifdef CONFIG_DEBUG_NET
1669 /* Avoid a splat in pskb_may_pull_reason() */
1670 if (write_len > INT_MAX)
1671 return -EINVAL;
1672 #endif
1673 return skb_ensure_writable(skb, write_len);
1674 }
1675
1676 static inline int bpf_try_make_writable(struct sk_buff *skb,
1677 unsigned int write_len)
1678 {
1679 int err = __bpf_try_make_writable(skb, write_len);
1680
1681 bpf_compute_data_pointers(skb);
1682 return err;
1683 }
1684
1685 static int bpf_try_make_head_writable(struct sk_buff *skb)
1686 {
1687 return bpf_try_make_writable(skb, skb_headlen(skb));
1688 }
1689
1690 static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
1691 {
1692 if (skb_at_tc_ingress(skb))
1693 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1694 }
1695
1696 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
1697 {
1698 if (skb_at_tc_ingress(skb))
1699 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
1700 }
1701
1702 BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
1703 const void *, from, u32, len, u64, flags)
1704 {
1705 void *ptr;
1706
1707 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
1708 return -EINVAL;
1709 if (unlikely(offset > INT_MAX))
1710 return -EFAULT;
1711 if (unlikely(bpf_try_make_writable(skb, offset + len)))
1712 return -EFAULT;
1713
1714 ptr = skb->data + offset;
1715 if (flags & BPF_F_RECOMPUTE_CSUM)
1716 __skb_postpull_rcsum(skb, ptr, len, offset);
1717
1718 memcpy(ptr, from, len);
1719
1720 if (flags & BPF_F_RECOMPUTE_CSUM)
1721 __skb_postpush_rcsum(skb, ptr, len, offset);
1722 if (flags & BPF_F_INVALIDATE_HASH)
1723 skb_clear_hash(skb);
1724
1725 return 0;
1726 }
1727
1728 static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
1729 .func = bpf_skb_store_bytes,
1730 .gpl_only = false,
1731 .ret_type = RET_INTEGER,
1732 .arg1_type = ARG_PTR_TO_CTX,
1733 .arg2_type = ARG_ANYTHING,
1734 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
1735 .arg4_type = ARG_CONST_SIZE,
1736 .arg5_type = ARG_ANYTHING,
1737 };
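/* From the BPF program side this helper is typically used for packet
 * rewrites, e.g. replacing the destination MAC from a tc program
 * (illustrative sketch using the libbpf helper declaration):
 *
 *	__u8 new_dmac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
 *
 *	if (bpf_skb_store_bytes(skb, offsetof(struct ethhdr, h_dest),
 *				new_dmac, ETH_ALEN, 0))
 *		return TC_ACT_SHOT;
 */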
1738
1739 int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
1740 u32 len, u64 flags)
1741 {
1742 return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
1743 }
1744
1745 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
1746 void *, to, u32, len)
1747 {
1748 void *ptr;
1749
1750 if (unlikely(offset > INT_MAX))
1751 goto err_clear;
1752
1753 ptr = skb_header_pointer(skb, offset, len, to);
1754 if (unlikely(!ptr))
1755 goto err_clear;
1756 if (ptr != to)
1757 memcpy(to, ptr, len);
1758
1759 return 0;
1760 err_clear:
1761 memset(to, 0, len);
1762 return -EFAULT;
1763 }
1764
1765 static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
1766 .func = bpf_skb_load_bytes,
1767 .gpl_only = false,
1768 .ret_type = RET_INTEGER,
1769 .arg1_type = ARG_PTR_TO_CTX,
1770 .arg2_type = ARG_ANYTHING,
1771 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1772 .arg4_type = ARG_CONST_SIZE,
1773 };
1774
1775 int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
1776 {
1777 return ____bpf_skb_load_bytes(skb, offset, to, len);
1778 }
1779
1780 BPF_CALL_4(bpf_flow_dissector_load_bytes,
1781 const struct bpf_flow_dissector *, ctx, u32, offset,
1782 void *, to, u32, len)
1783 {
1784 void *ptr;
1785
1786 if (unlikely(offset > 0xffff))
1787 goto err_clear;
1788
1789 if (unlikely(!ctx->skb))
1790 goto err_clear;
1791
1792 ptr = skb_header_pointer(ctx->skb, offset, len, to);
1793 if (unlikely(!ptr))
1794 goto err_clear;
1795 if (ptr != to)
1796 memcpy(to, ptr, len);
1797
1798 return 0;
1799 err_clear:
1800 memset(to, 0, len);
1801 return -EFAULT;
1802 }
1803
1804 static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
1805 .func = bpf_flow_dissector_load_bytes,
1806 .gpl_only = false,
1807 .ret_type = RET_INTEGER,
1808 .arg1_type = ARG_PTR_TO_CTX,
1809 .arg2_type = ARG_ANYTHING,
1810 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1811 .arg4_type = ARG_CONST_SIZE,
1812 };
1813
1814 BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
1815 u32, offset, void *, to, u32, len, u32, start_header)
1816 {
1817 u8 *end = skb_tail_pointer(skb);
1818 u8 *start, *ptr;
1819
1820 if (unlikely(offset > 0xffff))
1821 goto err_clear;
1822
1823 switch (start_header) {
1824 case BPF_HDR_START_MAC:
1825 if (unlikely(!skb_mac_header_was_set(skb)))
1826 goto err_clear;
1827 start = skb_mac_header(skb);
1828 break;
1829 case BPF_HDR_START_NET:
1830 start = skb_network_header(skb);
1831 break;
1832 default:
1833 goto err_clear;
1834 }
1835
1836 ptr = start + offset;
1837
1838 if (likely(ptr + len <= end)) {
1839 memcpy(to, ptr, len);
1840 return 0;
1841 }
1842
1843 err_clear:
1844 memset(to, 0, len);
1845 return -EFAULT;
1846 }
1847
1848 static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
1849 .func = bpf_skb_load_bytes_relative,
1850 .gpl_only = false,
1851 .ret_type = RET_INTEGER,
1852 .arg1_type = ARG_PTR_TO_CTX,
1853 .arg2_type = ARG_ANYTHING,
1854 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
1855 .arg4_type = ARG_CONST_SIZE,
1856 .arg5_type = ARG_ANYTHING,
1857 };
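/* BPF program side sketch (illustrative only): read the IPv4 TTL
 * relative to the network header, regardless of how much headroom or
 * encapsulation precedes it:
 *
 *	__u8 ttl;
 *
 *	if (bpf_skb_load_bytes_relative(skb, offsetof(struct iphdr, ttl),
 *					&ttl, sizeof(ttl),
 *					BPF_HDR_START_NET))
 *		return TC_ACT_OK;
 */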
1858
1859 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
1860 {
1861 /* Idea is the following: should the needed direct read/write
1862 * test fail at runtime, we can pull in more data and redo the
1863 * access, since implicitly we invalidate previous checks here.
1864 *
1865 * Or, since we know how much we need to make readable/writable,
1866 * this can be done once at the beginning of the program for the
1867 * direct access case. By this we overcome the limitation of only
1868 * the current headroom being accessible.
1869 */
1870 return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
1871 }
1872
1873 static const struct bpf_func_proto bpf_skb_pull_data_proto = {
1874 .func = bpf_skb_pull_data,
1875 .gpl_only = false,
1876 .ret_type = RET_INTEGER,
1877 .arg1_type = ARG_PTR_TO_CTX,
1878 .arg2_type = ARG_ANYTHING,
1879 };
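/* Illustrative sketch (comment only, assumes a libbpf-built tc program):
 * pull the full linear header once up front, then re-derive the packet
 * pointers, which the pull invalidates as described above:
 *
 *	if (bpf_skb_pull_data(skb, 0))
 *		return TC_ACT_SHOT;
 *	data = (void *)(long)skb->data;
 *	data_end = (void *)(long)skb->data_end;
 *	if (data + sizeof(struct ethhdr) > data_end)
 *		return TC_ACT_SHOT;
 *
 * A len of 0 means "pull skb_headlen(skb)", matching the ?: above.
 */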
1880
1881 BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
1882 {
1883 return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
1884 }
1885
1886 static const struct bpf_func_proto bpf_sk_fullsock_proto = {
1887 .func = bpf_sk_fullsock,
1888 .gpl_only = false,
1889 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
1890 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
1891 };
1892
1893 static inline int sk_skb_try_make_writable(struct sk_buff *skb,
1894 unsigned int write_len)
1895 {
1896 return __bpf_try_make_writable(skb, write_len);
1897 }
1898
1899 BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
1900 {
1901 /* The idea is the following: should the needed direct read/write
1902 * test fail at runtime, we can pull in more data and redo the
1903 * test, since this implicitly invalidates the previous checks.
1904 *
1905 * Or, since we know how much we need to make readable/writable,
1906 * this can be done once at the program beginning for the direct
1907 * access case. By this we overcome the limitation of only the
1908 * current headroom being accessible.
1909 */
1910 return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
1911 }
1912
1913 static const struct bpf_func_proto sk_skb_pull_data_proto = {
1914 .func = sk_skb_pull_data,
1915 .gpl_only = false,
1916 .ret_type = RET_INTEGER,
1917 .arg1_type = ARG_PTR_TO_CTX,
1918 .arg2_type = ARG_ANYTHING,
1919 };
1920
1921 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
1922 u64, from, u64, to, u64, flags)
1923 {
1924 __sum16 *ptr;
1925
1926 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
1927 return -EINVAL;
1928 if (unlikely(offset > 0xffff || offset & 1))
1929 return -EFAULT;
1930 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1931 return -EFAULT;
1932
1933 ptr = (__sum16 *)(skb->data + offset);
1934 switch (flags & BPF_F_HDR_FIELD_MASK) {
1935 case 0:
1936 if (unlikely(from != 0))
1937 return -EINVAL;
1938
1939 csum_replace_by_diff(ptr, to);
1940 break;
1941 case 2:
1942 csum_replace2(ptr, from, to);
1943 break;
1944 case 4:
1945 csum_replace4(ptr, from, to);
1946 break;
1947 default:
1948 return -EINVAL;
1949 }
1950
1951 return 0;
1952 }
1953
1954 static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
1955 .func = bpf_l3_csum_replace,
1956 .gpl_only = false,
1957 .ret_type = RET_INTEGER,
1958 .arg1_type = ARG_PTR_TO_CTX,
1959 .arg2_type = ARG_ANYTHING,
1960 .arg3_type = ARG_ANYTHING,
1961 .arg4_type = ARG_ANYTHING,
1962 .arg5_type = ARG_ANYTHING,
1963 };
1964
1965 BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
1966 u64, from, u64, to, u64, flags)
1967 {
1968 bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
1969 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
1970 bool do_mforce = flags & BPF_F_MARK_ENFORCE;
1971 __sum16 *ptr;
1972
1973 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
1974 BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
1975 return -EINVAL;
1976 if (unlikely(offset > 0xffff || offset & 1))
1977 return -EFAULT;
1978 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
1979 return -EFAULT;
1980
1981 ptr = (__sum16 *)(skb->data + offset);
1982 if (is_mmzero && !do_mforce && !*ptr)
1983 return 0;
1984
1985 switch (flags & BPF_F_HDR_FIELD_MASK) {
1986 case 0:
1987 if (unlikely(from != 0))
1988 return -EINVAL;
1989
1990 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
1991 break;
1992 case 2:
1993 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
1994 break;
1995 case 4:
1996 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
1997 break;
1998 default:
1999 return -EINVAL;
2000 }
2001
2002 if (is_mmzero && !*ptr)
2003 *ptr = CSUM_MANGLED_0;
2004 return 0;
2005 }
2006
2007 static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
2008 .func = bpf_l4_csum_replace,
2009 .gpl_only = false,
2010 .ret_type = RET_INTEGER,
2011 .arg1_type = ARG_PTR_TO_CTX,
2012 .arg2_type = ARG_ANYTHING,
2013 .arg3_type = ARG_ANYTHING,
2014 .arg4_type = ARG_ANYTHING,
2015 .arg5_type = ARG_ANYTHING,
2016 };
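/* Illustrative sketch (comment only): a NAT-style rewrite of the IPv4
 * destination address pairs bpf_skb_store_bytes() with both checksum
 * helpers above. The *_off variables and old_ip/new_ip are placeholders:
 *
 *	bpf_l4_csum_replace(skb, tcp_csum_off, old_ip, new_ip,
 *			    BPF_F_PSEUDO_HDR | sizeof(new_ip));
 *	bpf_l3_csum_replace(skb, ip_csum_off, old_ip, new_ip, sizeof(new_ip));
 *	bpf_skb_store_bytes(skb, ip_daddr_off, &new_ip, sizeof(new_ip), 0);
 *
 * The low flag bits select the field width (0, 2 or 4 bytes), which is
 * exactly what the BPF_F_HDR_FIELD_MASK switches above dispatch on.
 */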
2017
2018 BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
2019 __be32 *, to, u32, to_size, __wsum, seed)
2020 {
2021 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
2022 u32 diff_size = from_size + to_size;
2023 int i, j = 0;
2024
2025 /* This is quite flexible, some examples:
2026 *
2027 * from_size == 0, to_size > 0, seed := csum --> pushing data
2028 * from_size > 0, to_size == 0, seed := csum --> pulling data
2029 * from_size > 0, to_size > 0, seed := 0 --> diffing data
2030 *
2031 * Even for diffing, from_size and to_size don't need to be equal.
2032 */
2033 if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
2034 diff_size > sizeof(sp->diff)))
2035 return -EINVAL;
2036
2037 for (i = 0; i < from_size / sizeof(__be32); i++, j++)
2038 sp->diff[j] = ~from[i];
2039 for (i = 0; i < to_size / sizeof(__be32); i++, j++)
2040 sp->diff[j] = to[i];
2041
2042 return csum_partial(sp->diff, diff_size, seed);
2043 }
2044
2045 static const struct bpf_func_proto bpf_csum_diff_proto = {
2046 .func = bpf_csum_diff,
2047 .gpl_only = false,
2048 .pkt_access = true,
2049 .ret_type = RET_INTEGER,
2050 .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2051 .arg2_type = ARG_CONST_SIZE_OR_ZERO,
2052 .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2053 .arg4_type = ARG_CONST_SIZE_OR_ZERO,
2054 .arg5_type = ARG_ANYTHING,
2055 };
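/* Illustrative sketch (comment only): the common "diffing" mode pairs with
 * bpf_l4_csum_replace() in its diff form (field width 0). old_word,
 * new_word and csum_off are placeholders:
 *
 *	__wsum diff = bpf_csum_diff(&old_word, 4, &new_word, 4, 0);
 *
 *	bpf_l4_csum_replace(skb, csum_off, 0, diff, 0);
 *
 * Passing from_size == 0 or to_size == 0 with seed set to a running
 * checksum gives the pushing/pulling modes listed in the comment above.
 */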
2056
2057 BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
2058 {
2059 /* The interface is to be used in combination with bpf_csum_diff()
2060 * for direct packet writes. csum rotation for alignment as well
2061 * as emulating csum_sub() can be done from the eBPF program.
2062 */
2063 if (skb->ip_summed == CHECKSUM_COMPLETE)
2064 return (skb->csum = csum_add(skb->csum, csum));
2065
2066 return -ENOTSUPP;
2067 }
2068
2069 static const struct bpf_func_proto bpf_csum_update_proto = {
2070 .func = bpf_csum_update,
2071 .gpl_only = false,
2072 .ret_type = RET_INTEGER,
2073 .arg1_type = ARG_PTR_TO_CTX,
2074 .arg2_type = ARG_ANYTHING,
2075 };
2076
2077 BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
2078 {
2079 /* The interface is to be used in combination with bpf_skb_adjust_room()
2080 * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
2081 * is passed as flags, for example.
2082 */
2083 switch (level) {
2084 case BPF_CSUM_LEVEL_INC:
2085 __skb_incr_checksum_unnecessary(skb);
2086 break;
2087 case BPF_CSUM_LEVEL_DEC:
2088 __skb_decr_checksum_unnecessary(skb);
2089 break;
2090 case BPF_CSUM_LEVEL_RESET:
2091 __skb_reset_checksum_unnecessary(skb);
2092 break;
2093 case BPF_CSUM_LEVEL_QUERY:
2094 return skb->ip_summed == CHECKSUM_UNNECESSARY ?
2095 skb->csum_level : -EACCES;
2096 default:
2097 return -EINVAL;
2098 }
2099
2100 return 0;
2101 }
2102
2103 static const struct bpf_func_proto bpf_csum_level_proto = {
2104 .func = bpf_csum_level,
2105 .gpl_only = false,
2106 .ret_type = RET_INTEGER,
2107 .arg1_type = ARG_PTR_TO_CTX,
2108 .arg2_type = ARG_ANYTHING,
2109 };
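/* Illustrative sketch (comment only): after decapsulating an outer tunnel
 * header with BPF_F_ADJ_ROOM_NO_CSUM_RESET, a program can keep
 * CHECKSUM_UNNECESSARY consistent by dropping one csum level:
 *
 *	bpf_skb_adjust_room(skb, -(s32)outer_len, BPF_ADJ_ROOM_MAC,
 *			    BPF_F_ADJ_ROOM_NO_CSUM_RESET);
 *	bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC);
 *
 * outer_len is a placeholder for the size of the stripped headers.
 */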
2110
2111 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
2112 {
2113 return dev_forward_skb_nomtu(dev, skb);
2114 }
2115
2116 static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
2117 struct sk_buff *skb)
2118 {
2119 int ret = ____dev_forward_skb(dev, skb, false);
2120
2121 if (likely(!ret)) {
2122 skb->dev = dev;
2123 ret = netif_rx(skb);
2124 }
2125
2126 return ret;
2127 }
2128
2129 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
2130 {
2131 int ret;
2132
2133 if (dev_xmit_recursion()) {
2134 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2135 kfree_skb(skb);
2136 return -ENETDOWN;
2137 }
2138
2139 skb->dev = dev;
2140 skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
2141 skb_clear_tstamp(skb);
2142
2143 dev_xmit_recursion_inc();
2144 ret = dev_queue_xmit(skb);
2145 dev_xmit_recursion_dec();
2146
2147 return ret;
2148 }
2149
2150 static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
2151 u32 flags)
2152 {
2153 unsigned int mlen = skb_network_offset(skb);
2154
2155 if (unlikely(skb->len <= mlen)) {
2156 kfree_skb(skb);
2157 return -ERANGE;
2158 }
2159
2160 if (mlen) {
2161 __skb_pull(skb, mlen);
2162
2163 /* At ingress, the mac header has already been pulled once.
2164 * At egress, skb_postpull_rcsum() has to be done in case the
2165 * skb originated from ingress (i.e. a forwarded skb) to
2166 * ensure that rcsum starts at the net header.
2167 */
2168 if (!skb_at_tc_ingress(skb))
2169 skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
2170 }
2171 skb_pop_mac_header(skb);
2172 skb_reset_mac_len(skb);
2173 return flags & BPF_F_INGRESS ?
2174 __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
2175 }
2176
2177 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
2178 u32 flags)
2179 {
2180 /* Verify that a link layer header is carried */
2181 if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) {
2182 kfree_skb(skb);
2183 return -ERANGE;
2184 }
2185
2186 bpf_push_mac_rcsum(skb);
2187 return flags & BPF_F_INGRESS ?
2188 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
2189 }
2190
2191 static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
2192 u32 flags)
2193 {
2194 if (dev_is_mac_header_xmit(dev))
2195 return __bpf_redirect_common(skb, dev, flags);
2196 else
2197 return __bpf_redirect_no_mac(skb, dev, flags);
2198 }
2199
2200 #if IS_ENABLED(CONFIG_IPV6)
2201 static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
2202 struct net_device *dev, struct bpf_nh_params *nh)
2203 {
2204 u32 hh_len = LL_RESERVED_SPACE(dev);
2205 const struct in6_addr *nexthop;
2206 struct dst_entry *dst = NULL;
2207 struct neighbour *neigh;
2208
2209 if (dev_xmit_recursion()) {
2210 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2211 goto out_drop;
2212 }
2213
2214 skb->dev = dev;
2215 skb_clear_tstamp(skb);
2216
2217 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
2218 skb = skb_expand_head(skb, hh_len);
2219 if (!skb)
2220 return -ENOMEM;
2221 }
2222
2223 rcu_read_lock();
2224 if (!nh) {
2225 dst = skb_dst(skb);
2226 nexthop = rt6_nexthop(dst_rt6_info(dst),
2227 &ipv6_hdr(skb)->daddr);
2228 } else {
2229 nexthop = &nh->ipv6_nh;
2230 }
2231 neigh = ip_neigh_gw6(dev, nexthop);
2232 if (likely(!IS_ERR(neigh))) {
2233 int ret;
2234
2235 sock_confirm_neigh(skb, neigh);
2236 local_bh_disable();
2237 dev_xmit_recursion_inc();
2238 ret = neigh_output(neigh, skb, false);
2239 dev_xmit_recursion_dec();
2240 local_bh_enable();
2241 rcu_read_unlock();
2242 return ret;
2243 }
2244 rcu_read_unlock();
2245 if (dst)
2246 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
2247 out_drop:
2248 kfree_skb(skb);
2249 return -ENETDOWN;
2250 }
2251
2252 static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
2253 struct bpf_nh_params *nh)
2254 {
2255 const struct ipv6hdr *ip6h = ipv6_hdr(skb);
2256 struct net *net = dev_net(dev);
2257 int err, ret = NET_XMIT_DROP;
2258
2259 if (!nh) {
2260 struct dst_entry *dst;
2261 struct flowi6 fl6 = {
2262 .flowi6_flags = FLOWI_FLAG_ANYSRC,
2263 .flowi6_mark = skb->mark,
2264 .flowlabel = ip6_flowinfo(ip6h),
2265 .flowi6_oif = dev->ifindex,
2266 .flowi6_proto = ip6h->nexthdr,
2267 .daddr = ip6h->daddr,
2268 .saddr = ip6h->saddr,
2269 };
2270
2271 dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
2272 if (IS_ERR(dst))
2273 goto out_drop;
2274
2275 skb_dst_set(skb, dst);
2276 } else if (nh->nh_family != AF_INET6) {
2277 goto out_drop;
2278 }
2279
2280 err = bpf_out_neigh_v6(net, skb, dev, nh);
2281 if (unlikely(net_xmit_eval(err)))
2282 DEV_STATS_INC(dev, tx_errors);
2283 else
2284 ret = NET_XMIT_SUCCESS;
2285 goto out_xmit;
2286 out_drop:
2287 DEV_STATS_INC(dev, tx_errors);
2288 kfree_skb(skb);
2289 out_xmit:
2290 return ret;
2291 }
2292 #else
2293 static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
2294 struct bpf_nh_params *nh)
2295 {
2296 kfree_skb(skb);
2297 return NET_XMIT_DROP;
2298 }
2299 #endif /* CONFIG_IPV6 */
2300
2301 #if IS_ENABLED(CONFIG_INET)
2302 static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
2303 struct net_device *dev, struct bpf_nh_params *nh)
2304 {
2305 u32 hh_len = LL_RESERVED_SPACE(dev);
2306 struct neighbour *neigh;
2307 bool is_v6gw = false;
2308
2309 if (dev_xmit_recursion()) {
2310 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
2311 goto out_drop;
2312 }
2313
2314 skb->dev = dev;
2315 skb_clear_tstamp(skb);
2316
2317 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
2318 skb = skb_expand_head(skb, hh_len);
2319 if (!skb)
2320 return -ENOMEM;
2321 }
2322
2323 rcu_read_lock();
2324 if (!nh) {
2325 struct dst_entry *dst = skb_dst(skb);
2326 struct rtable *rt = container_of(dst, struct rtable, dst);
2327
2328 neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
2329 } else if (nh->nh_family == AF_INET6) {
2330 neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
2331 is_v6gw = true;
2332 } else if (nh->nh_family == AF_INET) {
2333 neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
2334 } else {
2335 rcu_read_unlock();
2336 goto out_drop;
2337 }
2338
2339 if (likely(!IS_ERR(neigh))) {
2340 int ret;
2341
2342 sock_confirm_neigh(skb, neigh);
2343 local_bh_disable();
2344 dev_xmit_recursion_inc();
2345 ret = neigh_output(neigh, skb, is_v6gw);
2346 dev_xmit_recursion_dec();
2347 local_bh_enable();
2348 rcu_read_unlock();
2349 return ret;
2350 }
2351 rcu_read_unlock();
2352 out_drop:
2353 kfree_skb(skb);
2354 return -ENETDOWN;
2355 }
2356
2357 static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
2358 struct bpf_nh_params *nh)
2359 {
2360 const struct iphdr *ip4h = ip_hdr(skb);
2361 struct net *net = dev_net(dev);
2362 int err, ret = NET_XMIT_DROP;
2363
2364 if (!nh) {
2365 struct flowi4 fl4 = {
2366 .flowi4_flags = FLOWI_FLAG_ANYSRC,
2367 .flowi4_mark = skb->mark,
2368 .flowi4_tos = RT_TOS(ip4h->tos),
2369 .flowi4_oif = dev->ifindex,
2370 .flowi4_proto = ip4h->protocol,
2371 .daddr = ip4h->daddr,
2372 .saddr = ip4h->saddr,
2373 };
2374 struct rtable *rt;
2375
2376 rt = ip_route_output_flow(net, &fl4, NULL);
2377 if (IS_ERR(rt))
2378 goto out_drop;
2379 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
2380 ip_rt_put(rt);
2381 goto out_drop;
2382 }
2383
2384 skb_dst_set(skb, &rt->dst);
2385 }
2386
2387 err = bpf_out_neigh_v4(net, skb, dev, nh);
2388 if (unlikely(net_xmit_eval(err)))
2389 DEV_STATS_INC(dev, tx_errors);
2390 else
2391 ret = NET_XMIT_SUCCESS;
2392 goto out_xmit;
2393 out_drop:
2394 DEV_STATS_INC(dev, tx_errors);
2395 kfree_skb(skb);
2396 out_xmit:
2397 return ret;
2398 }
2399 #else
2400 static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
2401 struct bpf_nh_params *nh)
2402 {
2403 kfree_skb(skb);
2404 return NET_XMIT_DROP;
2405 }
2406 #endif /* CONFIG_INET */
2407
2408 static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
2409 struct bpf_nh_params *nh)
2410 {
2411 struct ethhdr *ethh = eth_hdr(skb);
2412
2413 if (unlikely(skb->mac_header >= skb->network_header))
2414 goto out;
2415 bpf_push_mac_rcsum(skb);
2416 if (is_multicast_ether_addr(ethh->h_dest))
2417 goto out;
2418
2419 skb_pull(skb, sizeof(*ethh));
2420 skb_unset_mac_header(skb);
2421 skb_reset_network_header(skb);
2422
2423 if (skb->protocol == htons(ETH_P_IP))
2424 return __bpf_redirect_neigh_v4(skb, dev, nh);
2425 else if (skb->protocol == htons(ETH_P_IPV6))
2426 return __bpf_redirect_neigh_v6(skb, dev, nh);
2427 out:
2428 kfree_skb(skb);
2429 return -ENOTSUPP;
2430 }
2431
2432 /* Internal, non-exposed redirect flags. */
2433 enum {
2434 BPF_F_NEIGH = (1ULL << 16),
2435 BPF_F_PEER = (1ULL << 17),
2436 BPF_F_NEXTHOP = (1ULL << 18),
2437 #define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
2438 };
2439
2440 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
2441 {
2442 struct net_device *dev;
2443 struct sk_buff *clone;
2444 int ret;
2445
2446 BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);
2447
2448 if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
2449 return -EINVAL;
2450
2451 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
2452 if (unlikely(!dev))
2453 return -EINVAL;
2454
2455 clone = skb_clone(skb, GFP_ATOMIC);
2456 if (unlikely(!clone))
2457 return -ENOMEM;
2458
2459 /* For direct write, we need to keep the invariant that the skbs
2460 * we're dealing with are uncloned. Should uncloning fail here,
2461 * we need to free the just generated clone so that the skb is
2462 * uncloned once again.
2463 */
2464 ret = bpf_try_make_head_writable(skb);
2465 if (unlikely(ret)) {
2466 kfree_skb(clone);
2467 return -ENOMEM;
2468 }
2469
2470 return __bpf_redirect(clone, dev, flags);
2471 }
2472
2473 static const struct bpf_func_proto bpf_clone_redirect_proto = {
2474 .func = bpf_clone_redirect,
2475 .gpl_only = false,
2476 .ret_type = RET_INTEGER,
2477 .arg1_type = ARG_PTR_TO_CTX,
2478 .arg2_type = ARG_ANYTHING,
2479 .arg3_type = ARG_ANYTHING,
2480 };
2481
2482 DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
2483 EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
2484
2485 int skb_do_redirect(struct sk_buff *skb)
2486 {
2487 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2488 struct net *net = dev_net(skb->dev);
2489 struct net_device *dev;
2490 u32 flags = ri->flags;
2491
2492 dev = dev_get_by_index_rcu(net, ri->tgt_index);
2493 ri->tgt_index = 0;
2494 ri->flags = 0;
2495 if (unlikely(!dev))
2496 goto out_drop;
2497 if (flags & BPF_F_PEER) {
2498 const struct net_device_ops *ops = dev->netdev_ops;
2499
2500 if (unlikely(!ops->ndo_get_peer_dev ||
2501 !skb_at_tc_ingress(skb)))
2502 goto out_drop;
2503 dev = ops->ndo_get_peer_dev(dev);
2504 if (unlikely(!dev ||
2505 !(dev->flags & IFF_UP) ||
2506 net_eq(net, dev_net(dev))))
2507 goto out_drop;
2508 skb->dev = dev;
2509 dev_sw_netstats_rx_add(dev, skb->len);
2510 skb_scrub_packet(skb, false);
2511 return -EAGAIN;
2512 }
2513 return flags & BPF_F_NEIGH ?
2514 __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
2515 &ri->nh : NULL) :
2516 __bpf_redirect(skb, dev, flags);
2517 out_drop:
2518 kfree_skb(skb);
2519 return -EINVAL;
2520 }
2521
2522 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
2523 {
2524 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2525
2526 if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
2527 return TC_ACT_SHOT;
2528
2529 ri->flags = flags;
2530 ri->tgt_index = ifindex;
2531
2532 return TC_ACT_REDIRECT;
2533 }
2534
2535 static const struct bpf_func_proto bpf_redirect_proto = {
2536 .func = bpf_redirect,
2537 .gpl_only = false,
2538 .ret_type = RET_INTEGER,
2539 .arg1_type = ARG_ANYTHING,
2540 .arg2_type = ARG_ANYTHING,
2541 };
2542
2543 BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
2544 {
2545 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2546
2547 if (unlikely(flags))
2548 return TC_ACT_SHOT;
2549
2550 ri->flags = BPF_F_PEER;
2551 ri->tgt_index = ifindex;
2552
2553 return TC_ACT_REDIRECT;
2554 }
2555
2556 static const struct bpf_func_proto bpf_redirect_peer_proto = {
2557 .func = bpf_redirect_peer,
2558 .gpl_only = false,
2559 .ret_type = RET_INTEGER,
2560 .arg1_type = ARG_ANYTHING,
2561 .arg2_type = ARG_ANYTHING,
2562 };
2563
2564 BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
2565 int, plen, u64, flags)
2566 {
2567 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
2568
2569 if (unlikely((plen && plen < sizeof(*params)) || flags))
2570 return TC_ACT_SHOT;
2571
2572 ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
2573 ri->tgt_index = ifindex;
2574
2575 BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
2576 if (plen)
2577 memcpy(&ri->nh, params, sizeof(ri->nh));
2578
2579 return TC_ACT_REDIRECT;
2580 }
2581
2582 static const struct bpf_func_proto bpf_redirect_neigh_proto = {
2583 .func = bpf_redirect_neigh,
2584 .gpl_only = false,
2585 .ret_type = RET_INTEGER,
2586 .arg1_type = ARG_ANYTHING,
2587 .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
2588 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
2589 .arg4_type = ARG_ANYTHING,
2590 };
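/* Illustrative sketch (comment only): from a tc program the redirect family
 * is used by returning the helper's verdict, e.g.:
 *
 *	return bpf_redirect_neigh(ifindex, NULL, 0, 0);
 *	// or: return bpf_redirect(ifindex, BPF_F_INGRESS);
 *	// or: return bpf_redirect_peer(ifindex, 0);
 *
 * All three only record state in the per-CPU bpf_redirect_info and return
 * TC_ACT_REDIRECT; the actual forwarding happens later in skb_do_redirect().
 */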
2591
2592 BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
2593 {
2594 msg->apply_bytes = bytes;
2595 return 0;
2596 }
2597
2598 static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
2599 .func = bpf_msg_apply_bytes,
2600 .gpl_only = false,
2601 .ret_type = RET_INTEGER,
2602 .arg1_type = ARG_PTR_TO_CTX,
2603 .arg2_type = ARG_ANYTHING,
2604 };
2605
2606 BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
2607 {
2608 msg->cork_bytes = bytes;
2609 return 0;
2610 }
2611
2612 static void sk_msg_reset_curr(struct sk_msg *msg)
2613 {
2614 if (!msg->sg.size) {
2615 msg->sg.curr = msg->sg.start;
2616 msg->sg.copybreak = 0;
2617 } else {
2618 u32 i = msg->sg.end;
2619
2620 sk_msg_iter_var_prev(i);
2621 msg->sg.curr = i;
2622 msg->sg.copybreak = msg->sg.data[i].length;
2623 }
2624 }
2625
2626 static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
2627 .func = bpf_msg_cork_bytes,
2628 .gpl_only = false,
2629 .ret_type = RET_INTEGER,
2630 .arg1_type = ARG_PTR_TO_CTX,
2631 .arg2_type = ARG_ANYTHING,
2632 };
2633
2634 BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
2635 u32, end, u64, flags)
2636 {
2637 u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
2638 u32 first_sge, last_sge, i, shift, bytes_sg_total;
2639 struct scatterlist *sge;
2640 u8 *raw, *to, *from;
2641 struct page *page;
2642
2643 if (unlikely(flags || end <= start))
2644 return -EINVAL;
2645
2646 /* First find the starting scatterlist element */
2647 i = msg->sg.start;
2648 do {
2649 offset += len;
2650 len = sk_msg_elem(msg, i)->length;
2651 if (start < offset + len)
2652 break;
2653 sk_msg_iter_var_next(i);
2654 } while (i != msg->sg.end);
2655
2656 if (unlikely(start >= offset + len))
2657 return -EINVAL;
2658
2659 first_sge = i;
2660 /* The start may point into the sg element so we need to also
2661 * account for the headroom.
2662 */
2663 bytes_sg_total = start - offset + bytes;
2664 if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
2665 goto out;
2666
2667 /* At this point we need to linearize multiple scatterlist
2668 * elements or a single shared page. Either way we need to
2669 * copy into a linear buffer exclusively owned by BPF. Then
2670 * place the buffer in the scatterlist and fixup the original
2671 * entries by removing the entries now in the linear buffer
2672 * and shifting the remaining entries. For now we do not try
2673 * to copy partial entries to avoid complexity of running out
2674 * of sg_entry slots. The downside is reading a single byte
2675 * will copy the entire sg entry.
2676 */
2677 do {
2678 copy += sk_msg_elem(msg, i)->length;
2679 sk_msg_iter_var_next(i);
2680 if (bytes_sg_total <= copy)
2681 break;
2682 } while (i != msg->sg.end);
2683 last_sge = i;
2684
2685 if (unlikely(bytes_sg_total > copy))
2686 return -EINVAL;
2687
2688 page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2689 get_order(copy));
2690 if (unlikely(!page))
2691 return -ENOMEM;
2692
2693 raw = page_address(page);
2694 i = first_sge;
2695 do {
2696 sge = sk_msg_elem(msg, i);
2697 from = sg_virt(sge);
2698 len = sge->length;
2699 to = raw + poffset;
2700
2701 memcpy(to, from, len);
2702 poffset += len;
2703 sge->length = 0;
2704 put_page(sg_page(sge));
2705
2706 sk_msg_iter_var_next(i);
2707 } while (i != last_sge);
2708
2709 sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
2710
2711 /* To repair sg ring we need to shift entries. If we only
2712 * had a single entry though we can just replace it and
2713 * be done. Otherwise walk the ring and shift the entries.
2714 */
2715 WARN_ON_ONCE(last_sge == first_sge);
2716 shift = last_sge > first_sge ?
2717 last_sge - first_sge - 1 :
2718 NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
2719 if (!shift)
2720 goto out;
2721
2722 i = first_sge;
2723 sk_msg_iter_var_next(i);
2724 do {
2725 u32 move_from;
2726
2727 if (i + shift >= NR_MSG_FRAG_IDS)
2728 move_from = i + shift - NR_MSG_FRAG_IDS;
2729 else
2730 move_from = i + shift;
2731 if (move_from == msg->sg.end)
2732 break;
2733
2734 msg->sg.data[i] = msg->sg.data[move_from];
2735 msg->sg.data[move_from].length = 0;
2736 msg->sg.data[move_from].page_link = 0;
2737 msg->sg.data[move_from].offset = 0;
2738 sk_msg_iter_var_next(i);
2739 } while (1);
2740
2741 msg->sg.end = msg->sg.end - shift > msg->sg.end ?
2742 msg->sg.end - shift + NR_MSG_FRAG_IDS :
2743 msg->sg.end - shift;
2744 out:
2745 sk_msg_reset_curr(msg);
2746 msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
2747 msg->data_end = msg->data + bytes;
2748 return 0;
2749 }
2750
2751 static const struct bpf_func_proto bpf_msg_pull_data_proto = {
2752 .func = bpf_msg_pull_data,
2753 .gpl_only = false,
2754 .ret_type = RET_INTEGER,
2755 .arg1_type = ARG_PTR_TO_CTX,
2756 .arg2_type = ARG_ANYTHING,
2757 .arg3_type = ARG_ANYTHING,
2758 .arg4_type = ARG_ANYTHING,
2759 };
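/* Illustrative sketch (comment only): an SK_MSG program that parses
 * application data first makes the byte range linear and readable.
 * struct my_hdr is a placeholder for the expected application header:
 *
 *	if (bpf_msg_pull_data(msg, 0, sizeof(struct my_hdr), 0))
 *		return SK_DROP;
 *	if ((void *)msg->data + sizeof(struct my_hdr) > (void *)msg->data_end)
 *		return SK_DROP;
 *
 * data/data_end must be re-validated after every pull, since the helper
 * recomputes them at the end of the function above.
 */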
2760
2761 BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
2762 u32, len, u64, flags)
2763 {
2764 struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
2765 u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
2766 u8 *raw, *to, *from;
2767 struct page *page;
2768
2769 if (unlikely(flags))
2770 return -EINVAL;
2771
2772 if (unlikely(len == 0))
2773 return 0;
2774
2775 /* First find the starting scatterlist element */
2776 i = msg->sg.start;
2777 do {
2778 offset += l;
2779 l = sk_msg_elem(msg, i)->length;
2780
2781 if (start < offset + l)
2782 break;
2783 sk_msg_iter_var_next(i);
2784 } while (i != msg->sg.end);
2785
2786 if (start > offset + l)
2787 return -EINVAL;
2788
2789 space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2790
2791 /* If no space is available we will fall back to a copy. We need
2792 * at least one scatterlist elem available to push data into
2793 * when start aligns to the beginning of an element, or two
2794 * when it falls inside an element. We handle the start equals
2795 * offset case because it's the common case for inserting a
2796 * header.
2797 */
2798 if (!space || (space == 1 && start != offset))
2799 copy = msg->sg.data[i].length;
2800
2801 page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
2802 get_order(copy + len));
2803 if (unlikely(!page))
2804 return -ENOMEM;
2805
2806 if (copy) {
2807 int front, back;
2808
2809 raw = page_address(page);
2810
2811 if (i == msg->sg.end)
2812 sk_msg_iter_var_prev(i);
2813 psge = sk_msg_elem(msg, i);
2814 front = start - offset;
2815 back = psge->length - front;
2816 from = sg_virt(psge);
2817
2818 if (front)
2819 memcpy(raw, from, front);
2820
2821 if (back) {
2822 from += front;
2823 to = raw + front + len;
2824
2825 memcpy(to, from, back);
2826 }
2827
2828 put_page(sg_page(psge));
2829 new = i;
2830 goto place_new;
2831 }
2832
2833 if (start - offset) {
2834 if (i == msg->sg.end)
2835 sk_msg_iter_var_prev(i);
2836 psge = sk_msg_elem(msg, i);
2837 rsge = sk_msg_elem_cpy(msg, i);
2838
2839 psge->length = start - offset;
2840 rsge.length -= psge->length;
2841 rsge.offset += start;
2842
2843 sk_msg_iter_var_next(i);
2844 sg_unmark_end(psge);
2845 sg_unmark_end(&rsge);
2846 }
2847
2848 /* Slot(s) to place newly allocated data */
2849 sk_msg_iter_next(msg, end);
2850 new = i;
2851 sk_msg_iter_var_next(i);
2852
2853 if (i == msg->sg.end) {
2854 if (!rsge.length)
2855 goto place_new;
2856 sk_msg_iter_next(msg, end);
2857 goto place_new;
2858 }
2859
2860 /* Shift one or two slots as needed */
2861 sge = sk_msg_elem_cpy(msg, new);
2862 sg_unmark_end(&sge);
2863
2864 nsge = sk_msg_elem_cpy(msg, i);
2865 if (rsge.length) {
2866 sk_msg_iter_var_next(i);
2867 nnsge = sk_msg_elem_cpy(msg, i);
2868 sk_msg_iter_next(msg, end);
2869 }
2870
2871 while (i != msg->sg.end) {
2872 msg->sg.data[i] = sge;
2873 sge = nsge;
2874 sk_msg_iter_var_next(i);
2875 if (rsge.length) {
2876 nsge = nnsge;
2877 nnsge = sk_msg_elem_cpy(msg, i);
2878 } else {
2879 nsge = sk_msg_elem_cpy(msg, i);
2880 }
2881 }
2882
2883 place_new:
2884 /* Place newly allocated data buffer */
2885 sk_mem_charge(msg->sk, len);
2886 msg->sg.size += len;
2887 __clear_bit(new, msg->sg.copy);
2888 sg_set_page(&msg->sg.data[new], page, len + copy, 0);
2889 if (rsge.length) {
2890 get_page(sg_page(&rsge));
2891 sk_msg_iter_var_next(new);
2892 msg->sg.data[new] = rsge;
2893 }
2894
2895 sk_msg_reset_curr(msg);
2896 sk_msg_compute_data_pointers(msg);
2897 return 0;
2898 }
2899
2900 static const struct bpf_func_proto bpf_msg_push_data_proto = {
2901 .func = bpf_msg_push_data,
2902 .gpl_only = false,
2903 .ret_type = RET_INTEGER,
2904 .arg1_type = ARG_PTR_TO_CTX,
2905 .arg2_type = ARG_ANYTHING,
2906 .arg3_type = ARG_ANYTHING,
2907 .arg4_type = ARG_ANYTHING,
2908 };
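/* Illustrative sketch (comment only): inserting header room in front of the
 * payload before adding framing. struct my_frame is a placeholder:
 *
 *	if (bpf_msg_push_data(msg, 0, sizeof(struct my_frame), 0))
 *		return SK_DROP;
 *
 * The newly pushed region is not zeroed, so the program must write it
 * (after a data/data_end bounds check) before letting the msg proceed.
 */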
2909
2910 static void sk_msg_shift_left(struct sk_msg *msg, int i)
2911 {
2912 struct scatterlist *sge = sk_msg_elem(msg, i);
2913 int prev;
2914
2915 put_page(sg_page(sge));
2916 do {
2917 prev = i;
2918 sk_msg_iter_var_next(i);
2919 msg->sg.data[prev] = msg->sg.data[i];
2920 } while (i != msg->sg.end);
2921
2922 sk_msg_iter_prev(msg, end);
2923 }
2924
2925 static void sk_msg_shift_right(struct sk_msg *msg, int i)
2926 {
2927 struct scatterlist tmp, sge;
2928
2929 sk_msg_iter_next(msg, end);
2930 sge = sk_msg_elem_cpy(msg, i);
2931 sk_msg_iter_var_next(i);
2932 tmp = sk_msg_elem_cpy(msg, i);
2933
2934 while (i != msg->sg.end) {
2935 msg->sg.data[i] = sge;
2936 sk_msg_iter_var_next(i);
2937 sge = tmp;
2938 tmp = sk_msg_elem_cpy(msg, i);
2939 }
2940 }
2941
2942 BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
2943 u32, len, u64, flags)
2944 {
2945 u32 i = 0, l = 0, space, offset = 0;
2946 u64 last = start + len;
2947 int pop;
2948
2949 if (unlikely(flags))
2950 return -EINVAL;
2951
2952 if (unlikely(len == 0))
2953 return 0;
2954
2955 /* First find the starting scatterlist element */
2956 i = msg->sg.start;
2957 do {
2958 offset += l;
2959 l = sk_msg_elem(msg, i)->length;
2960
2961 if (start < offset + l)
2962 break;
2963 sk_msg_iter_var_next(i);
2964 } while (i != msg->sg.end);
2965
2966 /* Bounds checks: start and pop must be inside message */
2967 if (start >= offset + l || last > msg->sg.size)
2968 return -EINVAL;
2969
2970 space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
2971
2972 pop = len;
2973 /* --------------| offset
2974 * -| start |-------- len -------|
2975 *
2976 * |----- a ----|-------- pop -------|----- b ----|
2977 * |______________________________________________| length
2978 *
2979 *
2980 * a: region at front of scatter element to save
2981 * b: region at back of scatter element to save when length > a + pop
2982 * pop: region to pop from element, same as input 'pop' here will be
2983 * decremented below per iteration.
2984 *
2985 * Two top-level cases to handle when start != offset: first, b is
2986 * non-zero and second, b is zero, corresponding to a pop that
2987 * includes more than one element.
2988 *
2989 * Then, if b is non-zero AND there is no space, allocate space and
2990 * compact the a and b regions into a page. If there is space, shift
2991 * the ring to the right, freeing the next element in the ring to
2992 * place b, leaving a untouched except to reduce its length.
2993 */
2994 if (start != offset) {
2995 struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
2996 int a = start - offset;
2997 int b = sge->length - pop - a;
2998
2999 sk_msg_iter_var_next(i);
3000
3001 if (b > 0) {
3002 if (space) {
3003 sge->length = a;
3004 sk_msg_shift_right(msg, i);
3005 nsge = sk_msg_elem(msg, i);
3006 get_page(sg_page(sge));
3007 sg_set_page(nsge,
3008 sg_page(sge),
3009 b, sge->offset + pop + a);
3010 } else {
3011 struct page *page, *orig;
3012 u8 *to, *from;
3013
3014 page = alloc_pages(__GFP_NOWARN |
3015 __GFP_COMP | GFP_ATOMIC,
3016 get_order(a + b));
3017 if (unlikely(!page))
3018 return -ENOMEM;
3019
3020 orig = sg_page(sge);
3021 from = sg_virt(sge);
3022 to = page_address(page);
3023 memcpy(to, from, a);
3024 memcpy(to + a, from + a + pop, b);
3025 sg_set_page(sge, page, a + b, 0);
3026 put_page(orig);
3027 }
3028 pop = 0;
3029 } else {
3030 pop -= (sge->length - a);
3031 sge->length = a;
3032 }
3033 }
3034
3035 /* From above the current layout _must_ be as follows,
3036 *
3037 * -| offset
3038 * -| start
3039 *
3040 * |---- pop ---|---------------- b ------------|
3041 * |____________________________________________| length
3042 *
3043 * Offset and start of the current msg elem are equal because in the
3044 * previous case we handled offset != start and either consumed the
3045 * entire element and advanced to the next element OR pop == 0.
3046 *
3047 * Two cases to handle here are first pop is less than the length
3048 * leaving some remainder b above. Simply adjust the element's layout
3049 * in this case. Or pop >= length of the element so that b = 0. In this
3050 * case advance to next element decrementing pop.
3051 */
3052 while (pop) {
3053 struct scatterlist *sge = sk_msg_elem(msg, i);
3054
3055 if (pop < sge->length) {
3056 sge->length -= pop;
3057 sge->offset += pop;
3058 pop = 0;
3059 } else {
3060 pop -= sge->length;
3061 sk_msg_shift_left(msg, i);
3062 }
3063 }
3064
3065 sk_mem_uncharge(msg->sk, len - pop);
3066 msg->sg.size -= (len - pop);
3067 sk_msg_reset_curr(msg);
3068 sk_msg_compute_data_pointers(msg);
3069 return 0;
3070 }
3071
3072 static const struct bpf_func_proto bpf_msg_pop_data_proto = {
3073 .func = bpf_msg_pop_data,
3074 .gpl_only = false,
3075 .ret_type = RET_INTEGER,
3076 .arg1_type = ARG_PTR_TO_CTX,
3077 .arg2_type = ARG_ANYTHING,
3078 .arg3_type = ARG_ANYTHING,
3079 .arg4_type = ARG_ANYTHING,
3080 };
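/* Illustrative sketch (comment only): the inverse of the push helper above,
 * stripping a fixed-size frame header that the program has consumed.
 * struct my_frame is again a placeholder:
 *
 *	if (bpf_msg_pop_data(msg, 0, sizeof(struct my_frame), 0))
 *		return SK_DROP;
 *
 * start + len must stay within msg->sg.size, as the bounds check above
 * enforces.
 */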
3081
3082 #ifdef CONFIG_CGROUP_NET_CLASSID
3083 BPF_CALL_0(bpf_get_cgroup_classid_curr)
3084 {
3085 return __task_get_classid(current);
3086 }
3087
3088 const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
3089 .func = bpf_get_cgroup_classid_curr,
3090 .gpl_only = false,
3091 .ret_type = RET_INTEGER,
3092 };
3093
3094 BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
3095 {
3096 struct sock *sk = skb_to_full_sk(skb);
3097
3098 if (!sk || !sk_fullsock(sk))
3099 return 0;
3100
3101 return sock_cgroup_classid(&sk->sk_cgrp_data);
3102 }
3103
3104 static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
3105 .func = bpf_skb_cgroup_classid,
3106 .gpl_only = false,
3107 .ret_type = RET_INTEGER,
3108 .arg1_type = ARG_PTR_TO_CTX,
3109 };
3110 #endif
3111
3112 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
3113 {
3114 return task_get_classid(skb);
3115 }
3116
3117 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
3118 .func = bpf_get_cgroup_classid,
3119 .gpl_only = false,
3120 .ret_type = RET_INTEGER,
3121 .arg1_type = ARG_PTR_TO_CTX,
3122 };
3123
3124 BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
3125 {
3126 return dst_tclassid(skb);
3127 }
3128
3129 static const struct bpf_func_proto bpf_get_route_realm_proto = {
3130 .func = bpf_get_route_realm,
3131 .gpl_only = false,
3132 .ret_type = RET_INTEGER,
3133 .arg1_type = ARG_PTR_TO_CTX,
3134 };
3135
3136 BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
3137 {
3138 /* If skb_clear_hash() was called due to mangling, we can
3139 * trigger SW recalculation here. Later access to hash
3140 * can then use the inline skb->hash via context directly
3141 * instead of calling this helper again.
3142 */
3143 return skb_get_hash(skb);
3144 }
3145
3146 static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
3147 .func = bpf_get_hash_recalc,
3148 .gpl_only = false,
3149 .ret_type = RET_INTEGER,
3150 .arg1_type = ARG_PTR_TO_CTX,
3151 };
3152
3153 BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
3154 {
3155 /* After all direct packet writes, this can be used once for
3156 * triggering a lazy recalc on next skb_get_hash() invocation.
3157 */
3158 skb_clear_hash(skb);
3159 return 0;
3160 }
3161
3162 static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
3163 .func = bpf_set_hash_invalid,
3164 .gpl_only = false,
3165 .ret_type = RET_INTEGER,
3166 .arg1_type = ARG_PTR_TO_CTX,
3167 };
3168
3169 BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
3170 {
3171 /* Set user specified hash as L4(+), so that it gets returned
3172 * on skb_get_hash() call unless BPF prog later on triggers a
3173 * skb_clear_hash().
3174 */
3175 __skb_set_sw_hash(skb, hash, true);
3176 return 0;
3177 }
3178
3179 static const struct bpf_func_proto bpf_set_hash_proto = {
3180 .func = bpf_set_hash,
3181 .gpl_only = false,
3182 .ret_type = RET_INTEGER,
3183 .arg1_type = ARG_PTR_TO_CTX,
3184 .arg2_type = ARG_ANYTHING,
3185 };
3186
3187 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
3188 u16, vlan_tci)
3189 {
3190 int ret;
3191
3192 if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
3193 vlan_proto != htons(ETH_P_8021AD)))
3194 vlan_proto = htons(ETH_P_8021Q);
3195
3196 bpf_push_mac_rcsum(skb);
3197 ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
3198 bpf_pull_mac_rcsum(skb);
3199
3200 bpf_compute_data_pointers(skb);
3201 return ret;
3202 }
3203
3204 static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
3205 .func = bpf_skb_vlan_push,
3206 .gpl_only = false,
3207 .ret_type = RET_INTEGER,
3208 .arg1_type = ARG_PTR_TO_CTX,
3209 .arg2_type = ARG_ANYTHING,
3210 .arg3_type = ARG_ANYTHING,
3211 };
3212
3213 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
3214 {
3215 int ret;
3216
3217 bpf_push_mac_rcsum(skb);
3218 ret = skb_vlan_pop(skb);
3219 bpf_pull_mac_rcsum(skb);
3220
3221 bpf_compute_data_pointers(skb);
3222 return ret;
3223 }
3224
3225 static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
3226 .func = bpf_skb_vlan_pop,
3227 .gpl_only = false,
3228 .ret_type = RET_INTEGER,
3229 .arg1_type = ARG_PTR_TO_CTX,
3230 };
3231
3232 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
3233 {
3234 /* Caller already did skb_cow() with len as headroom,
3235 * so no need to do it here.
3236 */
3237 skb_push(skb, len);
3238 memmove(skb->data, skb->data + len, off);
3239 memset(skb->data + off, 0, len);
3240
3241 /* No skb_postpush_rcsum(skb, skb->data + off, len)
3242 * needed here as it does not change the skb->csum
3243 * result for checksum complete when summing over
3244 * zeroed blocks.
3245 */
3246 return 0;
3247 }
3248
3249 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
3250 {
3251 void *old_data;
3252
3253 /* skb_ensure_writable() is not needed here, as we're
3254 * already working on an uncloned skb.
3255 */
3256 if (unlikely(!pskb_may_pull(skb, off + len)))
3257 return -ENOMEM;
3258
3259 old_data = skb->data;
3260 __skb_pull(skb, len);
3261 skb_postpull_rcsum(skb, old_data + off, len);
3262 memmove(skb->data, old_data, off);
3263
3264 return 0;
3265 }
3266
3267 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
3268 {
3269 bool trans_same = skb->transport_header == skb->network_header;
3270 int ret;
3271
3272 /* There's no need for __skb_push()/__skb_pull() pair to
3273 * get to the start of the mac header as we're guaranteed
3274 * to always start from here under eBPF.
3275 */
3276 ret = bpf_skb_generic_push(skb, off, len);
3277 if (likely(!ret)) {
3278 skb->mac_header -= len;
3279 skb->network_header -= len;
3280 if (trans_same)
3281 skb->transport_header = skb->network_header;
3282 }
3283
3284 return ret;
3285 }
3286
3287 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
3288 {
3289 bool trans_same = skb->transport_header == skb->network_header;
3290 int ret;
3291
3292 /* Same here, __skb_push()/__skb_pull() pair not needed. */
3293 ret = bpf_skb_generic_pop(skb, off, len);
3294 if (likely(!ret)) {
3295 skb->mac_header += len;
3296 skb->network_header += len;
3297 if (trans_same)
3298 skb->transport_header = skb->network_header;
3299 }
3300
3301 return ret;
3302 }
3303
3304 static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
3305 {
3306 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
3307 u32 off = skb_mac_header_len(skb);
3308 int ret;
3309
3310 ret = skb_cow(skb, len_diff);
3311 if (unlikely(ret < 0))
3312 return ret;
3313
3314 ret = bpf_skb_net_hdr_push(skb, off, len_diff);
3315 if (unlikely(ret < 0))
3316 return ret;
3317
3318 if (skb_is_gso(skb)) {
3319 struct skb_shared_info *shinfo = skb_shinfo(skb);
3320
3321 /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
3322 if (shinfo->gso_type & SKB_GSO_TCPV4) {
3323 shinfo->gso_type &= ~SKB_GSO_TCPV4;
3324 shinfo->gso_type |= SKB_GSO_TCPV6;
3325 }
3326 }
3327
3328 skb->protocol = htons(ETH_P_IPV6);
3329 skb_clear_hash(skb);
3330
3331 return 0;
3332 }
3333
3334 static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
3335 {
3336 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
3337 u32 off = skb_mac_header_len(skb);
3338 int ret;
3339
3340 ret = skb_unclone(skb, GFP_ATOMIC);
3341 if (unlikely(ret < 0))
3342 return ret;
3343
3344 ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
3345 if (unlikely(ret < 0))
3346 return ret;
3347
3348 if (skb_is_gso(skb)) {
3349 struct skb_shared_info *shinfo = skb_shinfo(skb);
3350
3351 /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
3352 if (shinfo->gso_type & SKB_GSO_TCPV6) {
3353 shinfo->gso_type &= ~SKB_GSO_TCPV6;
3354 shinfo->gso_type |= SKB_GSO_TCPV4;
3355 }
3356 }
3357
3358 skb->protocol = htons(ETH_P_IP);
3359 skb_clear_hash(skb);
3360
3361 return 0;
3362 }
3363
3364 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
3365 {
3366 __be16 from_proto = skb->protocol;
3367
3368 if (from_proto == htons(ETH_P_IP) &&
3369 to_proto == htons(ETH_P_IPV6))
3370 return bpf_skb_proto_4_to_6(skb);
3371
3372 if (from_proto == htons(ETH_P_IPV6) &&
3373 to_proto == htons(ETH_P_IP))
3374 return bpf_skb_proto_6_to_4(skb);
3375
3376 return -ENOTSUPP;
3377 }
3378
3379 BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
3380 u64, flags)
3381 {
3382 int ret;
3383
3384 if (unlikely(flags))
3385 return -EINVAL;
3386
3387 /* General idea is that this helper does the basic groundwork
3388 * needed for changing the protocol, and eBPF program fills the
3389 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace()
3390 * and other helpers, rather than passing a raw buffer here.
3391 *
3392 * The rationale is to keep this minimal and without a need to
3393 * deal with raw packet data. E.g. even if we passed buffers
3394 * here, the program would still need to call the bpf_lX_csum_replace()
3395 * helpers anyway. Plus, this way we also keep separation of
3396 * concerns, since e.g. bpf_skb_store_bytes() should only take
3397 * care of stores.
3398 *
3399 * Currently, additional options and extension header space are
3400 * not supported, but flags register is reserved so we can adapt
3401 * that. For offloads, we mark packet as dodgy, so that headers
3402 * need to be verified first.
3403 */
3404 ret = bpf_skb_proto_xlat(skb, proto);
3405 bpf_compute_data_pointers(skb);
3406 return ret;
3407 }
3408
3409 static const struct bpf_func_proto bpf_skb_change_proto_proto = {
3410 .func = bpf_skb_change_proto,
3411 .gpl_only = false,
3412 .ret_type = RET_INTEGER,
3413 .arg1_type = ARG_PTR_TO_CTX,
3414 .arg2_type = ARG_ANYTHING,
3415 .arg3_type = ARG_ANYTHING,
3416 };
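/* Illustrative sketch (comment only): a NAT64-style tc program switches the
 * L3 protocol first and then fills in the new header itself, e.g. (bpf_htons
 * comes from libbpf's bpf_endian.h):
 *
 *	if (bpf_skb_change_proto(skb, bpf_htons(ETH_P_IPV6), 0))
 *		return TC_ACT_SHOT;
 *	// now write a full ipv6hdr via bpf_skb_store_bytes() and fix the
 *	// L4 checksum with bpf_l4_csum_replace(), as the comment above says.
 */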
3417
3418 BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
3419 {
3420 /* We only allow a restricted subset to be changed for now. */
3421 if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
3422 !skb_pkt_type_ok(pkt_type)))
3423 return -EINVAL;
3424
3425 skb->pkt_type = pkt_type;
3426 return 0;
3427 }
3428
3429 static const struct bpf_func_proto bpf_skb_change_type_proto = {
3430 .func = bpf_skb_change_type,
3431 .gpl_only = false,
3432 .ret_type = RET_INTEGER,
3433 .arg1_type = ARG_PTR_TO_CTX,
3434 .arg2_type = ARG_ANYTHING,
3435 };
3436
3437 static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
3438 {
3439 switch (skb->protocol) {
3440 case htons(ETH_P_IP):
3441 return sizeof(struct iphdr);
3442 case htons(ETH_P_IPV6):
3443 return sizeof(struct ipv6hdr);
3444 default:
3445 return ~0U;
3446 }
3447 }
3448
3449 #define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
3450 BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3451
3452 #define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
3453 BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
3454
3455 #define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
3456 BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
3457 BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
3458 BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
3459 BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
3460 BPF_F_ADJ_ROOM_ENCAP_L2( \
3461 BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
3462 BPF_F_ADJ_ROOM_DECAP_L3_MASK)
3463
3464 static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
3465 u64 flags)
3466 {
3467 u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
3468 bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
3469 u16 mac_len = 0, inner_net = 0, inner_trans = 0;
3470 unsigned int gso_type = SKB_GSO_DODGY;
3471 int ret;
3472
3473 if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3474 /* udp gso_size delineates datagrams, only allow if fixed */
3475 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3476 !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3477 return -ENOTSUPP;
3478 }
3479
3480 ret = skb_cow_head(skb, len_diff);
3481 if (unlikely(ret < 0))
3482 return ret;
3483
3484 if (encap) {
3485 if (skb->protocol != htons(ETH_P_IP) &&
3486 skb->protocol != htons(ETH_P_IPV6))
3487 return -ENOTSUPP;
3488
3489 if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
3490 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3491 return -EINVAL;
3492
3493 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
3494 flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3495 return -EINVAL;
3496
3497 if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
3498 inner_mac_len < ETH_HLEN)
3499 return -EINVAL;
3500
3501 if (skb->encapsulation)
3502 return -EALREADY;
3503
3504 mac_len = skb->network_header - skb->mac_header;
3505 inner_net = skb->network_header;
3506 if (inner_mac_len > len_diff)
3507 return -EINVAL;
3508 inner_trans = skb->transport_header;
3509 }
3510
3511 ret = bpf_skb_net_hdr_push(skb, off, len_diff);
3512 if (unlikely(ret < 0))
3513 return ret;
3514
3515 if (encap) {
3516 skb->inner_mac_header = inner_net - inner_mac_len;
3517 skb->inner_network_header = inner_net;
3518 skb->inner_transport_header = inner_trans;
3519
3520 if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
3521 skb_set_inner_protocol(skb, htons(ETH_P_TEB));
3522 else
3523 skb_set_inner_protocol(skb, skb->protocol);
3524
3525 skb->encapsulation = 1;
3526 skb_set_network_header(skb, mac_len);
3527
3528 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
3529 gso_type |= SKB_GSO_UDP_TUNNEL;
3530 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
3531 gso_type |= SKB_GSO_GRE;
3532 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3533 gso_type |= SKB_GSO_IPXIP6;
3534 else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3535 gso_type |= SKB_GSO_IPXIP4;
3536
3537 if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
3538 flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
3539 int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
3540 sizeof(struct ipv6hdr) :
3541 sizeof(struct iphdr);
3542
3543 skb_set_transport_header(skb, mac_len + nh_len);
3544 }
3545
3546 /* Match skb->protocol to new outer l3 protocol */
3547 if (skb->protocol == htons(ETH_P_IP) &&
3548 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
3549 skb->protocol = htons(ETH_P_IPV6);
3550 else if (skb->protocol == htons(ETH_P_IPV6) &&
3551 flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
3552 skb->protocol = htons(ETH_P_IP);
3553 }
3554
3555 if (skb_is_gso(skb)) {
3556 struct skb_shared_info *shinfo = skb_shinfo(skb);
3557
3558 /* Header must be checked, and gso_segs recomputed. */
3559 shinfo->gso_type |= gso_type;
3560 shinfo->gso_segs = 0;
3561
3562 /* Due to header growth, MSS needs to be downgraded.
3563 * There is a BUG_ON() when segmenting the frag_list with
3564 * head_frag true, so linearize the skb after downgrading
3565 * the MSS.
3566 */
3567 if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
3568 skb_decrease_gso_size(shinfo, len_diff);
3569 if (shinfo->frag_list)
3570 return skb_linearize(skb);
3571 }
3572 }
3573
3574 return 0;
3575 }
3576
3577 static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
3578 u64 flags)
3579 {
3580 int ret;
3581
3582 if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
3583 BPF_F_ADJ_ROOM_DECAP_L3_MASK |
3584 BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3585 return -EINVAL;
3586
3587 if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
3588 /* udp gso_size delineates datagrams, only allow if fixed */
3589 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
3590 !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3591 return -ENOTSUPP;
3592 }
3593
3594 ret = skb_unclone(skb, GFP_ATOMIC);
3595 if (unlikely(ret < 0))
3596 return ret;
3597
3598 ret = bpf_skb_net_hdr_pop(skb, off, len_diff);
3599 if (unlikely(ret < 0))
3600 return ret;
3601
3602 /* Match skb->protocol to new outer l3 protocol */
3603 if (skb->protocol == htons(ETH_P_IP) &&
3604 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
3605 skb->protocol = htons(ETH_P_IPV6);
3606 else if (skb->protocol == htons(ETH_P_IPV6) &&
3607 flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
3608 skb->protocol = htons(ETH_P_IP);
3609
3610 if (skb_is_gso(skb)) {
3611 struct skb_shared_info *shinfo = skb_shinfo(skb);
3612
3613 /* Due to header shrink, MSS can be upgraded. */
3614 if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
3615 skb_increase_gso_size(shinfo, len_diff);
3616
3617 /* Header must be checked, and gso_segs recomputed. */
3618 shinfo->gso_type |= SKB_GSO_DODGY;
3619 shinfo->gso_segs = 0;
3620 }
3621
3622 return 0;
3623 }
3624
3625 #define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
3626
3627 BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
3628 u32, mode, u64, flags)
3629 {
3630 u32 len_diff_abs = abs(len_diff);
3631 bool shrink = len_diff < 0;
3632 int ret = 0;
3633
3634 if (unlikely(flags || mode))
3635 return -EINVAL;
3636 if (unlikely(len_diff_abs > 0xfffU))
3637 return -EFAULT;
3638
3639 if (!shrink) {
3640 ret = skb_cow(skb, len_diff);
3641 if (unlikely(ret < 0))
3642 return ret;
3643 __skb_push(skb, len_diff_abs);
3644 memset(skb->data, 0, len_diff_abs);
3645 } else {
3646 if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
3647 return -ENOMEM;
3648 __skb_pull(skb, len_diff_abs);
3649 }
3650 if (tls_sw_has_ctx_rx(skb->sk)) {
3651 struct strp_msg *rxm = strp_msg(skb);
3652
3653 rxm->full_len += len_diff;
3654 }
3655 return ret;
3656 }
3657
3658 static const struct bpf_func_proto sk_skb_adjust_room_proto = {
3659 .func = sk_skb_adjust_room,
3660 .gpl_only = false,
3661 .ret_type = RET_INTEGER,
3662 .arg1_type = ARG_PTR_TO_CTX,
3663 .arg2_type = ARG_ANYTHING,
3664 .arg3_type = ARG_ANYTHING,
3665 .arg4_type = ARG_ANYTHING,
3666 };
3667
3668 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
3669 u32, mode, u64, flags)
3670 {
3671 u32 len_cur, len_diff_abs = abs(len_diff);
3672 u32 len_min = bpf_skb_net_base_len(skb);
3673 u32 len_max = BPF_SKB_MAX_LEN;
3674 __be16 proto = skb->protocol;
3675 bool shrink = len_diff < 0;
3676 u32 off;
3677 int ret;
3678
3679 if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
3680 BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
3681 return -EINVAL;
3682 if (unlikely(len_diff_abs > 0xfffU))
3683 return -EFAULT;
3684 if (unlikely(proto != htons(ETH_P_IP) &&
3685 proto != htons(ETH_P_IPV6)))
3686 return -ENOTSUPP;
3687
3688 off = skb_mac_header_len(skb);
3689 switch (mode) {
3690 case BPF_ADJ_ROOM_NET:
3691 off += bpf_skb_net_base_len(skb);
3692 break;
3693 case BPF_ADJ_ROOM_MAC:
3694 break;
3695 default:
3696 return -ENOTSUPP;
3697 }
3698
3699 if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
3700 if (!shrink)
3701 return -EINVAL;
3702
3703 switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
3704 case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
3705 len_min = sizeof(struct iphdr);
3706 break;
3707 case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
3708 len_min = sizeof(struct ipv6hdr);
3709 break;
3710 default:
3711 return -EINVAL;
3712 }
3713 }
3714
3715 len_cur = skb->len - skb_network_offset(skb);
3716 if ((shrink && (len_diff_abs >= len_cur ||
3717 len_cur - len_diff_abs < len_min)) ||
3718 (!shrink && (skb->len + len_diff_abs > len_max &&
3719 !skb_is_gso(skb))))
3720 return -ENOTSUPP;
3721
3722 ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
3723 bpf_skb_net_grow(skb, off, len_diff_abs, flags);
3724 if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
3725 __skb_reset_checksum_unnecessary(skb);
3726
3727 bpf_compute_data_pointers(skb);
3728 return ret;
3729 }
3730
3731 static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
3732 .func = bpf_skb_adjust_room,
3733 .gpl_only = false,
3734 .ret_type = RET_INTEGER,
3735 .arg1_type = ARG_PTR_TO_CTX,
3736 .arg2_type = ARG_ANYTHING,
3737 .arg3_type = ARG_ANYTHING,
3738 .arg4_type = ARG_ANYTHING,
3739 };
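/* Illustrative sketch (not part of this file): how a tc/BPF program might
 * call the bpf_skb_adjust_room() helper above, assuming libbpf's
 * bpf_helpers.h. The 8-byte size and the section name are arbitrary examples.
 *
 *   #include <linux/bpf.h>
 *   #include <linux/pkt_cls.h>
 *   #include <bpf/bpf_helpers.h>
 *
 *   SEC("tc")
 *   int grow_mac_room(struct __sk_buff *skb)
 *   {
 *           // Positive len_diff makes 8 bytes of room right behind the
 *           // MAC header; the program is expected to fill that room
 *           // afterwards, e.g. with bpf_skb_store_bytes().
 *           if (bpf_skb_adjust_room(skb, 8, BPF_ADJ_ROOM_MAC, 0))
 *                   return TC_ACT_SHOT;
 *           return TC_ACT_OK;
 *   }
 *
 *   char _license[] SEC("license") = "GPL";
 */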
3740
3741 static u32 __bpf_skb_min_len(const struct sk_buff *skb)
3742 {
3743 int offset = skb_network_offset(skb);
3744 u32 min_len = 0;
3745
3746 if (offset > 0)
3747 min_len = offset;
3748 if (skb_transport_header_was_set(skb)) {
3749 offset = skb_transport_offset(skb);
3750 if (offset > 0)
3751 min_len = offset;
3752 }
3753 if (skb->ip_summed == CHECKSUM_PARTIAL) {
3754 offset = skb_checksum_start_offset(skb) +
3755 skb->csum_offset + sizeof(__sum16);
3756 if (offset > 0)
3757 min_len = offset;
3758 }
3759 return min_len;
3760 }
3761
3762 static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
3763 {
3764 unsigned int old_len = skb->len;
3765 int ret;
3766
3767 ret = __skb_grow_rcsum(skb, new_len);
3768 if (!ret)
3769 memset(skb->data + old_len, 0, new_len - old_len);
3770 return ret;
3771 }
3772
3773 static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
3774 {
3775 return __skb_trim_rcsum(skb, new_len);
3776 }
3777
3778 static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
3779 u64 flags)
3780 {
3781 u32 max_len = BPF_SKB_MAX_LEN;
3782 u32 min_len = __bpf_skb_min_len(skb);
3783 int ret;
3784
3785 if (unlikely(flags || new_len > max_len || new_len < min_len))
3786 return -EINVAL;
3787 if (skb->encapsulation)
3788 return -ENOTSUPP;
3789
3790 /* The basic idea of this helper is that it's performing the
3791 * needed work to either grow or trim an skb, and eBPF program
3792 * rewrites the rest via helpers like bpf_skb_store_bytes(),
3793 * bpf_lX_csum_replace() and others rather than passing a raw
3794 * buffer here. This one is a slow path helper and intended
3795 * for replies with control messages.
3796 *
3797 * Like in bpf_skb_change_proto(), we want to keep this rather
3798 * minimal and without protocol specifics so that we are able
3799 * to separate concerns: bpf_skb_store_bytes() should remain
3800 * the only helper responsible for writing buffers.
3801 *
3802 * This is really expected to be a slow path operation, used
3803 * for control message replies, so we're implicitly linearizing,
3804 * uncloning and dropping offloads from the skb by this.
3805 */
3806 ret = __bpf_try_make_writable(skb, skb->len);
3807 if (!ret) {
3808 if (new_len > skb->len)
3809 ret = bpf_skb_grow_rcsum(skb, new_len);
3810 else if (new_len < skb->len)
3811 ret = bpf_skb_trim_rcsum(skb, new_len);
3812 if (!ret && skb_is_gso(skb))
3813 skb_gso_reset(skb);
3814 }
3815 return ret;
3816 }
3817
3818 BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3819 u64, flags)
3820 {
3821 int ret = __bpf_skb_change_tail(skb, new_len, flags);
3822
3823 bpf_compute_data_pointers(skb);
3824 return ret;
3825 }
3826
3827 static const struct bpf_func_proto bpf_skb_change_tail_proto = {
3828 .func = bpf_skb_change_tail,
3829 .gpl_only = false,
3830 .ret_type = RET_INTEGER,
3831 .arg1_type = ARG_PTR_TO_CTX,
3832 .arg2_type = ARG_ANYTHING,
3833 .arg3_type = ARG_ANYTHING,
3834 };
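/* Illustrative sketch (not part of this file): trimming a packet with
 * bpf_skb_change_tail() from a tc program, assuming libbpf's bpf_helpers.h;
 * the 256-byte cap is an arbitrary example.
 *
 *   SEC("tc")
 *   int trim_payload(struct __sk_buff *skb)
 *   {
 *           // Keep only the first 256 bytes (headers plus a bit of
 *           // payload); shorter packets are left untouched.
 *           if (skb->len > 256 && bpf_skb_change_tail(skb, 256, 0))
 *                   return TC_ACT_SHOT;
 *           return TC_ACT_OK;
 *   }
 */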
3835
3836 BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
3837 u64, flags)
3838 {
3839 return __bpf_skb_change_tail(skb, new_len, flags);
3840 }
3841
3842 static const struct bpf_func_proto sk_skb_change_tail_proto = {
3843 .func = sk_skb_change_tail,
3844 .gpl_only = false,
3845 .ret_type = RET_INTEGER,
3846 .arg1_type = ARG_PTR_TO_CTX,
3847 .arg2_type = ARG_ANYTHING,
3848 .arg3_type = ARG_ANYTHING,
3849 };
3850
3851 static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
3852 u64 flags)
3853 {
3854 u32 max_len = BPF_SKB_MAX_LEN;
3855 u32 new_len = skb->len + head_room;
3856 int ret;
3857
3858 if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
3859 new_len < skb->len))
3860 return -EINVAL;
3861
3862 ret = skb_cow(skb, head_room);
3863 if (likely(!ret)) {
3864 /* The idea for this helper is that we currently only
3865 * allow expanding the mac header. This means that
3866 * skb->protocol, the network header, etc. stay as is.
3867 * Compared to bpf_skb_change_tail(), we're more
3868 * flexible since we neither need to linearize nor
3869 * reset GSO. The intention is for this helper to be
3870 * used by an L3 skb that needs to push a mac header
3871 * for redirection into an L2 device.
3872 */
3873 __skb_push(skb, head_room);
3874 memset(skb->data, 0, head_room);
3875 skb_reset_mac_header(skb);
3876 skb_reset_mac_len(skb);
3877 }
3878
3879 return ret;
3880 }
3881
3882 BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
3883 u64, flags)
3884 {
3885 int ret = __bpf_skb_change_head(skb, head_room, flags);
3886
3887 bpf_compute_data_pointers(skb);
3888 return ret;
3889 }
3890
3891 static const struct bpf_func_proto bpf_skb_change_head_proto = {
3892 .func = bpf_skb_change_head,
3893 .gpl_only = false,
3894 .ret_type = RET_INTEGER,
3895 .arg1_type = ARG_PTR_TO_CTX,
3896 .arg2_type = ARG_ANYTHING,
3897 .arg3_type = ARG_ANYTHING,
3898 };
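/* Illustrative sketch (not part of this file) of the use case described in
 * the comment above: an L3 skb in a lwt_xmit program reserving room for a
 * MAC header before redirection into an L2 device. The 14-byte size
 * (ETH_HLEN) and the missing header write are assumptions of the example.
 *
 *   SEC("lwt_xmit")
 *   int push_eth_room(struct __sk_buff *skb)
 *   {
 *           // Reserve 14 bytes of headroom; the new bytes are zeroed by
 *           // the kernel and must be overwritten with a real Ethernet
 *           // header before the packet goes out.
 *           if (bpf_skb_change_head(skb, 14, 0))
 *                   return BPF_DROP;
 *           return BPF_OK;
 *   }
 */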
3899
3900 BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
3901 u64, flags)
3902 {
3903 return __bpf_skb_change_head(skb, head_room, flags);
3904 }
3905
3906 static const struct bpf_func_proto sk_skb_change_head_proto = {
3907 .func = sk_skb_change_head,
3908 .gpl_only = false,
3909 .ret_type = RET_INTEGER,
3910 .arg1_type = ARG_PTR_TO_CTX,
3911 .arg2_type = ARG_ANYTHING,
3912 .arg3_type = ARG_ANYTHING,
3913 };
3914
3915 BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
3916 {
3917 return xdp_get_buff_len(xdp);
3918 }
3919
3920 static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
3921 .func = bpf_xdp_get_buff_len,
3922 .gpl_only = false,
3923 .ret_type = RET_INTEGER,
3924 .arg1_type = ARG_PTR_TO_CTX,
3925 };
3926
3927 BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
3928
3929 const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
3930 .func = bpf_xdp_get_buff_len,
3931 .gpl_only = false,
3932 .arg1_type = ARG_PTR_TO_BTF_ID,
3933 .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0],
3934 };
3935
3936 static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
3937 {
3938 return xdp_data_meta_unsupported(xdp) ? 0 :
3939 xdp->data - xdp->data_meta;
3940 }
3941
3942 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
3943 {
3944 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
3945 unsigned long metalen = xdp_get_metalen(xdp);
3946 void *data_start = xdp_frame_end + metalen;
3947 void *data = xdp->data + offset;
3948
3949 if (unlikely(data < data_start ||
3950 data > xdp->data_end - ETH_HLEN))
3951 return -EINVAL;
3952
3953 if (metalen)
3954 memmove(xdp->data_meta + offset,
3955 xdp->data_meta, metalen);
3956 xdp->data_meta += offset;
3957 xdp->data = data;
3958
3959 return 0;
3960 }
3961
3962 static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
3963 .func = bpf_xdp_adjust_head,
3964 .gpl_only = false,
3965 .ret_type = RET_INTEGER,
3966 .arg1_type = ARG_PTR_TO_CTX,
3967 .arg2_type = ARG_ANYTHING,
3968 };
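/* Illustrative sketch (not part of this file): an XDP program popping a
 * 20-byte outer header with bpf_xdp_adjust_head(), assuming libbpf's
 * bpf_helpers.h; the size being stripped is just an example.
 *
 *   SEC("xdp")
 *   int strip_outer(struct xdp_md *ctx)
 *   {
 *           // A positive offset moves xdp->data forward, discarding the
 *           // first 20 bytes; at least ETH_HLEN bytes must remain (see the
 *           // check above), and data_meta is moved along with data.
 *           if (bpf_xdp_adjust_head(ctx, 20))
 *                   return XDP_ABORTED;
 *           return XDP_PASS;
 *   }
 */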
3969
3970 void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
3971 void *buf, unsigned long len, bool flush)
3972 {
3973 unsigned long ptr_len, ptr_off = 0;
3974 skb_frag_t *next_frag, *end_frag;
3975 struct skb_shared_info *sinfo;
3976 void *src, *dst;
3977 u8 *ptr_buf;
3978
3979 if (likely(xdp->data_end - xdp->data >= off + len)) {
3980 src = flush ? buf : xdp->data + off;
3981 dst = flush ? xdp->data + off : buf;
3982 memcpy(dst, src, len);
3983 return;
3984 }
3985
3986 sinfo = xdp_get_shared_info_from_buff(xdp);
3987 end_frag = &sinfo->frags[sinfo->nr_frags];
3988 next_frag = &sinfo->frags[0];
3989
3990 ptr_len = xdp->data_end - xdp->data;
3991 ptr_buf = xdp->data;
3992
3993 while (true) {
3994 if (off < ptr_off + ptr_len) {
3995 unsigned long copy_off = off - ptr_off;
3996 unsigned long copy_len = min(len, ptr_len - copy_off);
3997
3998 src = flush ? buf : ptr_buf + copy_off;
3999 dst = flush ? ptr_buf + copy_off : buf;
4000 memcpy(dst, src, copy_len);
4001
4002 off += copy_len;
4003 len -= copy_len;
4004 buf += copy_len;
4005 }
4006
4007 if (!len || next_frag == end_frag)
4008 break;
4009
4010 ptr_off += ptr_len;
4011 ptr_buf = skb_frag_address(next_frag);
4012 ptr_len = skb_frag_size(next_frag);
4013 next_frag++;
4014 }
4015 }
4016
4017 void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
4018 {
4019 u32 size = xdp->data_end - xdp->data;
4020 struct skb_shared_info *sinfo;
4021 void *addr = xdp->data;
4022 int i;
4023
4024 if (unlikely(offset > 0xffff || len > 0xffff))
4025 return ERR_PTR(-EFAULT);
4026
4027 if (unlikely(offset + len > xdp_get_buff_len(xdp)))
4028 return ERR_PTR(-EINVAL);
4029
4030 if (likely(offset < size)) /* linear area */
4031 goto out;
4032
4033 sinfo = xdp_get_shared_info_from_buff(xdp);
4034 offset -= size;
4035 for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
4036 u32 frag_size = skb_frag_size(&sinfo->frags[i]);
4037
4038 if (offset < frag_size) {
4039 addr = skb_frag_address(&sinfo->frags[i]);
4040 size = frag_size;
4041 break;
4042 }
4043 offset -= frag_size;
4044 }
4045 out:
4046 return offset + len <= size ? addr + offset : NULL;
4047 }
4048
4049 BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
4050 void *, buf, u32, len)
4051 {
4052 void *ptr;
4053
4054 ptr = bpf_xdp_pointer(xdp, offset, len);
4055 if (IS_ERR(ptr))
4056 return PTR_ERR(ptr);
4057
4058 if (!ptr)
4059 bpf_xdp_copy_buf(xdp, offset, buf, len, false);
4060 else
4061 memcpy(buf, ptr, len);
4062
4063 return 0;
4064 }
4065
4066 static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
4067 .func = bpf_xdp_load_bytes,
4068 .gpl_only = false,
4069 .ret_type = RET_INTEGER,
4070 .arg1_type = ARG_PTR_TO_CTX,
4071 .arg2_type = ARG_ANYTHING,
4072 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
4073 .arg4_type = ARG_CONST_SIZE,
4074 };
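/* Illustrative sketch (not part of this file): bpf_xdp_load_bytes() can read
 * across the fragments of a multi-buffer XDP frame, which plain direct packet
 * access cannot reach. The 64-byte buffer is an arbitrary example; assumes
 * libbpf's bpf_helpers.h.
 *
 *   SEC("xdp")
 *   int sample_front(struct xdp_md *ctx)
 *   {
 *           __u8 buf[64] = {};
 *
 *           // Fails (non-zero) if offset + len exceeds the total frame len.
 *           if (bpf_xdp_load_bytes(ctx, 0, buf, sizeof(buf)))
 *                   return XDP_PASS;
 *           // buf[] now holds the first 64 bytes, whether they were in the
 *           // linear area or in a fragment.
 *           return XDP_PASS;
 *   }
 */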
4075
4076 int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
4077 {
4078 return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
4079 }
4080
4081 BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
4082 void *, buf, u32, len)
4083 {
4084 void *ptr;
4085
4086 ptr = bpf_xdp_pointer(xdp, offset, len);
4087 if (IS_ERR(ptr))
4088 return PTR_ERR(ptr);
4089
4090 if (!ptr)
4091 bpf_xdp_copy_buf(xdp, offset, buf, len, true);
4092 else
4093 memcpy(ptr, buf, len);
4094
4095 return 0;
4096 }
4097
4098 static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
4099 .func = bpf_xdp_store_bytes,
4100 .gpl_only = false,
4101 .ret_type = RET_INTEGER,
4102 .arg1_type = ARG_PTR_TO_CTX,
4103 .arg2_type = ARG_ANYTHING,
4104 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
4105 .arg4_type = ARG_CONST_SIZE,
4106 };
4107
4108 int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
4109 {
4110 return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
4111 }
4112
4113 static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
4114 {
4115 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
4116 skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
4117 struct xdp_rxq_info *rxq = xdp->rxq;
4118 unsigned int tailroom;
4119
4120 if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
4121 return -EOPNOTSUPP;
4122
4123 tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
4124 if (unlikely(offset > tailroom))
4125 return -EINVAL;
4126
4127 memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
4128 skb_frag_size_add(frag, offset);
4129 sinfo->xdp_frags_size += offset;
4130 if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
4131 xsk_buff_get_tail(xdp)->data_end += offset;
4132
4133 return 0;
4134 }
4135
4136 static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
4137 struct xdp_mem_info *mem_info, bool release)
4138 {
4139 struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
4140
4141 if (release) {
4142 xsk_buff_del_tail(zc_frag);
4143 __xdp_return(NULL, mem_info, false, zc_frag);
4144 } else {
4145 zc_frag->data_end -= shrink;
4146 }
4147 }
4148
4149 static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
4150 int shrink)
4151 {
4152 struct xdp_mem_info *mem_info = &xdp->rxq->mem;
4153 bool release = skb_frag_size(frag) == shrink;
4154
4155 if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
4156 bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
4157 goto out;
4158 }
4159
4160 if (release) {
4161 struct page *page = skb_frag_page(frag);
4162
4163 __xdp_return(page_address(page), mem_info, false, NULL);
4164 }
4165
4166 out:
4167 return release;
4168 }
4169
4170 static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
4171 {
4172 struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
4173 int i, n_frags_free = 0, len_free = 0;
4174
4175 if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
4176 return -EINVAL;
4177
4178 for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
4179 skb_frag_t *frag = &sinfo->frags[i];
4180 int shrink = min_t(int, offset, skb_frag_size(frag));
4181
4182 len_free += shrink;
4183 offset -= shrink;
4184 if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
4185 n_frags_free++;
4186 } else {
4187 skb_frag_size_sub(frag, shrink);
4188 break;
4189 }
4190 }
4191 sinfo->nr_frags -= n_frags_free;
4192 sinfo->xdp_frags_size -= len_free;
4193
4194 if (unlikely(!sinfo->nr_frags)) {
4195 xdp_buff_clear_frags_flag(xdp);
4196 xdp->data_end -= offset;
4197 }
4198
4199 return 0;
4200 }
4201
4202 BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
4203 {
4204 void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
4205 void *data_end = xdp->data_end + offset;
4206
4207 if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
4208 if (offset < 0)
4209 return bpf_xdp_frags_shrink_tail(xdp, -offset);
4210
4211 return bpf_xdp_frags_increase_tail(xdp, offset);
4212 }
4213
4214 /* Notice that xdp_data_hard_end has reserved some tailroom */
4215 if (unlikely(data_end > data_hard_end))
4216 return -EINVAL;
4217
4218 if (unlikely(data_end < xdp->data + ETH_HLEN))
4219 return -EINVAL;
4220
4221 /* Clear memory area on grow, as it can contain uninit kernel memory */
4222 if (offset > 0)
4223 memset(xdp->data_end, 0, offset);
4224
4225 xdp->data_end = data_end;
4226
4227 return 0;
4228 }
4229
4230 static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
4231 .func = bpf_xdp_adjust_tail,
4232 .gpl_only = false,
4233 .ret_type = RET_INTEGER,
4234 .arg1_type = ARG_PTR_TO_CTX,
4235 .arg2_type = ARG_ANYTHING,
4236 };
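/* Illustrative sketch (not part of this file): growing the tail of an XDP
 * frame with bpf_xdp_adjust_tail(), assuming libbpf's bpf_helpers.h; the
 * 4-byte trailer is just an example.
 *
 *   SEC("xdp")
 *   int append_trailer(struct xdp_md *ctx)
 *   {
 *           // A positive offset grows the tail; the new area is zeroed by
 *           // the kernel (see the memset above). The call fails when the
 *           // available tailroom is exhausted, so the result must be checked.
 *           if (bpf_xdp_adjust_tail(ctx, 4))
 *                   return XDP_PASS;
 *           return XDP_PASS;
 *   }
 */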
4237
4238 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
4239 {
4240 void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
4241 void *meta = xdp->data_meta + offset;
4242 unsigned long metalen = xdp->data - meta;
4243
4244 if (xdp_data_meta_unsupported(xdp))
4245 return -ENOTSUPP;
4246 if (unlikely(meta < xdp_frame_end ||
4247 meta > xdp->data))
4248 return -EINVAL;
4249 if (unlikely(xdp_metalen_invalid(metalen)))
4250 return -EACCES;
4251
4252 xdp->data_meta = meta;
4253
4254 return 0;
4255 }
4256
4257 static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
4258 .func = bpf_xdp_adjust_meta,
4259 .gpl_only = false,
4260 .ret_type = RET_INTEGER,
4261 .arg1_type = ARG_PTR_TO_CTX,
4262 .arg2_type = ARG_ANYTHING,
4263 };
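/* Illustrative sketch (not part of this file) of the usual data_meta
 * pattern: reserve a few metadata bytes in front of the packet so that a
 * later consumer (e.g. a tc program reading skb->data_meta) can pick them up.
 * Assumes libbpf's bpf_helpers.h; the tag value is arbitrary.
 *
 *   SEC("xdp")
 *   int tag_packet(struct xdp_md *ctx)
 *   {
 *           __u32 *meta;
 *
 *           // A negative offset grows the metadata area into the headroom.
 *           if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
 *                   return XDP_PASS;
 *
 *           meta = (void *)(long)ctx->data_meta;
 *           if ((void *)(meta + 1) > (void *)(long)ctx->data)
 *                   return XDP_PASS;  // verifier-mandated bounds check
 *           *meta = 0xcafe;           // example tag
 *           return XDP_PASS;
 *   }
 */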
4264
4265 /**
4266 * DOC: xdp redirect
4267 *
4268 * XDP_REDIRECT works by a three-step process, implemented in the functions
4269 * below:
4270 *
4271 * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
4272 * of the redirect and store it (along with some other metadata) in a per-CPU
4273 * struct bpf_redirect_info.
4274 *
4275 * 2. When the program returns the XDP_REDIRECT return code, the driver will
4276 * call xdp_do_redirect() which will use the information in struct
4277 * bpf_redirect_info to actually enqueue the frame into a map type-specific
4278 * bulk queue structure.
4279 *
4280 * 3. Before exiting its NAPI poll loop, the driver will call
4281 * xdp_do_flush(), which will flush all the different bulk queues,
4282 * thus completing the redirect. Note that xdp_do_flush() must be
4283 * called before napi_complete_done() in the driver, as the
4284 * XDP_REDIRECT logic relies on being inside a single NAPI instance
4285 * through to the xdp_do_flush() call for RCU protection of all
4286 * in-kernel data structures.
4287 */
4288 /*
4289 * Pointers to the map entries will be kept around for this whole sequence of
4290 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
4291 * the core code; instead, the RCU protection relies on everything happening
4292 * inside a single NAPI poll sequence, which means it's between a pair of calls
4293 * to local_bh_disable()/local_bh_enable().
4294 *
4295 * The map entries are marked as __rcu and the map code makes sure to
4296 * dereference those pointers with rcu_dereference_check() in a way that works
4297 * for both sections that hold an rcu_read_lock() and sections that are
4298 * called from NAPI without a separate rcu_read_lock(). The code below does not
4299 * use RCU annotations, but relies on those in the map code.
4300 */
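/* Illustrative sketch (not part of this file) of the BPF-program side of
 * step 1 above, assuming libbpf's bpf_helpers.h. The devmap name, its size
 * and the use of XDP_PASS as the "on failure" action in the flags argument
 * are assumptions of the example.
 *
 *   struct {
 *           __uint(type, BPF_MAP_TYPE_DEVMAP);
 *           __uint(max_entries, 8);
 *           __type(key, __u32);
 *           __type(value, __u32);
 *   } tx_port SEC(".maps");
 *
 *   SEC("xdp")
 *   int xdp_redirect_prog(struct xdp_md *ctx)
 *   {
 *           // Records the target in the per-CPU bpf_redirect_info and
 *           // returns XDP_REDIRECT; the driver then performs steps 2 and 3.
 *           return bpf_redirect_map(&tx_port, 0, XDP_PASS);
 *   }
 */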
4301 void xdp_do_flush(void)
4302 {
4303 __dev_flush();
4304 __cpu_map_flush();
4305 __xsk_map_flush();
4306 }
4307 EXPORT_SYMBOL_GPL(xdp_do_flush);
4308
4309 void bpf_clear_redirect_map(struct bpf_map *map)
4310 {
4311 struct bpf_redirect_info *ri;
4312 int cpu;
4313
4314 for_each_possible_cpu(cpu) {
4315 ri = per_cpu_ptr(&bpf_redirect_info, cpu);
4316 /* Avoid polluting remote cacheline due to writes if
4317 * not needed. Once we pass this test, we need the
4318 * cmpxchg() to make sure it hasn't been changed in
4319 * the meantime by remote CPU.
4320 */
4321 if (unlikely(READ_ONCE(ri->map) == map))
4322 cmpxchg(&ri->map, map, NULL);
4323 }
4324 }
4325
4326 DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
4327 EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
4328
4329 u32 xdp_master_redirect(struct xdp_buff *xdp)
4330 {
4331 struct net_device *master, *slave;
4332 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4333
4334 master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
4335 slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
4336 if (slave && slave != xdp->rxq->dev) {
4337 /* The target device is different from the receiving device, so
4338 * redirect it to the new device.
4339 * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
4340 * drivers to unmap the packet from their rx ring.
4341 */
4342 ri->tgt_index = slave->ifindex;
4343 ri->map_id = INT_MAX;
4344 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4345 return XDP_REDIRECT;
4346 }
4347 return XDP_TX;
4348 }
4349 EXPORT_SYMBOL_GPL(xdp_master_redirect);
4350
4351 static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
4352 struct net_device *dev,
4353 struct xdp_buff *xdp,
4354 struct bpf_prog *xdp_prog)
4355 {
4356 enum bpf_map_type map_type = ri->map_type;
4357 void *fwd = ri->tgt_value;
4358 u32 map_id = ri->map_id;
4359 int err;
4360
4361 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
4362 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4363
4364 err = __xsk_map_redirect(fwd, xdp);
4365 if (unlikely(err))
4366 goto err;
4367
4368 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4369 return 0;
4370 err:
4371 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4372 return err;
4373 }
4374
4375 static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
4376 struct net_device *dev,
4377 struct xdp_frame *xdpf,
4378 struct bpf_prog *xdp_prog)
4379 {
4380 enum bpf_map_type map_type = ri->map_type;
4381 void *fwd = ri->tgt_value;
4382 u32 map_id = ri->map_id;
4383 u32 flags = ri->flags;
4384 struct bpf_map *map;
4385 int err;
4386
4387 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
4388 ri->flags = 0;
4389 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4390
4391 if (unlikely(!xdpf)) {
4392 err = -EOVERFLOW;
4393 goto err;
4394 }
4395
4396 switch (map_type) {
4397 case BPF_MAP_TYPE_DEVMAP:
4398 fallthrough;
4399 case BPF_MAP_TYPE_DEVMAP_HASH:
4400 if (unlikely(flags & BPF_F_BROADCAST)) {
4401 map = READ_ONCE(ri->map);
4402
4403 /* The map pointer is cleared when the map is being torn
4404 * down by bpf_clear_redirect_map()
4405 */
4406 if (unlikely(!map)) {
4407 err = -ENOENT;
4408 break;
4409 }
4410
4411 WRITE_ONCE(ri->map, NULL);
4412 err = dev_map_enqueue_multi(xdpf, dev, map,
4413 flags & BPF_F_EXCLUDE_INGRESS);
4414 } else {
4415 err = dev_map_enqueue(fwd, xdpf, dev);
4416 }
4417 break;
4418 case BPF_MAP_TYPE_CPUMAP:
4419 err = cpu_map_enqueue(fwd, xdpf, dev);
4420 break;
4421 case BPF_MAP_TYPE_UNSPEC:
4422 if (map_id == INT_MAX) {
4423 fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
4424 if (unlikely(!fwd)) {
4425 err = -EINVAL;
4426 break;
4427 }
4428 err = dev_xdp_enqueue(fwd, xdpf, dev);
4429 break;
4430 }
4431 fallthrough;
4432 default:
4433 err = -EBADRQC;
4434 }
4435
4436 if (unlikely(err))
4437 goto err;
4438
4439 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4440 return 0;
4441 err:
4442 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4443 return err;
4444 }
4445
4446 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
4447 struct bpf_prog *xdp_prog)
4448 {
4449 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4450 enum bpf_map_type map_type = ri->map_type;
4451
4452 if (map_type == BPF_MAP_TYPE_XSKMAP)
4453 return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
4454
4455 return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
4456 xdp_prog);
4457 }
4458 EXPORT_SYMBOL_GPL(xdp_do_redirect);
4459
4460 int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
4461 struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
4462 {
4463 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4464 enum bpf_map_type map_type = ri->map_type;
4465
4466 if (map_type == BPF_MAP_TYPE_XSKMAP)
4467 return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
4468
4469 return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
4470 }
4471 EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
4472
4473 static int xdp_do_generic_redirect_map(struct net_device *dev,
4474 struct sk_buff *skb,
4475 struct xdp_buff *xdp,
4476 struct bpf_prog *xdp_prog, void *fwd,
4477 enum bpf_map_type map_type, u32 map_id,
4478 u32 flags)
4479 {
4480 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4481 struct bpf_map *map;
4482 int err;
4483
4484 switch (map_type) {
4485 case BPF_MAP_TYPE_DEVMAP:
4486 fallthrough;
4487 case BPF_MAP_TYPE_DEVMAP_HASH:
4488 if (unlikely(flags & BPF_F_BROADCAST)) {
4489 map = READ_ONCE(ri->map);
4490
4491 /* The map pointer is cleared when the map is being torn
4492 * down by bpf_clear_redirect_map()
4493 */
4494 if (unlikely(!map)) {
4495 err = -ENOENT;
4496 break;
4497 }
4498
4499 WRITE_ONCE(ri->map, NULL);
4500 err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
4501 flags & BPF_F_EXCLUDE_INGRESS);
4502 } else {
4503 err = dev_map_generic_redirect(fwd, skb, xdp_prog);
4504 }
4505 if (unlikely(err))
4506 goto err;
4507 break;
4508 case BPF_MAP_TYPE_XSKMAP:
4509 err = xsk_generic_rcv(fwd, xdp);
4510 if (err)
4511 goto err;
4512 consume_skb(skb);
4513 break;
4514 case BPF_MAP_TYPE_CPUMAP:
4515 err = cpu_map_generic_redirect(fwd, skb);
4516 if (unlikely(err))
4517 goto err;
4518 break;
4519 default:
4520 err = -EBADRQC;
4521 goto err;
4522 }
4523
4524 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
4525 return 0;
4526 err:
4527 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
4528 return err;
4529 }
4530
4531 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
4532 struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
4533 {
4534 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4535 enum bpf_map_type map_type = ri->map_type;
4536 void *fwd = ri->tgt_value;
4537 u32 map_id = ri->map_id;
4538 u32 flags = ri->flags;
4539 int err;
4540
4541 ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
4542 ri->flags = 0;
4543 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4544
4545 if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
4546 fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
4547 if (unlikely(!fwd)) {
4548 err = -EINVAL;
4549 goto err;
4550 }
4551
4552 err = xdp_ok_fwd_dev(fwd, skb->len);
4553 if (unlikely(err))
4554 goto err;
4555
4556 skb->dev = fwd;
4557 _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
4558 generic_xdp_tx(skb, xdp_prog);
4559 return 0;
4560 }
4561
4562 return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
4563 err:
4564 _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
4565 return err;
4566 }
4567
4568 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
4569 {
4570 struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
4571
4572 if (unlikely(flags))
4573 return XDP_ABORTED;
4574
4575 /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
4576 * by map_idr) is used for ifindex based XDP redirect.
4577 */
4578 ri->tgt_index = ifindex;
4579 ri->map_id = INT_MAX;
4580 ri->map_type = BPF_MAP_TYPE_UNSPEC;
4581
4582 return XDP_REDIRECT;
4583 }
4584
4585 static const struct bpf_func_proto bpf_xdp_redirect_proto = {
4586 .func = bpf_xdp_redirect,
4587 .gpl_only = false,
4588 .ret_type = RET_INTEGER,
4589 .arg1_type = ARG_ANYTHING,
4590 .arg2_type = ARG_ANYTHING,
4591 };
4592
4593 BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
4594 u64, flags)
4595 {
4596 return map->ops->map_redirect(map, key, flags);
4597 }
4598
4599 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
4600 .func = bpf_xdp_redirect_map,
4601 .gpl_only = false,
4602 .ret_type = RET_INTEGER,
4603 .arg1_type = ARG_CONST_MAP_PTR,
4604 .arg2_type = ARG_ANYTHING,
4605 .arg3_type = ARG_ANYTHING,
4606 };
4607
4608 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
4609 unsigned long off, unsigned long len)
4610 {
4611 void *ptr = skb_header_pointer(skb, off, len, dst_buff);
4612
4613 if (unlikely(!ptr))
4614 return len;
4615 if (ptr != dst_buff)
4616 memcpy(dst_buff, ptr, len);
4617
4618 return 0;
4619 }
4620
4621 BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
4622 u64, flags, void *, meta, u64, meta_size)
4623 {
4624 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
4625
4626 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
4627 return -EINVAL;
4628 if (unlikely(!skb || skb_size > skb->len))
4629 return -EFAULT;
4630
4631 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
4632 bpf_skb_copy);
4633 }
4634
4635 static const struct bpf_func_proto bpf_skb_event_output_proto = {
4636 .func = bpf_skb_event_output,
4637 .gpl_only = true,
4638 .ret_type = RET_INTEGER,
4639 .arg1_type = ARG_PTR_TO_CTX,
4640 .arg2_type = ARG_CONST_MAP_PTR,
4641 .arg3_type = ARG_ANYTHING,
4642 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4643 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
4644 };
4645
4646 BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)
4647
4648 const struct bpf_func_proto bpf_skb_output_proto = {
4649 .func = bpf_skb_event_output,
4650 .gpl_only = true,
4651 .ret_type = RET_INTEGER,
4652 .arg1_type = ARG_PTR_TO_BTF_ID,
4653 .arg1_btf_id = &bpf_skb_output_btf_ids[0],
4654 .arg2_type = ARG_CONST_MAP_PTR,
4655 .arg3_type = ARG_ANYTHING,
4656 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4657 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
4658 };
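/* Illustrative sketch (not part of this file): bpf_skb_event_output() is
 * reached from BPF programs as bpf_perf_event_output() on an skb context.
 * The metadata struct, map name and 64-byte sample size are assumptions;
 * libbpf's bpf_helpers.h is assumed.
 *
 *   struct event {
 *           __u32 pkt_len;
 *   };
 *
 *   struct {
 *           __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
 *           __uint(key_size, sizeof(__u32));
 *           __uint(value_size, sizeof(__u32));
 *   } events SEC(".maps");
 *
 *   SEC("tc")
 *   int mirror_headers(struct __sk_buff *skb)
 *   {
 *           struct event ev = { .pkt_len = skb->len };
 *           __u64 sample = skb->len < 64 ? skb->len : 64;
 *
 *           // The number of packet bytes appended after the metadata is
 *           // encoded in the upper 32 bits of flags (BPF_F_CTXLEN_MASK);
 *           // it must not exceed skb->len, see the check above.
 *           bpf_perf_event_output(skb, &events,
 *                                 BPF_F_CURRENT_CPU | (sample << 32),
 *                                 &ev, sizeof(ev));
 *           return TC_ACT_OK;
 *   }
 */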
4659
4660 static unsigned short bpf_tunnel_key_af(u64 flags)
4661 {
4662 return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
4663 }
4664
4665 BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
4666 u32, size, u64, flags)
4667 {
4668 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
4669 u8 compat[sizeof(struct bpf_tunnel_key)];
4670 void *to_orig = to;
4671 int err;
4672
4673 if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
4674 BPF_F_TUNINFO_FLAGS)))) {
4675 err = -EINVAL;
4676 goto err_clear;
4677 }
4678 if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
4679 err = -EPROTO;
4680 goto err_clear;
4681 }
4682 if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
4683 err = -EINVAL;
4684 switch (size) {
4685 case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
4686 case offsetof(struct bpf_tunnel_key, tunnel_label):
4687 case offsetof(struct bpf_tunnel_key, tunnel_ext):
4688 goto set_compat;
4689 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
4690 /* Fixup deprecated structure layouts here, so we have
4691 * a common path later on.
4692 */
4693 if (ip_tunnel_info_af(info) != AF_INET)
4694 goto err_clear;
4695 set_compat:
4696 to = (struct bpf_tunnel_key *)compat;
4697 break;
4698 default:
4699 goto err_clear;
4700 }
4701 }
4702
4703 to->tunnel_id = be64_to_cpu(info->key.tun_id);
4704 to->tunnel_tos = info->key.tos;
4705 to->tunnel_ttl = info->key.ttl;
4706 if (flags & BPF_F_TUNINFO_FLAGS)
4707 to->tunnel_flags = info->key.tun_flags;
4708 else
4709 to->tunnel_ext = 0;
4710
4711 if (flags & BPF_F_TUNINFO_IPV6) {
4712 memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
4713 sizeof(to->remote_ipv6));
4714 memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
4715 sizeof(to->local_ipv6));
4716 to->tunnel_label = be32_to_cpu(info->key.label);
4717 } else {
4718 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
4719 memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
4720 to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
4721 memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
4722 to->tunnel_label = 0;
4723 }
4724
4725 if (unlikely(size != sizeof(struct bpf_tunnel_key)))
4726 memcpy(to_orig, to, size);
4727
4728 return 0;
4729 err_clear:
4730 memset(to_orig, 0, size);
4731 return err;
4732 }
4733
4734 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
4735 .func = bpf_skb_get_tunnel_key,
4736 .gpl_only = false,
4737 .ret_type = RET_INTEGER,
4738 .arg1_type = ARG_PTR_TO_CTX,
4739 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
4740 .arg3_type = ARG_CONST_SIZE,
4741 .arg4_type = ARG_ANYTHING,
4742 };
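/* Illustrative sketch (not part of this file): reading the outer tunnel
 * parameters with bpf_skb_get_tunnel_key(). This is only meaningful on a
 * collect_md tunnel device; otherwise the checks above fail with -EINVAL or
 * -EPROTO. Assumes libbpf's bpf_helpers.h.
 *
 *   SEC("tc")
 *   int read_tunnel(struct __sk_buff *skb)
 *   {
 *           struct bpf_tunnel_key key = {};
 *
 *           if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0))
 *                   return TC_ACT_OK;
 *           // key.tunnel_id and key.remote_ipv4 (host byte order) now
 *           // describe the outer encapsulation of the received packet.
 *           return TC_ACT_OK;
 *   }
 */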
4743
4744 BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
4745 {
4746 const struct ip_tunnel_info *info = skb_tunnel_info(skb);
4747 int err;
4748
4749 if (unlikely(!info ||
4750 !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
4751 err = -ENOENT;
4752 goto err_clear;
4753 }
4754 if (unlikely(size < info->options_len)) {
4755 err = -ENOMEM;
4756 goto err_clear;
4757 }
4758
4759 ip_tunnel_info_opts_get(to, info);
4760 if (size > info->options_len)
4761 memset(to + info->options_len, 0, size - info->options_len);
4762
4763 return info->options_len;
4764 err_clear:
4765 memset(to, 0, size);
4766 return err;
4767 }
4768
4769 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
4770 .func = bpf_skb_get_tunnel_opt,
4771 .gpl_only = false,
4772 .ret_type = RET_INTEGER,
4773 .arg1_type = ARG_PTR_TO_CTX,
4774 .arg2_type = ARG_PTR_TO_UNINIT_MEM,
4775 .arg3_type = ARG_CONST_SIZE,
4776 };
4777
4778 static struct metadata_dst __percpu *md_dst;
4779
4780 BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
4781 const struct bpf_tunnel_key *, from, u32, size, u64, flags)
4782 {
4783 struct metadata_dst *md = this_cpu_ptr(md_dst);
4784 u8 compat[sizeof(struct bpf_tunnel_key)];
4785 struct ip_tunnel_info *info;
4786
4787 if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
4788 BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
4789 BPF_F_NO_TUNNEL_KEY)))
4790 return -EINVAL;
4791 if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
4792 switch (size) {
4793 case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
4794 case offsetof(struct bpf_tunnel_key, tunnel_label):
4795 case offsetof(struct bpf_tunnel_key, tunnel_ext):
4796 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
4797 /* Fixup deprecated structure layouts here, so we have
4798 * a common path later on.
4799 */
4800 memcpy(compat, from, size);
4801 memset(compat + size, 0, sizeof(compat) - size);
4802 from = (const struct bpf_tunnel_key *) compat;
4803 break;
4804 default:
4805 return -EINVAL;
4806 }
4807 }
4808 if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
4809 from->tunnel_ext))
4810 return -EINVAL;
4811
4812 skb_dst_drop(skb);
4813 dst_hold((struct dst_entry *) md);
4814 skb_dst_set(skb, (struct dst_entry *) md);
4815
4816 info = &md->u.tun_info;
4817 memset(info, 0, sizeof(*info));
4818 info->mode = IP_TUNNEL_INFO_TX;
4819
4820 info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
4821 if (flags & BPF_F_DONT_FRAGMENT)
4822 info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
4823 if (flags & BPF_F_ZERO_CSUM_TX)
4824 info->key.tun_flags &= ~TUNNEL_CSUM;
4825 if (flags & BPF_F_SEQ_NUMBER)
4826 info->key.tun_flags |= TUNNEL_SEQ;
4827 if (flags & BPF_F_NO_TUNNEL_KEY)
4828 info->key.tun_flags &= ~TUNNEL_KEY;
4829
4830 info->key.tun_id = cpu_to_be64(from->tunnel_id);
4831 info->key.tos = from->tunnel_tos;
4832 info->key.ttl = from->tunnel_ttl;
4833
4834 if (flags & BPF_F_TUNINFO_IPV6) {
4835 info->mode |= IP_TUNNEL_INFO_IPV6;
4836 memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
4837 sizeof(from->remote_ipv6));
4838 memcpy(&info->key.u.ipv6.src, from->local_ipv6,
4839 sizeof(from->local_ipv6));
4840 info->key.label = cpu_to_be32(from->tunnel_label) &
4841 IPV6_FLOWLABEL_MASK;
4842 } else {
4843 info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
4844 info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
4845 info->key.flow_flags = FLOWI_FLAG_ANYSRC;
4846 }
4847
4848 return 0;
4849 }
4850
4851 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
4852 .func = bpf_skb_set_tunnel_key,
4853 .gpl_only = false,
4854 .ret_type = RET_INTEGER,
4855 .arg1_type = ARG_PTR_TO_CTX,
4856 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4857 .arg3_type = ARG_CONST_SIZE,
4858 .arg4_type = ARG_ANYTHING,
4859 };
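/* Illustrative sketch (not part of this file): populating the per-CPU
 * metadata dst set up above with bpf_skb_set_tunnel_key() and handing the
 * packet to a collect_md tunnel device. The VNI, peer address and
 * VXLAN_IFINDEX are assumptions of the example; libbpf's bpf_helpers.h is
 * assumed.
 *
 *   #define VXLAN_IFINDEX 4   // ifindex of a vxlan device in collect_md mode
 *
 *   SEC("tc")
 *   int encap(struct __sk_buff *skb)
 *   {
 *           struct bpf_tunnel_key key = {
 *                   .tunnel_id   = 42,
 *                   .remote_ipv4 = 0xac100164,  // 172.16.1.100, host order
 *                   .tunnel_ttl  = 64,
 *           };
 *
 *           if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 *                                      BPF_F_ZERO_CSUM_TX))
 *                   return TC_ACT_SHOT;
 *           return bpf_redirect(VXLAN_IFINDEX, 0);
 *   }
 */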
4860
4861 BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
4862 const u8 *, from, u32, size)
4863 {
4864 struct ip_tunnel_info *info = skb_tunnel_info(skb);
4865 const struct metadata_dst *md = this_cpu_ptr(md_dst);
4866
4867 if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
4868 return -EINVAL;
4869 if (unlikely(size > IP_TUNNEL_OPTS_MAX))
4870 return -ENOMEM;
4871
4872 ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
4873
4874 return 0;
4875 }
4876
4877 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
4878 .func = bpf_skb_set_tunnel_opt,
4879 .gpl_only = false,
4880 .ret_type = RET_INTEGER,
4881 .arg1_type = ARG_PTR_TO_CTX,
4882 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
4883 .arg3_type = ARG_CONST_SIZE,
4884 };
4885
4886 static const struct bpf_func_proto *
4887 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
4888 {
4889 if (!md_dst) {
4890 struct metadata_dst __percpu *tmp;
4891
4892 tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
4893 METADATA_IP_TUNNEL,
4894 GFP_KERNEL);
4895 if (!tmp)
4896 return NULL;
4897 if (cmpxchg(&md_dst, NULL, tmp))
4898 metadata_dst_free_percpu(tmp);
4899 }
4900
4901 switch (which) {
4902 case BPF_FUNC_skb_set_tunnel_key:
4903 return &bpf_skb_set_tunnel_key_proto;
4904 case BPF_FUNC_skb_set_tunnel_opt:
4905 return &bpf_skb_set_tunnel_opt_proto;
4906 default:
4907 return NULL;
4908 }
4909 }
4910
4911 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
4912 u32, idx)
4913 {
4914 struct bpf_array *array = container_of(map, struct bpf_array, map);
4915 struct cgroup *cgrp;
4916 struct sock *sk;
4917
4918 sk = skb_to_full_sk(skb);
4919 if (!sk || !sk_fullsock(sk))
4920 return -ENOENT;
4921 if (unlikely(idx >= array->map.max_entries))
4922 return -E2BIG;
4923
4924 cgrp = READ_ONCE(array->ptrs[idx]);
4925 if (unlikely(!cgrp))
4926 return -EAGAIN;
4927
4928 return sk_under_cgroup_hierarchy(sk, cgrp);
4929 }
4930
4931 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
4932 .func = bpf_skb_under_cgroup,
4933 .gpl_only = false,
4934 .ret_type = RET_INTEGER,
4935 .arg1_type = ARG_PTR_TO_CTX,
4936 .arg2_type = ARG_CONST_MAP_PTR,
4937 .arg3_type = ARG_ANYTHING,
4938 };
4939
4940 #ifdef CONFIG_SOCK_CGROUP_DATA
4941 static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
4942 {
4943 struct cgroup *cgrp;
4944
4945 sk = sk_to_full_sk(sk);
4946 if (!sk || !sk_fullsock(sk))
4947 return 0;
4948
4949 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
4950 return cgroup_id(cgrp);
4951 }
4952
4953 BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
4954 {
4955 return __bpf_sk_cgroup_id(skb->sk);
4956 }
4957
4958 static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
4959 .func = bpf_skb_cgroup_id,
4960 .gpl_only = false,
4961 .ret_type = RET_INTEGER,
4962 .arg1_type = ARG_PTR_TO_CTX,
4963 };
4964
4965 static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
4966 int ancestor_level)
4967 {
4968 struct cgroup *ancestor;
4969 struct cgroup *cgrp;
4970
4971 sk = sk_to_full_sk(sk);
4972 if (!sk || !sk_fullsock(sk))
4973 return 0;
4974
4975 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
4976 ancestor = cgroup_ancestor(cgrp, ancestor_level);
4977 if (!ancestor)
4978 return 0;
4979
4980 return cgroup_id(ancestor);
4981 }
4982
4983 BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
4984 ancestor_level)
4985 {
4986 return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
4987 }
4988
4989 static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
4990 .func = bpf_skb_ancestor_cgroup_id,
4991 .gpl_only = false,
4992 .ret_type = RET_INTEGER,
4993 .arg1_type = ARG_PTR_TO_CTX,
4994 .arg2_type = ARG_ANYTHING,
4995 };
4996
4997 BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
4998 {
4999 return __bpf_sk_cgroup_id(sk);
5000 }
5001
5002 static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
5003 .func = bpf_sk_cgroup_id,
5004 .gpl_only = false,
5005 .ret_type = RET_INTEGER,
5006 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5007 };
5008
5009 BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
5010 {
5011 return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
5012 }
5013
5014 static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
5015 .func = bpf_sk_ancestor_cgroup_id,
5016 .gpl_only = false,
5017 .ret_type = RET_INTEGER,
5018 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5019 .arg2_type = ARG_ANYTHING,
5020 };
5021 #endif
5022
5023 static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
5024 unsigned long off, unsigned long len)
5025 {
5026 struct xdp_buff *xdp = (struct xdp_buff *)ctx;
5027
5028 bpf_xdp_copy_buf(xdp, off, dst, len, false);
5029 return 0;
5030 }
5031
5032 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
5033 u64, flags, void *, meta, u64, meta_size)
5034 {
5035 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32;
5036
5037 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
5038 return -EINVAL;
5039
5040 if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
5041 return -EFAULT;
5042
5043 return bpf_event_output(map, flags, meta, meta_size, xdp,
5044 xdp_size, bpf_xdp_copy);
5045 }
5046
5047 static const struct bpf_func_proto bpf_xdp_event_output_proto = {
5048 .func = bpf_xdp_event_output,
5049 .gpl_only = true,
5050 .ret_type = RET_INTEGER,
5051 .arg1_type = ARG_PTR_TO_CTX,
5052 .arg2_type = ARG_CONST_MAP_PTR,
5053 .arg3_type = ARG_ANYTHING,
5054 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5055 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
5056 };
5057
5058 BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)
5059
5060 const struct bpf_func_proto bpf_xdp_output_proto = {
5061 .func = bpf_xdp_event_output,
5062 .gpl_only = true,
5063 .ret_type = RET_INTEGER,
5064 .arg1_type = ARG_PTR_TO_BTF_ID,
5065 .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
5066 .arg2_type = ARG_CONST_MAP_PTR,
5067 .arg3_type = ARG_ANYTHING,
5068 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5069 .arg5_type = ARG_CONST_SIZE_OR_ZERO,
5070 };
5071
5072 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
5073 {
5074 return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
5075 }
5076
5077 static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
5078 .func = bpf_get_socket_cookie,
5079 .gpl_only = false,
5080 .ret_type = RET_INTEGER,
5081 .arg1_type = ARG_PTR_TO_CTX,
5082 };
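/* Illustrative sketch (not part of this file): using the socket cookie as a
 * stable per-socket key, e.g. for byte accounting from a cgroup/skb program.
 * Map name and sizing are assumptions; libbpf's bpf_helpers.h is assumed.
 *
 *   struct {
 *           __uint(type, BPF_MAP_TYPE_HASH);
 *           __uint(max_entries, 1024);
 *           __type(key, __u64);
 *           __type(value, __u64);
 *   } bytes_per_sock SEC(".maps");
 *
 *   SEC("cgroup_skb/egress")
 *   int count_bytes(struct __sk_buff *skb)
 *   {
 *           __u64 cookie = bpf_get_socket_cookie(skb);
 *           __u64 init = skb->len, *val;
 *
 *           if (!cookie)            // no full socket attached, see above
 *                   return 1;
 *           val = bpf_map_lookup_elem(&bytes_per_sock, &cookie);
 *           if (val)
 *                   __sync_fetch_and_add(val, skb->len);
 *           else
 *                   bpf_map_update_elem(&bytes_per_sock, &cookie, &init,
 *                                       BPF_ANY);
 *           return 1;               // allow the packet
 *   }
 */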
5083
5084 BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
5085 {
5086 return __sock_gen_cookie(ctx->sk);
5087 }
5088
5089 static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
5090 .func = bpf_get_socket_cookie_sock_addr,
5091 .gpl_only = false,
5092 .ret_type = RET_INTEGER,
5093 .arg1_type = ARG_PTR_TO_CTX,
5094 };
5095
5096 BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
5097 {
5098 return __sock_gen_cookie(ctx);
5099 }
5100
5101 static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
5102 .func = bpf_get_socket_cookie_sock,
5103 .gpl_only = false,
5104 .ret_type = RET_INTEGER,
5105 .arg1_type = ARG_PTR_TO_CTX,
5106 };
5107
5108 BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
5109 {
5110 return sk ? sock_gen_cookie(sk) : 0;
5111 }
5112
5113 const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
5114 .func = bpf_get_socket_ptr_cookie,
5115 .gpl_only = false,
5116 .ret_type = RET_INTEGER,
5117 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
5118 };
5119
5120 BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
5121 {
5122 return __sock_gen_cookie(ctx->sk);
5123 }
5124
5125 static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
5126 .func = bpf_get_socket_cookie_sock_ops,
5127 .gpl_only = false,
5128 .ret_type = RET_INTEGER,
5129 .arg1_type = ARG_PTR_TO_CTX,
5130 };
5131
5132 static u64 __bpf_get_netns_cookie(struct sock *sk)
5133 {
5134 const struct net *net = sk ? sock_net(sk) : &init_net;
5135
5136 return net->net_cookie;
5137 }
5138
5139 BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
5140 {
5141 return __bpf_get_netns_cookie(ctx);
5142 }
5143
5144 static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
5145 .func = bpf_get_netns_cookie_sock,
5146 .gpl_only = false,
5147 .ret_type = RET_INTEGER,
5148 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5149 };
5150
5151 BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
5152 {
5153 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5154 }
5155
5156 static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
5157 .func = bpf_get_netns_cookie_sock_addr,
5158 .gpl_only = false,
5159 .ret_type = RET_INTEGER,
5160 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5161 };
5162
5163 BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
5164 {
5165 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5166 }
5167
5168 static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
5169 .func = bpf_get_netns_cookie_sock_ops,
5170 .gpl_only = false,
5171 .ret_type = RET_INTEGER,
5172 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5173 };
5174
5175 BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
5176 {
5177 return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
5178 }
5179
5180 static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
5181 .func = bpf_get_netns_cookie_sk_msg,
5182 .gpl_only = false,
5183 .ret_type = RET_INTEGER,
5184 .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
5185 };
5186
5187 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
5188 {
5189 struct sock *sk = sk_to_full_sk(skb->sk);
5190 kuid_t kuid;
5191
5192 if (!sk || !sk_fullsock(sk))
5193 return overflowuid;
5194 kuid = sock_net_uid(sock_net(sk), sk);
5195 return from_kuid_munged(sock_net(sk)->user_ns, kuid);
5196 }
5197
5198 static const struct bpf_func_proto bpf_get_socket_uid_proto = {
5199 .func = bpf_get_socket_uid,
5200 .gpl_only = false,
5201 .ret_type = RET_INTEGER,
5202 .arg1_type = ARG_PTR_TO_CTX,
5203 };
5204
5205 static int sol_socket_sockopt(struct sock *sk, int optname,
5206 char *optval, int *optlen,
5207 bool getopt)
5208 {
5209 switch (optname) {
5210 case SO_REUSEADDR:
5211 case SO_SNDBUF:
5212 case SO_RCVBUF:
5213 case SO_KEEPALIVE:
5214 case SO_PRIORITY:
5215 case SO_REUSEPORT:
5216 case SO_RCVLOWAT:
5217 case SO_MARK:
5218 case SO_MAX_PACING_RATE:
5219 case SO_BINDTOIFINDEX:
5220 case SO_TXREHASH:
5221 if (*optlen != sizeof(int))
5222 return -EINVAL;
5223 break;
5224 case SO_BINDTODEVICE:
5225 break;
5226 default:
5227 return -EINVAL;
5228 }
5229
5230 if (getopt) {
5231 if (optname == SO_BINDTODEVICE)
5232 return -EINVAL;
5233 return sk_getsockopt(sk, SOL_SOCKET, optname,
5234 KERNEL_SOCKPTR(optval),
5235 KERNEL_SOCKPTR(optlen));
5236 }
5237
5238 return sk_setsockopt(sk, SOL_SOCKET, optname,
5239 KERNEL_SOCKPTR(optval), *optlen);
5240 }
5241
5242 static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
5243 char *optval, int optlen)
5244 {
5245 struct tcp_sock *tp = tcp_sk(sk);
5246 unsigned long timeout;
5247 int val;
5248
5249 if (optlen != sizeof(int))
5250 return -EINVAL;
5251
5252 val = *(int *)optval;
5253
5254 /* Only some options are supported */
5255 switch (optname) {
5256 case TCP_BPF_IW:
5257 if (val <= 0 || tp->data_segs_out > tp->syn_data)
5258 return -EINVAL;
5259 tcp_snd_cwnd_set(tp, val);
5260 break;
5261 case TCP_BPF_SNDCWND_CLAMP:
5262 if (val <= 0)
5263 return -EINVAL;
5264 tp->snd_cwnd_clamp = val;
5265 tp->snd_ssthresh = val;
5266 break;
5267 case TCP_BPF_DELACK_MAX:
5268 timeout = usecs_to_jiffies(val);
5269 if (timeout > TCP_DELACK_MAX ||
5270 timeout < TCP_TIMEOUT_MIN)
5271 return -EINVAL;
5272 inet_csk(sk)->icsk_delack_max = timeout;
5273 break;
5274 case TCP_BPF_RTO_MIN:
5275 timeout = usecs_to_jiffies(val);
5276 if (timeout > TCP_RTO_MIN ||
5277 timeout < TCP_TIMEOUT_MIN)
5278 return -EINVAL;
5279 inet_csk(sk)->icsk_rto_min = timeout;
5280 break;
5281 default:
5282 return -EINVAL;
5283 }
5284
5285 return 0;
5286 }
5287
5288 static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
5289 int *optlen, bool getopt)
5290 {
5291 struct tcp_sock *tp;
5292 int ret;
5293
5294 if (*optlen < 2)
5295 return -EINVAL;
5296
5297 if (getopt) {
5298 if (!inet_csk(sk)->icsk_ca_ops)
5299 return -EINVAL;
5300 /* BPF expects NULL-terminated tcp-cc string */
5301 optval[--(*optlen)] = '\0';
5302 return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
5303 KERNEL_SOCKPTR(optval),
5304 KERNEL_SOCKPTR(optlen));
5305 }
5306
5307 /* "cdg" is the only cc that alloc a ptr
5308 * in inet_csk_ca area. The bpf-tcp-cc may
5309 * overwrite this ptr after switching to cdg.
5310 */
5311 if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
5312 return -ENOTSUPP;
5313
5314 /* This check stops the following loop:
5315 *
5316 * .init => bpf_setsockopt(tcp_cc) => .init =>
5317 * bpf_setsockopt(tcp_cc) => .init => ....
5318 *
5319 * The second bpf_setsockopt(tcp_cc) is not allowed
5320 * in order to break the loop when both .init
5321 * are the same bpf prog.
5322 *
5323 * This applies even if the second bpf_setsockopt(tcp_cc)
5324 * does not cause a loop. It means that only the first
5325 * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
5326 * pick a fallback cc (e.g. when the peer does not support
5327 * ECN), and the second '.init' cannot fall back to
5328 * yet another cc.
5329 */
5330 tp = tcp_sk(sk);
5331 if (tp->bpf_chg_cc_inprogress)
5332 return -EBUSY;
5333
5334 tp->bpf_chg_cc_inprogress = 1;
5335 ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
5336 KERNEL_SOCKPTR(optval), *optlen);
5337 tp->bpf_chg_cc_inprogress = 0;
5338 return ret;
5339 }
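/* Illustrative sketch (not part of this file): a sockops program switching
 * congestion control ends up in sol_tcp_sockopt_congestion() above via
 * bpf_setsockopt(). The SOL_TCP fallback define and the "cubic" choice are
 * assumptions of the example; TCP_CONGESTION comes from the uapi tcp header
 * and libbpf's bpf_helpers.h is assumed.
 *
 *   #ifndef SOL_TCP
 *   #define SOL_TCP 6
 *   #endif
 *
 *   SEC("sockops")
 *   int pick_cc(struct bpf_sock_ops *skops)
 *   {
 *           char cc[] = "cubic";
 *
 *           if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
 *                   bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION,
 *                                  cc, sizeof(cc));
 *           return 1;
 *   }
 */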
5340
5341 static int sol_tcp_sockopt(struct sock *sk, int optname,
5342 char *optval, int *optlen,
5343 bool getopt)
5344 {
5345 if (sk->sk_protocol != IPPROTO_TCP)
5346 return -EINVAL;
5347
5348 switch (optname) {
5349 case TCP_NODELAY:
5350 case TCP_MAXSEG:
5351 case TCP_KEEPIDLE:
5352 case TCP_KEEPINTVL:
5353 case TCP_KEEPCNT:
5354 case TCP_SYNCNT:
5355 case TCP_WINDOW_CLAMP:
5356 case TCP_THIN_LINEAR_TIMEOUTS:
5357 case TCP_USER_TIMEOUT:
5358 case TCP_NOTSENT_LOWAT:
5359 case TCP_SAVE_SYN:
5360 if (*optlen != sizeof(int))
5361 return -EINVAL;
5362 break;
5363 case TCP_CONGESTION:
5364 return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
5365 case TCP_SAVED_SYN:
5366 if (*optlen < 1)
5367 return -EINVAL;
5368 break;
5369 default:
5370 if (getopt)
5371 return -EINVAL;
5372 return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
5373 }
5374
5375 if (getopt) {
5376 if (optname == TCP_SAVED_SYN) {
5377 struct tcp_sock *tp = tcp_sk(sk);
5378
5379 if (!tp->saved_syn ||
5380 *optlen > tcp_saved_syn_len(tp->saved_syn))
5381 return -EINVAL;
5382 memcpy(optval, tp->saved_syn->data, *optlen);
5383 /* It cannot free tp->saved_syn here because it
5384 * does not know if the user space still needs it.
5385 */
5386 return 0;
5387 }
5388
5389 return do_tcp_getsockopt(sk, SOL_TCP, optname,
5390 KERNEL_SOCKPTR(optval),
5391 KERNEL_SOCKPTR(optlen));
5392 }
5393
5394 return do_tcp_setsockopt(sk, SOL_TCP, optname,
5395 KERNEL_SOCKPTR(optval), *optlen);
5396 }
5397
5398 static int sol_ip_sockopt(struct sock *sk, int optname,
5399 char *optval, int *optlen,
5400 bool getopt)
5401 {
5402 if (sk->sk_family != AF_INET)
5403 return -EINVAL;
5404
5405 switch (optname) {
5406 case IP_TOS:
5407 if (*optlen != sizeof(int))
5408 return -EINVAL;
5409 break;
5410 default:
5411 return -EINVAL;
5412 }
5413
5414 if (getopt)
5415 return do_ip_getsockopt(sk, SOL_IP, optname,
5416 KERNEL_SOCKPTR(optval),
5417 KERNEL_SOCKPTR(optlen));
5418
5419 return do_ip_setsockopt(sk, SOL_IP, optname,
5420 KERNEL_SOCKPTR(optval), *optlen);
5421 }
5422
5423 static int sol_ipv6_sockopt(struct sock *sk, int optname,
5424 char *optval, int *optlen,
5425 bool getopt)
5426 {
5427 if (sk->sk_family != AF_INET6)
5428 return -EINVAL;
5429
5430 switch (optname) {
5431 case IPV6_TCLASS:
5432 case IPV6_AUTOFLOWLABEL:
5433 if (*optlen != sizeof(int))
5434 return -EINVAL;
5435 break;
5436 default:
5437 return -EINVAL;
5438 }
5439
5440 if (getopt)
5441 return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
5442 KERNEL_SOCKPTR(optval),
5443 KERNEL_SOCKPTR(optlen));
5444
5445 return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
5446 KERNEL_SOCKPTR(optval), *optlen);
5447 }
5448
5449 static int __bpf_setsockopt(struct sock *sk, int level, int optname,
5450 char *optval, int optlen)
5451 {
5452 if (!sk_fullsock(sk))
5453 return -EINVAL;
5454
5455 if (level == SOL_SOCKET)
5456 return sol_socket_sockopt(sk, optname, optval, &optlen, false);
5457 else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
5458 return sol_ip_sockopt(sk, optname, optval, &optlen, false);
5459 else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
5460 return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
5461 else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
5462 return sol_tcp_sockopt(sk, optname, optval, &optlen, false);
5463
5464 return -EINVAL;
5465 }
5466
5467 static int _bpf_setsockopt(struct sock *sk, int level, int optname,
5468 char *optval, int optlen)
5469 {
5470 if (sk_fullsock(sk))
5471 sock_owned_by_me(sk);
5472 return __bpf_setsockopt(sk, level, optname, optval, optlen);
5473 }
5474
5475 static int __bpf_getsockopt(struct sock *sk, int level, int optname,
5476 char *optval, int optlen)
5477 {
5478 int err, saved_optlen = optlen;
5479
5480 if (!sk_fullsock(sk)) {
5481 err = -EINVAL;
5482 goto done;
5483 }
5484
5485 if (level == SOL_SOCKET)
5486 err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
5487 else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
5488 err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
5489 else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
5490 err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
5491 else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
5492 err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
5493 else
5494 err = -EINVAL;
5495
5496 done:
5497 if (err)
5498 optlen = 0;
5499 if (optlen < saved_optlen)
5500 memset(optval + optlen, 0, saved_optlen - optlen);
5501 return err;
5502 }
5503
5504 static int _bpf_getsockopt(struct sock *sk, int level, int optname,
5505 char *optval, int optlen)
5506 {
5507 if (sk_fullsock(sk))
5508 sock_owned_by_me(sk);
5509 return __bpf_getsockopt(sk, level, optname, optval, optlen);
5510 }
5511
5512 BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
5513 int, optname, char *, optval, int, optlen)
5514 {
5515 return _bpf_setsockopt(sk, level, optname, optval, optlen);
5516 }
5517
5518 const struct bpf_func_proto bpf_sk_setsockopt_proto = {
5519 .func = bpf_sk_setsockopt,
5520 .gpl_only = false,
5521 .ret_type = RET_INTEGER,
5522 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5523 .arg2_type = ARG_ANYTHING,
5524 .arg3_type = ARG_ANYTHING,
5525 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5526 .arg5_type = ARG_CONST_SIZE,
5527 };
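
/* Illustrative BPF-program-side use of this helper (a sketch only, not part
 * of this file; availability depends on the program/attach type):
 *
 *	int one = 1;
 *
 *	if (bpf_sk_setsockopt(sk, SOL_SOCKET, SO_KEEPALIVE, &one, sizeof(one)))
 *		return 0;	// option could not be set
 */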
5528
5529 BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
5530 int, optname, char *, optval, int, optlen)
5531 {
5532 return _bpf_getsockopt(sk, level, optname, optval, optlen);
5533 }
5534
5535 const struct bpf_func_proto bpf_sk_getsockopt_proto = {
5536 .func = bpf_sk_getsockopt,
5537 .gpl_only = false,
5538 .ret_type = RET_INTEGER,
5539 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5540 .arg2_type = ARG_ANYTHING,
5541 .arg3_type = ARG_ANYTHING,
5542 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5543 .arg5_type = ARG_CONST_SIZE,
5544 };
5545
5546 BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
5547 int, optname, char *, optval, int, optlen)
5548 {
5549 return __bpf_setsockopt(sk, level, optname, optval, optlen);
5550 }
5551
5552 const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
5553 .func = bpf_unlocked_sk_setsockopt,
5554 .gpl_only = false,
5555 .ret_type = RET_INTEGER,
5556 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5557 .arg2_type = ARG_ANYTHING,
5558 .arg3_type = ARG_ANYTHING,
5559 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5560 .arg5_type = ARG_CONST_SIZE,
5561 };
5562
5563 BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
5564 int, optname, char *, optval, int, optlen)
5565 {
5566 return __bpf_getsockopt(sk, level, optname, optval, optlen);
5567 }
5568
5569 const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
5570 .func = bpf_unlocked_sk_getsockopt,
5571 .gpl_only = false,
5572 .ret_type = RET_INTEGER,
5573 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
5574 .arg2_type = ARG_ANYTHING,
5575 .arg3_type = ARG_ANYTHING,
5576 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5577 .arg5_type = ARG_CONST_SIZE,
5578 };
5579
5580 BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
5581 int, level, int, optname, char *, optval, int, optlen)
5582 {
5583 return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
5584 }
5585
5586 static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
5587 .func = bpf_sock_addr_setsockopt,
5588 .gpl_only = false,
5589 .ret_type = RET_INTEGER,
5590 .arg1_type = ARG_PTR_TO_CTX,
5591 .arg2_type = ARG_ANYTHING,
5592 .arg3_type = ARG_ANYTHING,
5593 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5594 .arg5_type = ARG_CONST_SIZE,
5595 };
5596
5597 BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
5598 int, level, int, optname, char *, optval, int, optlen)
5599 {
5600 return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
5601 }
5602
5603 static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
5604 .func = bpf_sock_addr_getsockopt,
5605 .gpl_only = false,
5606 .ret_type = RET_INTEGER,
5607 .arg1_type = ARG_PTR_TO_CTX,
5608 .arg2_type = ARG_ANYTHING,
5609 .arg3_type = ARG_ANYTHING,
5610 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5611 .arg5_type = ARG_CONST_SIZE,
5612 };
5613
5614 BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
5615 int, level, int, optname, char *, optval, int, optlen)
5616 {
5617 return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
5618 }
5619
5620 static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
5621 .func = bpf_sock_ops_setsockopt,
5622 .gpl_only = false,
5623 .ret_type = RET_INTEGER,
5624 .arg1_type = ARG_PTR_TO_CTX,
5625 .arg2_type = ARG_ANYTHING,
5626 .arg3_type = ARG_ANYTHING,
5627 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5628 .arg5_type = ARG_CONST_SIZE,
5629 };
5630
5631 static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
5632 int optname, const u8 **start)
5633 {
5634 struct sk_buff *syn_skb = bpf_sock->syn_skb;
5635 const u8 *hdr_start;
5636 int ret;
5637
5638 if (syn_skb) {
5639 /* sk is a request_sock here */
5640
5641 if (optname == TCP_BPF_SYN) {
5642 hdr_start = syn_skb->data;
5643 ret = tcp_hdrlen(syn_skb);
5644 } else if (optname == TCP_BPF_SYN_IP) {
5645 hdr_start = skb_network_header(syn_skb);
5646 ret = skb_network_header_len(syn_skb) +
5647 tcp_hdrlen(syn_skb);
5648 } else {
5649 /* optname == TCP_BPF_SYN_MAC */
5650 hdr_start = skb_mac_header(syn_skb);
5651 ret = skb_mac_header_len(syn_skb) +
5652 skb_network_header_len(syn_skb) +
5653 tcp_hdrlen(syn_skb);
5654 }
5655 } else {
5656 struct sock *sk = bpf_sock->sk;
5657 struct saved_syn *saved_syn;
5658
5659 if (sk->sk_state == TCP_NEW_SYN_RECV)
5660 /* synack retransmit; bpf_sock->syn_skb will
5661 * not be available, so fall back to
5662 * saved_syn (if the SYN was saved).
5663 */
5664 saved_syn = inet_reqsk(sk)->saved_syn;
5665 else
5666 saved_syn = tcp_sk(sk)->saved_syn;
5667
5668 if (!saved_syn)
5669 return -ENOENT;
5670
5671 if (optname == TCP_BPF_SYN) {
5672 hdr_start = saved_syn->data +
5673 saved_syn->mac_hdrlen +
5674 saved_syn->network_hdrlen;
5675 ret = saved_syn->tcp_hdrlen;
5676 } else if (optname == TCP_BPF_SYN_IP) {
5677 hdr_start = saved_syn->data +
5678 saved_syn->mac_hdrlen;
5679 ret = saved_syn->network_hdrlen +
5680 saved_syn->tcp_hdrlen;
5681 } else {
5682 /* optname == TCP_BPF_SYN_MAC */
5683
5684 /* TCP_SAVE_SYN may not have saved the mac hdr */
5685 if (!saved_syn->mac_hdrlen)
5686 return -ENOENT;
5687
5688 hdr_start = saved_syn->data;
5689 ret = saved_syn->mac_hdrlen +
5690 saved_syn->network_hdrlen +
5691 saved_syn->tcp_hdrlen;
5692 }
5693 }
5694
5695 *start = hdr_start;
5696 return ret;
5697 }
5698
5699 BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
5700 int, level, int, optname, char *, optval, int, optlen)
5701 {
5702 if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
5703 optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
5704 int ret, copy_len = 0;
5705 const u8 *start;
5706
5707 ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
5708 if (ret > 0) {
5709 copy_len = ret;
5710 if (optlen < copy_len) {
5711 copy_len = optlen;
5712 ret = -ENOSPC;
5713 }
5714
5715 memcpy(optval, start, copy_len);
5716 }
5717
5718 /* Zero out unused buffer at the end */
5719 memset(optval + copy_len, 0, optlen - copy_len);
5720
5721 return ret;
5722 }
5723
5724 return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
5725 }
5726
5727 static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
5728 .func = bpf_sock_ops_getsockopt,
5729 .gpl_only = false,
5730 .ret_type = RET_INTEGER,
5731 .arg1_type = ARG_PTR_TO_CTX,
5732 .arg2_type = ARG_ANYTHING,
5733 .arg3_type = ARG_ANYTHING,
5734 .arg4_type = ARG_PTR_TO_UNINIT_MEM,
5735 .arg5_type = ARG_CONST_SIZE,
5736 };
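
/* Illustrative sockops-side read of the saved SYN headers handled above
 * (a sketch only; the buffer size is an arbitrary choice):
 *
 *	char syn[64];
 *	int ret;
 *
 *	ret = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN_IP, syn, sizeof(syn));
 *	// ret < 0 on error; -ENOSPC means syn[] was too small and only a
 *	// truncated copy was written, with the remainder zeroed.
 */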
5737
5738 BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
5739 int, argval)
5740 {
5741 struct sock *sk = bpf_sock->sk;
5742 int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
5743
5744 if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
5745 return -EINVAL;
5746
5747 tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
5748
5749 return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
5750 }
5751
5752 static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
5753 .func = bpf_sock_ops_cb_flags_set,
5754 .gpl_only = false,
5755 .ret_type = RET_INTEGER,
5756 .arg1_type = ARG_PTR_TO_CTX,
5757 .arg2_type = ARG_ANYTHING,
5758 };
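
/* Illustrative sockops-side use (a sketch only): request RTO callbacks; a
 * non-zero return reports the requested flag bits that are not supported:
 *
 *	int unsupported;
 *
 *	unsupported = bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_RTO_CB_FLAG);
 */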
5759
5760 const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
5761 EXPORT_SYMBOL_GPL(ipv6_bpf_stub);
5762
5763 BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
5764 int, addr_len)
5765 {
5766 #ifdef CONFIG_INET
5767 struct sock *sk = ctx->sk;
5768 u32 flags = BIND_FROM_BPF;
5769 int err;
5770
5771 err = -EINVAL;
5772 if (addr_len < offsetofend(struct sockaddr, sa_family))
5773 return err;
5774 if (addr->sa_family == AF_INET) {
5775 if (addr_len < sizeof(struct sockaddr_in))
5776 return err;
5777 if (((struct sockaddr_in *)addr)->sin_port == htons(0))
5778 flags |= BIND_FORCE_ADDRESS_NO_PORT;
5779 return __inet_bind(sk, addr, addr_len, flags);
5780 #if IS_ENABLED(CONFIG_IPV6)
5781 } else if (addr->sa_family == AF_INET6) {
5782 if (addr_len < SIN6_LEN_RFC2133)
5783 return err;
5784 if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
5785 flags |= BIND_FORCE_ADDRESS_NO_PORT;
5786 /* ipv6_bpf_stub cannot be NULL, since it's called from
5787 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
5788 */
5789 return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
5790 #endif /* CONFIG_IPV6 */
5791 }
5792 #endif /* CONFIG_INET */
5793
5794 return -EAFNOSUPPORT;
5795 }
5796
5797 static const struct bpf_func_proto bpf_bind_proto = {
5798 .func = bpf_bind,
5799 .gpl_only = false,
5800 .ret_type = RET_INTEGER,
5801 .arg1_type = ARG_PTR_TO_CTX,
5802 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
5803 .arg3_type = ARG_CONST_SIZE,
5804 };
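
/* Illustrative cgroup/connect4-side use of bpf_bind() (a sketch only; the
 * address below is an arbitrary example):
 *
 *	struct sockaddr_in sa = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = bpf_htonl(0x0a000001),	// 10.0.0.1
 *	};
 *
 *	bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa));
 *	// sin_port is 0, so BIND_FORCE_ADDRESS_NO_PORT is applied and the
 *	// source port is picked later, at connect() time.
 */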
5805
5806 #ifdef CONFIG_XFRM
5807
5808 #if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
5809 (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
5810
5811 struct metadata_dst __percpu *xfrm_bpf_md_dst;
5812 EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);
5813
5814 #endif
5815
5816 BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
5817 struct bpf_xfrm_state *, to, u32, size, u64, flags)
5818 {
5819 const struct sec_path *sp = skb_sec_path(skb);
5820 const struct xfrm_state *x;
5821
5822 if (!sp || unlikely(index >= sp->len || flags))
5823 goto err_clear;
5824
5825 x = sp->xvec[index];
5826
5827 if (unlikely(size != sizeof(struct bpf_xfrm_state)))
5828 goto err_clear;
5829
5830 to->reqid = x->props.reqid;
5831 to->spi = x->id.spi;
5832 to->family = x->props.family;
5833 to->ext = 0;
5834
5835 if (to->family == AF_INET6) {
5836 memcpy(to->remote_ipv6, x->props.saddr.a6,
5837 sizeof(to->remote_ipv6));
5838 } else {
5839 to->remote_ipv4 = x->props.saddr.a4;
5840 memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
5841 }
5842
5843 return 0;
5844 err_clear:
5845 memset(to, 0, size);
5846 return -EINVAL;
5847 }
5848
5849 static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
5850 .func = bpf_skb_get_xfrm_state,
5851 .gpl_only = false,
5852 .ret_type = RET_INTEGER,
5853 .arg1_type = ARG_PTR_TO_CTX,
5854 .arg2_type = ARG_ANYTHING,
5855 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
5856 .arg4_type = ARG_CONST_SIZE,
5857 .arg5_type = ARG_ANYTHING,
5858 };
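
/* Illustrative tc/skb-side use (a sketch only): read the xfrm state of the
 * first sec_path entry; size and flags are strictly checked above.
 *
 *	struct bpf_xfrm_state xs;
 *
 *	if (!bpf_skb_get_xfrm_state(skb, 0, &xs, sizeof(xs), 0))
 *		// xs.spi, xs.reqid and xs.remote_ipv4/6 are now usable
 */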
5859 #endif
5860
5861 #if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
5862 static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
5863 {
5864 params->h_vlan_TCI = 0;
5865 params->h_vlan_proto = 0;
5866 if (mtu)
5867 params->mtu_result = mtu; /* union with tot_len */
5868
5869 return 0;
5870 }
5871 #endif
5872
5873 #if IS_ENABLED(CONFIG_INET)
5874 static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
5875 u32 flags, bool check_mtu)
5876 {
5877 struct fib_nh_common *nhc;
5878 struct in_device *in_dev;
5879 struct neighbour *neigh;
5880 struct net_device *dev;
5881 struct fib_result res;
5882 struct flowi4 fl4;
5883 u32 mtu = 0;
5884 int err;
5885
5886 dev = dev_get_by_index_rcu(net, params->ifindex);
5887 if (unlikely(!dev))
5888 return -ENODEV;
5889
5890 /* verify forwarding is enabled on this interface */
5891 in_dev = __in_dev_get_rcu(dev);
5892 if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
5893 return BPF_FIB_LKUP_RET_FWD_DISABLED;
5894
5895 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
5896 fl4.flowi4_iif = 1;
5897 fl4.flowi4_oif = params->ifindex;
5898 } else {
5899 fl4.flowi4_iif = params->ifindex;
5900 fl4.flowi4_oif = 0;
5901 }
5902 fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
5903 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
5904 fl4.flowi4_flags = 0;
5905
5906 fl4.flowi4_proto = params->l4_protocol;
5907 fl4.daddr = params->ipv4_dst;
5908 fl4.saddr = params->ipv4_src;
5909 fl4.fl4_sport = params->sport;
5910 fl4.fl4_dport = params->dport;
5911 fl4.flowi4_multipath_hash = 0;
5912
5913 if (flags & BPF_FIB_LOOKUP_DIRECT) {
5914 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
5915 struct fib_table *tb;
5916
5917 if (flags & BPF_FIB_LOOKUP_TBID) {
5918 tbid = params->tbid;
5919 /* zero out for vlan output */
5920 params->tbid = 0;
5921 }
5922
5923 tb = fib_get_table(net, tbid);
5924 if (unlikely(!tb))
5925 return BPF_FIB_LKUP_RET_NOT_FWDED;
5926
5927 err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
5928 } else {
5929 fl4.flowi4_mark = 0;
5930 fl4.flowi4_secid = 0;
5931 fl4.flowi4_tun_key.tun_id = 0;
5932 fl4.flowi4_uid = sock_net_uid(net, NULL);
5933
5934 err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
5935 }
5936
5937 if (err) {
5938 /* map fib lookup errors to RTN_ type */
5939 if (err == -EINVAL)
5940 return BPF_FIB_LKUP_RET_BLACKHOLE;
5941 if (err == -EHOSTUNREACH)
5942 return BPF_FIB_LKUP_RET_UNREACHABLE;
5943 if (err == -EACCES)
5944 return BPF_FIB_LKUP_RET_PROHIBIT;
5945
5946 return BPF_FIB_LKUP_RET_NOT_FWDED;
5947 }
5948
5949 if (res.type != RTN_UNICAST)
5950 return BPF_FIB_LKUP_RET_NOT_FWDED;
5951
5952 if (fib_info_num_path(res.fi) > 1)
5953 fib_select_path(net, &res, &fl4, NULL);
5954
5955 if (check_mtu) {
5956 mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
5957 if (params->tot_len > mtu) {
5958 params->mtu_result = mtu; /* union with tot_len */
5959 return BPF_FIB_LKUP_RET_FRAG_NEEDED;
5960 }
5961 }
5962
5963 nhc = res.nhc;
5964
5965 /* do not handle lwt encaps right now */
5966 if (nhc->nhc_lwtstate)
5967 return BPF_FIB_LKUP_RET_UNSUPP_LWT;
5968
5969 dev = nhc->nhc_dev;
5970
5971 params->rt_metric = res.fi->fib_priority;
5972 params->ifindex = dev->ifindex;
5973
5974 if (flags & BPF_FIB_LOOKUP_SRC)
5975 params->ipv4_src = fib_result_prefsrc(net, &res);
5976
5977 /* xdp and cls_bpf programs are run in RCU-bh so
5978 * rcu_read_lock_bh is not needed here
5979 */
5980 if (likely(nhc->nhc_gw_family != AF_INET6)) {
5981 if (nhc->nhc_gw_family)
5982 params->ipv4_dst = nhc->nhc_gw.ipv4;
5983 } else {
5984 struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
5985
5986 params->family = AF_INET6;
5987 *dst = nhc->nhc_gw.ipv6;
5988 }
5989
5990 if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
5991 goto set_fwd_params;
5992
5993 if (likely(nhc->nhc_gw_family != AF_INET6))
5994 neigh = __ipv4_neigh_lookup_noref(dev,
5995 (__force u32)params->ipv4_dst);
5996 else
5997 neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
5998
5999 if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
6000 return BPF_FIB_LKUP_RET_NO_NEIGH;
6001 memcpy(params->dmac, neigh->ha, ETH_ALEN);
6002 memcpy(params->smac, dev->dev_addr, ETH_ALEN);
6003
6004 set_fwd_params:
6005 return bpf_fib_set_fwd_params(params, mtu);
6006 }
6007 #endif
6008
6009 #if IS_ENABLED(CONFIG_IPV6)
6010 static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
6011 u32 flags, bool check_mtu)
6012 {
6013 struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
6014 struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
6015 struct fib6_result res = {};
6016 struct neighbour *neigh;
6017 struct net_device *dev;
6018 struct inet6_dev *idev;
6019 struct flowi6 fl6;
6020 int strict = 0;
6021 int oif, err;
6022 u32 mtu = 0;
6023
6024 /* link local addresses are never forwarded */
6025 if (rt6_need_strict(dst) || rt6_need_strict(src))
6026 return BPF_FIB_LKUP_RET_NOT_FWDED;
6027
6028 dev = dev_get_by_index_rcu(net, params->ifindex);
6029 if (unlikely(!dev))
6030 return -ENODEV;
6031
6032 idev = __in6_dev_get_safely(dev);
6033 if (unlikely(!idev || !idev->cnf.forwarding))
6034 return BPF_FIB_LKUP_RET_FWD_DISABLED;
6035
6036 if (flags & BPF_FIB_LOOKUP_OUTPUT) {
6037 fl6.flowi6_iif = 1;
6038 oif = fl6.flowi6_oif = params->ifindex;
6039 } else {
6040 oif = fl6.flowi6_iif = params->ifindex;
6041 fl6.flowi6_oif = 0;
6042 strict = RT6_LOOKUP_F_HAS_SADDR;
6043 }
6044 fl6.flowlabel = params->flowinfo;
6045 fl6.flowi6_scope = 0;
6046 fl6.flowi6_flags = 0;
6047 fl6.mp_hash = 0;
6048
6049 fl6.flowi6_proto = params->l4_protocol;
6050 fl6.daddr = *dst;
6051 fl6.saddr = *src;
6052 fl6.fl6_sport = params->sport;
6053 fl6.fl6_dport = params->dport;
6054
6055 if (flags & BPF_FIB_LOOKUP_DIRECT) {
6056 u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
6057 struct fib6_table *tb;
6058
6059 if (flags & BPF_FIB_LOOKUP_TBID) {
6060 tbid = params->tbid;
6061 /* zero out for vlan output */
6062 params->tbid = 0;
6063 }
6064
6065 tb = ipv6_stub->fib6_get_table(net, tbid);
6066 if (unlikely(!tb))
6067 return BPF_FIB_LKUP_RET_NOT_FWDED;
6068
6069 err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
6070 strict);
6071 } else {
6072 fl6.flowi6_mark = 0;
6073 fl6.flowi6_secid = 0;
6074 fl6.flowi6_tun_key.tun_id = 0;
6075 fl6.flowi6_uid = sock_net_uid(net, NULL);
6076
6077 err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
6078 }
6079
6080 if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
6081 res.f6i == net->ipv6.fib6_null_entry))
6082 return BPF_FIB_LKUP_RET_NOT_FWDED;
6083
6084 switch (res.fib6_type) {
6085 /* only unicast is forwarded */
6086 case RTN_UNICAST:
6087 break;
6088 case RTN_BLACKHOLE:
6089 return BPF_FIB_LKUP_RET_BLACKHOLE;
6090 case RTN_UNREACHABLE:
6091 return BPF_FIB_LKUP_RET_UNREACHABLE;
6092 case RTN_PROHIBIT:
6093 return BPF_FIB_LKUP_RET_PROHIBIT;
6094 default:
6095 return BPF_FIB_LKUP_RET_NOT_FWDED;
6096 }
6097
6098 ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
6099 fl6.flowi6_oif != 0, NULL, strict);
6100
6101 if (check_mtu) {
6102 mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
6103 if (params->tot_len > mtu) {
6104 params->mtu_result = mtu; /* union with tot_len */
6105 return BPF_FIB_LKUP_RET_FRAG_NEEDED;
6106 }
6107 }
6108
6109 if (res.nh->fib_nh_lws)
6110 return BPF_FIB_LKUP_RET_UNSUPP_LWT;
6111
6112 if (res.nh->fib_nh_gw_family)
6113 *dst = res.nh->fib_nh_gw6;
6114
6115 dev = res.nh->fib_nh_dev;
6116 params->rt_metric = res.f6i->fib6_metric;
6117 params->ifindex = dev->ifindex;
6118
6119 if (flags & BPF_FIB_LOOKUP_SRC) {
6120 if (res.f6i->fib6_prefsrc.plen) {
6121 *src = res.f6i->fib6_prefsrc.addr;
6122 } else {
6123 err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
6124 &fl6.daddr, 0,
6125 src);
6126 if (err)
6127 return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
6128 }
6129 }
6130
6131 if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
6132 goto set_fwd_params;
6133
6134 /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
6135 * not needed here.
6136 */
6137 neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
6138 if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
6139 return BPF_FIB_LKUP_RET_NO_NEIGH;
6140 memcpy(params->dmac, neigh->ha, ETH_ALEN);
6141 memcpy(params->smac, dev->dev_addr, ETH_ALEN);
6142
6143 set_fwd_params:
6144 return bpf_fib_set_fwd_params(params, mtu);
6145 }
6146 #endif
6147
6148 #define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
6149 BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
6150 BPF_FIB_LOOKUP_SRC)
6151
6152 BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
6153 struct bpf_fib_lookup *, params, int, plen, u32, flags)
6154 {
6155 if (plen < sizeof(*params))
6156 return -EINVAL;
6157
6158 if (flags & ~BPF_FIB_LOOKUP_MASK)
6159 return -EINVAL;
6160
6161 switch (params->family) {
6162 #if IS_ENABLED(CONFIG_INET)
6163 case AF_INET:
6164 return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
6165 flags, true);
6166 #endif
6167 #if IS_ENABLED(CONFIG_IPV6)
6168 case AF_INET6:
6169 return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
6170 flags, true);
6171 #endif
6172 }
6173 return -EAFNOSUPPORT;
6174 }
6175
6176 static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
6177 .func = bpf_xdp_fib_lookup,
6178 .gpl_only = true,
6179 .ret_type = RET_INTEGER,
6180 .arg1_type = ARG_PTR_TO_CTX,
6181 .arg2_type = ARG_PTR_TO_MEM,
6182 .arg3_type = ARG_CONST_SIZE,
6183 .arg4_type = ARG_ANYTHING,
6184 };
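
/* Illustrative XDP-side use (a sketch only, in the spirit of the xdp_fwd
 * sample; eth/iph are assumed to be parsed and bounds-checked pointers):
 *
 *	struct bpf_fib_lookup fib = {};
 *	int rc;
 *
 *	fib.family	= AF_INET;
 *	fib.ipv4_src	= iph->saddr;
 *	fib.ipv4_dst	= iph->daddr;
 *	fib.ifindex	= ctx->ingress_ifindex;
 *	fib.tot_len	= bpf_ntohs(iph->tot_len);
 *
 *	rc = bpf_fib_lookup(ctx, &fib, sizeof(fib), 0);
 *	if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
 *		__builtin_memcpy(eth->h_dest, fib.dmac, ETH_ALEN);
 *		__builtin_memcpy(eth->h_source, fib.smac, ETH_ALEN);
 *		return bpf_redirect(fib.ifindex, 0);
 *	}
 */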
6185
6186 BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
6187 struct bpf_fib_lookup *, params, int, plen, u32, flags)
6188 {
6189 struct net *net = dev_net(skb->dev);
6190 int rc = -EAFNOSUPPORT;
6191 bool check_mtu = false;
6192
6193 if (plen < sizeof(*params))
6194 return -EINVAL;
6195
6196 if (flags & ~BPF_FIB_LOOKUP_MASK)
6197 return -EINVAL;
6198
6199 if (params->tot_len)
6200 check_mtu = true;
6201
6202 switch (params->family) {
6203 #if IS_ENABLED(CONFIG_INET)
6204 case AF_INET:
6205 rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
6206 break;
6207 #endif
6208 #if IS_ENABLED(CONFIG_IPV6)
6209 case AF_INET6:
6210 rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
6211 break;
6212 #endif
6213 }
6214
6215 if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
6216 struct net_device *dev;
6217
6218 /* When tot_len isn't provided by the user, check the skb
6219 * against the MTU of the net_device returned by the FIB lookup
6220 */
6221 dev = dev_get_by_index_rcu(net, params->ifindex);
6222 if (!is_skb_forwardable(dev, skb))
6223 rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
6224
6225 params->mtu_result = dev->mtu; /* union with tot_len */
6226 }
6227
6228 return rc;
6229 }
6230
6231 static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
6232 .func = bpf_skb_fib_lookup,
6233 .gpl_only = true,
6234 .ret_type = RET_INTEGER,
6235 .arg1_type = ARG_PTR_TO_CTX,
6236 .arg2_type = ARG_PTR_TO_MEM,
6237 .arg3_type = ARG_CONST_SIZE,
6238 .arg4_type = ARG_ANYTHING,
6239 };
6240
6241 static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
6242 u32 ifindex)
6243 {
6244 struct net *netns = dev_net(dev_curr);
6245
6246 /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
6247 if (ifindex == 0)
6248 return dev_curr;
6249
6250 return dev_get_by_index_rcu(netns, ifindex);
6251 }
6252
6253 BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
6254 u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
6255 {
6256 int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
6257 struct net_device *dev = skb->dev;
6258 int mtu, dev_len, skb_len;
6259
6260 if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
6261 return -EINVAL;
6262 if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
6263 return -EINVAL;
6264
6265 dev = __dev_via_ifindex(dev, ifindex);
6266 if (unlikely(!dev))
6267 return -ENODEV;
6268
6269 mtu = READ_ONCE(dev->mtu);
6270 dev_len = mtu + dev->hard_header_len;
6271
6272 /* If *mtu_len is set, use it as the input L3 length, i.e. iph->tot_len (like fib_lookup) */
6273 skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
6274
6275 skb_len += len_diff; /* minus result pass check */
6276 if (skb_len <= dev_len) {
6277 ret = BPF_MTU_CHK_RET_SUCCESS;
6278 goto out;
6279 }
6280 /* At this point, skb->len exceeds the MTU, but as it includes the length
6281 * of all segments, it can still be below the MTU. The SKB can possibly get
6282 * re-segmented in the transmit path (see validate_xmit_skb). Thus, the user
6283 * must choose whether the segs are to be MTU checked.
6284 */
6285 if (skb_is_gso(skb)) {
6286 ret = BPF_MTU_CHK_RET_SUCCESS;
6287 if (flags & BPF_MTU_CHK_SEGS &&
6288 !skb_gso_validate_network_len(skb, mtu))
6289 ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
6290 }
6291 out:
6292 *mtu_len = mtu;
6293 return ret;
6294 }
6295
6296 BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
6297 u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
6298 {
6299 struct net_device *dev = xdp->rxq->dev;
6300 int xdp_len = xdp->data_end - xdp->data;
6301 int ret = BPF_MTU_CHK_RET_SUCCESS;
6302 int mtu, dev_len;
6303
6304 /* XDP variant doesn't support multi-buffer segment check (yet) */
6305 if (unlikely(flags))
6306 return -EINVAL;
6307
6308 dev = __dev_via_ifindex(dev, ifindex);
6309 if (unlikely(!dev))
6310 return -ENODEV;
6311
6312 mtu = READ_ONCE(dev->mtu);
6313 dev_len = mtu + dev->hard_header_len;
6314
6315 /* If set, use *mtu_len as the input L3 length, i.e. iph->tot_len (like fib_lookup) */
6316 if (*mtu_len)
6317 xdp_len = *mtu_len + dev->hard_header_len;
6318
6319 xdp_len += len_diff; /* minus result pass check */
6320 if (xdp_len > dev_len)
6321 ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
6322
6323 *mtu_len = mtu;
6324 return ret;
6325 }
6326
6327 static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
6328 .func = bpf_skb_check_mtu,
6329 .gpl_only = true,
6330 .ret_type = RET_INTEGER,
6331 .arg1_type = ARG_PTR_TO_CTX,
6332 .arg2_type = ARG_ANYTHING,
6333 .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
6334 .arg3_size = sizeof(u32),
6335 .arg4_type = ARG_ANYTHING,
6336 .arg5_type = ARG_ANYTHING,
6337 };
6338
6339 static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
6340 .func = bpf_xdp_check_mtu,
6341 .gpl_only = true,
6342 .ret_type = RET_INTEGER,
6343 .arg1_type = ARG_PTR_TO_CTX,
6344 .arg2_type = ARG_ANYTHING,
6345 .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
6346 .arg3_size = sizeof(u32),
6347 .arg4_type = ARG_ANYTHING,
6348 .arg5_type = ARG_ANYTHING,
6349 };
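
/* Illustrative use of the MTU check helpers (a sketch only): ifindex == 0
 * means "current device", and *mtu_len returns the device MTU:
 *
 *	__u32 mtu_len = 0;
 *
 *	if (bpf_check_mtu(ctx, 0, &mtu_len, 0, 0))
 *		// non-zero: the packet would exceed the MTU (or an error
 *		// occurred); on the MTU path, mtu_len now holds that MTU
 */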
6350
6351 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6352 static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
6353 {
6354 int err;
6355 struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
6356
6357 if (!seg6_validate_srh(srh, len, false))
6358 return -EINVAL;
6359
6360 switch (type) {
6361 case BPF_LWT_ENCAP_SEG6_INLINE:
6362 if (skb->protocol != htons(ETH_P_IPV6))
6363 return -EBADMSG;
6364
6365 err = seg6_do_srh_inline(skb, srh);
6366 break;
6367 case BPF_LWT_ENCAP_SEG6:
6368 skb_reset_inner_headers(skb);
6369 skb->encapsulation = 1;
6370 err = seg6_do_srh_encap(skb, srh, IPPROTO_IPV6);
6371 break;
6372 default:
6373 return -EINVAL;
6374 }
6375
6376 bpf_compute_data_pointers(skb);
6377 if (err)
6378 return err;
6379
6380 skb_set_transport_header(skb, sizeof(struct ipv6hdr));
6381
6382 return seg6_lookup_nexthop(skb, NULL, 0);
6383 }
6384 #endif /* CONFIG_IPV6_SEG6_BPF */
6385
6386 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
6387 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
6388 bool ingress)
6389 {
6390 return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
6391 }
6392 #endif
6393
6394 BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
6395 u32, len)
6396 {
6397 switch (type) {
6398 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6399 case BPF_LWT_ENCAP_SEG6:
6400 case BPF_LWT_ENCAP_SEG6_INLINE:
6401 return bpf_push_seg6_encap(skb, type, hdr, len);
6402 #endif
6403 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
6404 case BPF_LWT_ENCAP_IP:
6405 return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
6406 #endif
6407 default:
6408 return -EINVAL;
6409 }
6410 }
6411
6412 BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
6413 void *, hdr, u32, len)
6414 {
6415 switch (type) {
6416 #if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
6417 case BPF_LWT_ENCAP_IP:
6418 return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
6419 #endif
6420 default:
6421 return -EINVAL;
6422 }
6423 }
6424
6425 static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
6426 .func = bpf_lwt_in_push_encap,
6427 .gpl_only = false,
6428 .ret_type = RET_INTEGER,
6429 .arg1_type = ARG_PTR_TO_CTX,
6430 .arg2_type = ARG_ANYTHING,
6431 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6432 .arg4_type = ARG_CONST_SIZE
6433 };
6434
6435 static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
6436 .func = bpf_lwt_xmit_push_encap,
6437 .gpl_only = false,
6438 .ret_type = RET_INTEGER,
6439 .arg1_type = ARG_PTR_TO_CTX,
6440 .arg2_type = ARG_ANYTHING,
6441 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6442 .arg4_type = ARG_CONST_SIZE
6443 };
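
/* Illustrative lwt_xmit-side use (a sketch only; hdr is assumed to point at
 * a pre-built outer IPv4/IPv6 header of hdr_len bytes):
 *
 *	if (bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, hdr, hdr_len))
 *		return BPF_DROP;
 */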
6444
6445 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
6446 BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
6447 const void *, from, u32, len)
6448 {
6449 struct seg6_bpf_srh_state *srh_state =
6450 this_cpu_ptr(&seg6_bpf_srh_states);
6451 struct ipv6_sr_hdr *srh = srh_state->srh;
6452 void *srh_tlvs, *srh_end, *ptr;
6453 int srhoff = 0;
6454
6455 if (srh == NULL)
6456 return -EINVAL;
6457
6458 srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
6459 srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);
6460
6461 ptr = skb->data + offset;
6462 if (ptr >= srh_tlvs && ptr + len <= srh_end)
6463 srh_state->valid = false;
6464 else if (ptr < (void *)&srh->flags ||
6465 ptr + len > (void *)&srh->segments)
6466 return -EFAULT;
6467
6468 if (unlikely(bpf_try_make_writable(skb, offset + len)))
6469 return -EFAULT;
6470 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
6471 return -EINVAL;
6472 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
6473
6474 memcpy(skb->data + offset, from, len);
6475 return 0;
6476 }
6477
6478 static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
6479 .func = bpf_lwt_seg6_store_bytes,
6480 .gpl_only = false,
6481 .ret_type = RET_INTEGER,
6482 .arg1_type = ARG_PTR_TO_CTX,
6483 .arg2_type = ARG_ANYTHING,
6484 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6485 .arg4_type = ARG_CONST_SIZE
6486 };
6487
6488 static void bpf_update_srh_state(struct sk_buff *skb)
6489 {
6490 struct seg6_bpf_srh_state *srh_state =
6491 this_cpu_ptr(&seg6_bpf_srh_states);
6492 int srhoff = 0;
6493
6494 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
6495 srh_state->srh = NULL;
6496 } else {
6497 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
6498 srh_state->hdrlen = srh_state->srh->hdrlen << 3;
6499 srh_state->valid = true;
6500 }
6501 }
6502
6503 BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
6504 u32, action, void *, param, u32, param_len)
6505 {
6506 struct seg6_bpf_srh_state *srh_state =
6507 this_cpu_ptr(&seg6_bpf_srh_states);
6508 int hdroff = 0;
6509 int err;
6510
6511 switch (action) {
6512 case SEG6_LOCAL_ACTION_END_X:
6513 if (!seg6_bpf_has_valid_srh(skb))
6514 return -EBADMSG;
6515 if (param_len != sizeof(struct in6_addr))
6516 return -EINVAL;
6517 return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
6518 case SEG6_LOCAL_ACTION_END_T:
6519 if (!seg6_bpf_has_valid_srh(skb))
6520 return -EBADMSG;
6521 if (param_len != sizeof(int))
6522 return -EINVAL;
6523 return seg6_lookup_nexthop(skb, NULL, *(int *)param);
6524 case SEG6_LOCAL_ACTION_END_DT6:
6525 if (!seg6_bpf_has_valid_srh(skb))
6526 return -EBADMSG;
6527 if (param_len != sizeof(int))
6528 return -EINVAL;
6529
6530 if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
6531 return -EBADMSG;
6532 if (!pskb_pull(skb, hdroff))
6533 return -EBADMSG;
6534
6535 skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
6536 skb_reset_network_header(skb);
6537 skb_reset_transport_header(skb);
6538 skb->encapsulation = 0;
6539
6540 bpf_compute_data_pointers(skb);
6541 bpf_update_srh_state(skb);
6542 return seg6_lookup_nexthop(skb, NULL, *(int *)param);
6543 case SEG6_LOCAL_ACTION_END_B6:
6544 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
6545 return -EBADMSG;
6546 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
6547 param, param_len);
6548 if (!err)
6549 bpf_update_srh_state(skb);
6550
6551 return err;
6552 case SEG6_LOCAL_ACTION_END_B6_ENCAP:
6553 if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
6554 return -EBADMSG;
6555 err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
6556 param, param_len);
6557 if (!err)
6558 bpf_update_srh_state(skb);
6559
6560 return err;
6561 default:
6562 return -EINVAL;
6563 }
6564 }
6565
6566 static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
6567 .func = bpf_lwt_seg6_action,
6568 .gpl_only = false,
6569 .ret_type = RET_INTEGER,
6570 .arg1_type = ARG_PTR_TO_CTX,
6571 .arg2_type = ARG_ANYTHING,
6572 .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6573 .arg4_type = ARG_CONST_SIZE
6574 };
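
/* Illustrative lwt_seg6local-side use (a sketch only): perform an End.X
 * action towards a next hop filled in by the program:
 *
 *	struct in6_addr nh;	// next hop, set by the program
 *
 *	bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X, &nh, sizeof(nh));
 */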
6575
6576 BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
6577 s32, len)
6578 {
6579 struct seg6_bpf_srh_state *srh_state =
6580 this_cpu_ptr(&seg6_bpf_srh_states);
6581 struct ipv6_sr_hdr *srh = srh_state->srh;
6582 void *srh_end, *srh_tlvs, *ptr;
6583 struct ipv6hdr *hdr;
6584 int srhoff = 0;
6585 int ret;
6586
6587 if (unlikely(srh == NULL))
6588 return -EINVAL;
6589
6590 srh_tlvs = (void *)((unsigned char *)srh + sizeof(*srh) +
6591 ((srh->first_segment + 1) << 4));
6592 srh_end = (void *)((unsigned char *)srh + sizeof(*srh) +
6593 srh_state->hdrlen);
6594 ptr = skb->data + offset;
6595
6596 if (unlikely(ptr < srh_tlvs || ptr > srh_end))
6597 return -EFAULT;
6598 if (unlikely(len < 0 && (void *)((char *)ptr - len) > srh_end))
6599 return -EFAULT;
6600
6601 if (len > 0) {
6602 ret = skb_cow_head(skb, len);
6603 if (unlikely(ret < 0))
6604 return ret;
6605
6606 ret = bpf_skb_net_hdr_push(skb, offset, len);
6607 } else {
6608 ret = bpf_skb_net_hdr_pop(skb, offset, -1 * len);
6609 }
6610
6611 bpf_compute_data_pointers(skb);
6612 if (unlikely(ret < 0))
6613 return ret;
6614
6615 hdr = (struct ipv6hdr *)skb->data;
6616 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
6617
6618 if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
6619 return -EINVAL;
6620 srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
6621 srh_state->hdrlen += len;
6622 srh_state->valid = false;
6623 return 0;
6624 }
6625
6626 static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
6627 .func = bpf_lwt_seg6_adjust_srh,
6628 .gpl_only = false,
6629 .ret_type = RET_INTEGER,
6630 .arg1_type = ARG_PTR_TO_CTX,
6631 .arg2_type = ARG_ANYTHING,
6632 .arg3_type = ARG_ANYTHING,
6633 };
6634 #endif /* CONFIG_IPV6_SEG6_BPF */
6635
6636 #ifdef CONFIG_INET
6637 static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
6638 int dif, int sdif, u8 family, u8 proto)
6639 {
6640 struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
6641 bool refcounted = false;
6642 struct sock *sk = NULL;
6643
6644 if (family == AF_INET) {
6645 __be32 src4 = tuple->ipv4.saddr;
6646 __be32 dst4 = tuple->ipv4.daddr;
6647
6648 if (proto == IPPROTO_TCP)
6649 sk = __inet_lookup(net, hinfo, NULL, 0,
6650 src4, tuple->ipv4.sport,
6651 dst4, tuple->ipv4.dport,
6652 dif, sdif, &refcounted);
6653 else
6654 sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
6655 dst4, tuple->ipv4.dport,
6656 dif, sdif, net->ipv4.udp_table, NULL);
6657 #if IS_ENABLED(CONFIG_IPV6)
6658 } else {
6659 struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
6660 struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
6661
6662 if (proto == IPPROTO_TCP)
6663 sk = __inet6_lookup(net, hinfo, NULL, 0,
6664 src6, tuple->ipv6.sport,
6665 dst6, ntohs(tuple->ipv6.dport),
6666 dif, sdif, &refcounted);
6667 else if (likely(ipv6_bpf_stub))
6668 sk = ipv6_bpf_stub->udp6_lib_lookup(net,
6669 src6, tuple->ipv6.sport,
6670 dst6, tuple->ipv6.dport,
6671 dif, sdif,
6672 net->ipv4.udp_table, NULL);
6673 #endif
6674 }
6675
6676 if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
6677 WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6678 sk = NULL;
6679 }
6680 return sk;
6681 }
6682
6683 /* bpf_skc_lookup performs the core lookup for different types of sockets,
6684 * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
6685 */
6686 static struct sock *
6687 __bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6688 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
6689 u64 flags, int sdif)
6690 {
6691 struct sock *sk = NULL;
6692 struct net *net;
6693 u8 family;
6694
6695 if (len == sizeof(tuple->ipv4))
6696 family = AF_INET;
6697 else if (len == sizeof(tuple->ipv6))
6698 family = AF_INET6;
6699 else
6700 return NULL;
6701
6702 if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
6703 goto out;
6704
6705 if (sdif < 0) {
6706 if (family == AF_INET)
6707 sdif = inet_sdif(skb);
6708 else
6709 sdif = inet6_sdif(skb);
6710 }
6711
6712 if ((s32)netns_id < 0) {
6713 net = caller_net;
6714 sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
6715 } else {
6716 net = get_net_ns_by_id(caller_net, netns_id);
6717 if (unlikely(!net))
6718 goto out;
6719 sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
6720 put_net(net);
6721 }
6722
6723 out:
6724 return sk;
6725 }
6726
6727 static struct sock *
6728 __bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6729 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
6730 u64 flags, int sdif)
6731 {
6732 struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
6733 ifindex, proto, netns_id, flags,
6734 sdif);
6735
6736 if (sk) {
6737 struct sock *sk2 = sk_to_full_sk(sk);
6738
6739 /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
6740 * sock refcnt is decremented to prevent a request_sock leak.
6741 */
6742 if (!sk_fullsock(sk2))
6743 sk2 = NULL;
6744 if (sk2 != sk) {
6745 sock_gen_put(sk);
6746 /* Ensure there is no need to bump sk2 refcnt */
6747 if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
6748 WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6749 return NULL;
6750 }
6751 sk = sk2;
6752 }
6753 }
6754
6755 return sk;
6756 }
6757
6758 static struct sock *
6759 bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6760 u8 proto, u64 netns_id, u64 flags)
6761 {
6762 struct net *caller_net;
6763 int ifindex;
6764
6765 if (skb->dev) {
6766 caller_net = dev_net(skb->dev);
6767 ifindex = skb->dev->ifindex;
6768 } else {
6769 caller_net = sock_net(skb->sk);
6770 ifindex = 0;
6771 }
6772
6773 return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
6774 netns_id, flags, -1);
6775 }
6776
6777 static struct sock *
6778 bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
6779 u8 proto, u64 netns_id, u64 flags)
6780 {
6781 struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
6782 flags);
6783
6784 if (sk) {
6785 struct sock *sk2 = sk_to_full_sk(sk);
6786
6787 /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
6788 * sock refcnt is decremented to prevent a request_sock leak.
6789 */
6790 if (!sk_fullsock(sk2))
6791 sk2 = NULL;
6792 if (sk2 != sk) {
6793 sock_gen_put(sk);
6794 /* Ensure there is no need to bump sk2 refcnt */
6795 if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
6796 WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
6797 return NULL;
6798 }
6799 sk = sk2;
6800 }
6801 }
6802
6803 return sk;
6804 }
6805
6806 BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
6807 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6808 {
6809 return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
6810 netns_id, flags);
6811 }
6812
6813 static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
6814 .func = bpf_skc_lookup_tcp,
6815 .gpl_only = false,
6816 .pkt_access = true,
6817 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
6818 .arg1_type = ARG_PTR_TO_CTX,
6819 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6820 .arg3_type = ARG_CONST_SIZE,
6821 .arg4_type = ARG_ANYTHING,
6822 .arg5_type = ARG_ANYTHING,
6823 };
6824
6825 BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
6826 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6827 {
6828 return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
6829 netns_id, flags);
6830 }
6831
6832 static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
6833 .func = bpf_sk_lookup_tcp,
6834 .gpl_only = false,
6835 .pkt_access = true,
6836 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
6837 .arg1_type = ARG_PTR_TO_CTX,
6838 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6839 .arg3_type = ARG_CONST_SIZE,
6840 .arg4_type = ARG_ANYTHING,
6841 .arg5_type = ARG_ANYTHING,
6842 };
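
/* Illustrative tc/skb-side use (a sketch only; the tuple is assumed to be
 * filled from parsed, bounds-checked headers):
 *
 *	struct bpf_sock_tuple tuple = {};
 *	struct bpf_sock *sk;
 *
 *	tuple.ipv4.saddr = iph->saddr;
 *	tuple.ipv4.daddr = iph->daddr;
 *	tuple.ipv4.sport = tcph->source;
 *	tuple.ipv4.dport = tcph->dest;
 *
 *	sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
 *			       BPF_F_CURRENT_NETNS, 0);
 *	if (sk)
 *		bpf_sk_release(sk);
 */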
6843
6844 BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
6845 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6846 {
6847 return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
6848 netns_id, flags);
6849 }
6850
6851 static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
6852 .func = bpf_sk_lookup_udp,
6853 .gpl_only = false,
6854 .pkt_access = true,
6855 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
6856 .arg1_type = ARG_PTR_TO_CTX,
6857 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6858 .arg3_type = ARG_CONST_SIZE,
6859 .arg4_type = ARG_ANYTHING,
6860 .arg5_type = ARG_ANYTHING,
6861 };
6862
6863 BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
6864 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6865 {
6866 struct net_device *dev = skb->dev;
6867 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
6868 struct net *caller_net = dev_net(dev);
6869
6870 return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
6871 ifindex, IPPROTO_TCP, netns_id,
6872 flags, sdif);
6873 }
6874
6875 static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
6876 .func = bpf_tc_skc_lookup_tcp,
6877 .gpl_only = false,
6878 .pkt_access = true,
6879 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
6880 .arg1_type = ARG_PTR_TO_CTX,
6881 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6882 .arg3_type = ARG_CONST_SIZE,
6883 .arg4_type = ARG_ANYTHING,
6884 .arg5_type = ARG_ANYTHING,
6885 };
6886
6887 BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
6888 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6889 {
6890 struct net_device *dev = skb->dev;
6891 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
6892 struct net *caller_net = dev_net(dev);
6893
6894 return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
6895 ifindex, IPPROTO_TCP, netns_id,
6896 flags, sdif);
6897 }
6898
6899 static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
6900 .func = bpf_tc_sk_lookup_tcp,
6901 .gpl_only = false,
6902 .pkt_access = true,
6903 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
6904 .arg1_type = ARG_PTR_TO_CTX,
6905 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6906 .arg3_type = ARG_CONST_SIZE,
6907 .arg4_type = ARG_ANYTHING,
6908 .arg5_type = ARG_ANYTHING,
6909 };
6910
6911 BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
6912 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
6913 {
6914 struct net_device *dev = skb->dev;
6915 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
6916 struct net *caller_net = dev_net(dev);
6917
6918 return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
6919 ifindex, IPPROTO_UDP, netns_id,
6920 flags, sdif);
6921 }
6922
6923 static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
6924 .func = bpf_tc_sk_lookup_udp,
6925 .gpl_only = false,
6926 .pkt_access = true,
6927 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
6928 .arg1_type = ARG_PTR_TO_CTX,
6929 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6930 .arg3_type = ARG_CONST_SIZE,
6931 .arg4_type = ARG_ANYTHING,
6932 .arg5_type = ARG_ANYTHING,
6933 };
6934
6935 BPF_CALL_1(bpf_sk_release, struct sock *, sk)
6936 {
6937 if (sk && sk_is_refcounted(sk))
6938 sock_gen_put(sk);
6939 return 0;
6940 }
6941
6942 static const struct bpf_func_proto bpf_sk_release_proto = {
6943 .func = bpf_sk_release,
6944 .gpl_only = false,
6945 .ret_type = RET_INTEGER,
6946 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
6947 };
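
/* Note: sockets returned by the *_lookup_*() helpers must be passed to
 * bpf_sk_release(); the OBJ_RELEASE annotation on arg1 above is what lets
 * the verifier enforce that the acquired reference is dropped.
 */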
6948
6949 BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
6950 struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
6951 {
6952 struct net_device *dev = ctx->rxq->dev;
6953 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
6954 struct net *caller_net = dev_net(dev);
6955
6956 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
6957 ifindex, IPPROTO_UDP, netns_id,
6958 flags, sdif);
6959 }
6960
6961 static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
6962 .func = bpf_xdp_sk_lookup_udp,
6963 .gpl_only = false,
6964 .pkt_access = true,
6965 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
6966 .arg1_type = ARG_PTR_TO_CTX,
6967 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6968 .arg3_type = ARG_CONST_SIZE,
6969 .arg4_type = ARG_ANYTHING,
6970 .arg5_type = ARG_ANYTHING,
6971 };
6972
6973 BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
6974 struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
6975 {
6976 struct net_device *dev = ctx->rxq->dev;
6977 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
6978 struct net *caller_net = dev_net(dev);
6979
6980 return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
6981 ifindex, IPPROTO_TCP, netns_id,
6982 flags, sdif);
6983 }
6984
6985 static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
6986 .func = bpf_xdp_skc_lookup_tcp,
6987 .gpl_only = false,
6988 .pkt_access = true,
6989 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
6990 .arg1_type = ARG_PTR_TO_CTX,
6991 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
6992 .arg3_type = ARG_CONST_SIZE,
6993 .arg4_type = ARG_ANYTHING,
6994 .arg5_type = ARG_ANYTHING,
6995 };
6996
6997 BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
6998 struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
6999 {
7000 struct net_device *dev = ctx->rxq->dev;
7001 int ifindex = dev->ifindex, sdif = dev_sdif(dev);
7002 struct net *caller_net = dev_net(dev);
7003
7004 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
7005 ifindex, IPPROTO_TCP, netns_id,
7006 flags, sdif);
7007 }
7008
7009 static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
7010 .func = bpf_xdp_sk_lookup_tcp,
7011 .gpl_only = false,
7012 .pkt_access = true,
7013 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7014 .arg1_type = ARG_PTR_TO_CTX,
7015 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7016 .arg3_type = ARG_CONST_SIZE,
7017 .arg4_type = ARG_ANYTHING,
7018 .arg5_type = ARG_ANYTHING,
7019 };
7020
7021 BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
7022 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7023 {
7024 return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
7025 sock_net(ctx->sk), 0,
7026 IPPROTO_TCP, netns_id, flags,
7027 -1);
7028 }
7029
7030 static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
7031 .func = bpf_sock_addr_skc_lookup_tcp,
7032 .gpl_only = false,
7033 .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
7034 .arg1_type = ARG_PTR_TO_CTX,
7035 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7036 .arg3_type = ARG_CONST_SIZE,
7037 .arg4_type = ARG_ANYTHING,
7038 .arg5_type = ARG_ANYTHING,
7039 };
7040
7041 BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
7042 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7043 {
7044 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
7045 sock_net(ctx->sk), 0, IPPROTO_TCP,
7046 netns_id, flags, -1);
7047 }
7048
7049 static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
7050 .func = bpf_sock_addr_sk_lookup_tcp,
7051 .gpl_only = false,
7052 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7053 .arg1_type = ARG_PTR_TO_CTX,
7054 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7055 .arg3_type = ARG_CONST_SIZE,
7056 .arg4_type = ARG_ANYTHING,
7057 .arg5_type = ARG_ANYTHING,
7058 };
7059
7060 BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
7061 struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
7062 {
7063 return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
7064 sock_net(ctx->sk), 0, IPPROTO_UDP,
7065 netns_id, flags, -1);
7066 }
7067
7068 static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
7069 .func = bpf_sock_addr_sk_lookup_udp,
7070 .gpl_only = false,
7071 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7072 .arg1_type = ARG_PTR_TO_CTX,
7073 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7074 .arg3_type = ARG_CONST_SIZE,
7075 .arg4_type = ARG_ANYTHING,
7076 .arg5_type = ARG_ANYTHING,
7077 };
7078
7079 bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
7080 struct bpf_insn_access_aux *info)
7081 {
7082 if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
7083 icsk_retransmits))
7084 return false;
7085
7086 if (off % size != 0)
7087 return false;
7088
7089 switch (off) {
7090 case offsetof(struct bpf_tcp_sock, bytes_received):
7091 case offsetof(struct bpf_tcp_sock, bytes_acked):
7092 return size == sizeof(__u64);
7093 default:
7094 return size == sizeof(__u32);
7095 }
7096 }
7097
7098 u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
7099 const struct bpf_insn *si,
7100 struct bpf_insn *insn_buf,
7101 struct bpf_prog *prog, u32 *target_size)
7102 {
7103 struct bpf_insn *insn = insn_buf;
7104
7105 #define BPF_TCP_SOCK_GET_COMMON(FIELD) \
7106 do { \
7107 BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \
7108 sizeof_field(struct bpf_tcp_sock, FIELD)); \
7109 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
7110 si->dst_reg, si->src_reg, \
7111 offsetof(struct tcp_sock, FIELD)); \
7112 } while (0)
7113
7114 #define BPF_INET_SOCK_GET_COMMON(FIELD) \
7115 do { \
7116 BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \
7117 FIELD) > \
7118 sizeof_field(struct bpf_tcp_sock, FIELD)); \
7119 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
7120 struct inet_connection_sock, \
7121 FIELD), \
7122 si->dst_reg, si->src_reg, \
7123 offsetof( \
7124 struct inet_connection_sock, \
7125 FIELD)); \
7126 } while (0)
7127
7128 BTF_TYPE_EMIT(struct bpf_tcp_sock);
7129
7130 switch (si->off) {
7131 case offsetof(struct bpf_tcp_sock, rtt_min):
7132 BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
7133 sizeof(struct minmax));
7134 BUILD_BUG_ON(sizeof(struct minmax) <
7135 sizeof(struct minmax_sample));
7136
7137 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
7138 offsetof(struct tcp_sock, rtt_min) +
7139 offsetof(struct minmax_sample, v));
7140 break;
7141 case offsetof(struct bpf_tcp_sock, snd_cwnd):
7142 BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
7143 break;
7144 case offsetof(struct bpf_tcp_sock, srtt_us):
7145 BPF_TCP_SOCK_GET_COMMON(srtt_us);
7146 break;
7147 case offsetof(struct bpf_tcp_sock, snd_ssthresh):
7148 BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
7149 break;
7150 case offsetof(struct bpf_tcp_sock, rcv_nxt):
7151 BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
7152 break;
7153 case offsetof(struct bpf_tcp_sock, snd_nxt):
7154 BPF_TCP_SOCK_GET_COMMON(snd_nxt);
7155 break;
7156 case offsetof(struct bpf_tcp_sock, snd_una):
7157 BPF_TCP_SOCK_GET_COMMON(snd_una);
7158 break;
7159 case offsetof(struct bpf_tcp_sock, mss_cache):
7160 BPF_TCP_SOCK_GET_COMMON(mss_cache);
7161 break;
7162 case offsetof(struct bpf_tcp_sock, ecn_flags):
7163 BPF_TCP_SOCK_GET_COMMON(ecn_flags);
7164 break;
7165 case offsetof(struct bpf_tcp_sock, rate_delivered):
7166 BPF_TCP_SOCK_GET_COMMON(rate_delivered);
7167 break;
7168 case offsetof(struct bpf_tcp_sock, rate_interval_us):
7169 BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
7170 break;
7171 case offsetof(struct bpf_tcp_sock, packets_out):
7172 BPF_TCP_SOCK_GET_COMMON(packets_out);
7173 break;
7174 case offsetof(struct bpf_tcp_sock, retrans_out):
7175 BPF_TCP_SOCK_GET_COMMON(retrans_out);
7176 break;
7177 case offsetof(struct bpf_tcp_sock, total_retrans):
7178 BPF_TCP_SOCK_GET_COMMON(total_retrans);
7179 break;
7180 case offsetof(struct bpf_tcp_sock, segs_in):
7181 BPF_TCP_SOCK_GET_COMMON(segs_in);
7182 break;
7183 case offsetof(struct bpf_tcp_sock, data_segs_in):
7184 BPF_TCP_SOCK_GET_COMMON(data_segs_in);
7185 break;
7186 case offsetof(struct bpf_tcp_sock, segs_out):
7187 BPF_TCP_SOCK_GET_COMMON(segs_out);
7188 break;
7189 case offsetof(struct bpf_tcp_sock, data_segs_out):
7190 BPF_TCP_SOCK_GET_COMMON(data_segs_out);
7191 break;
7192 case offsetof(struct bpf_tcp_sock, lost_out):
7193 BPF_TCP_SOCK_GET_COMMON(lost_out);
7194 break;
7195 case offsetof(struct bpf_tcp_sock, sacked_out):
7196 BPF_TCP_SOCK_GET_COMMON(sacked_out);
7197 break;
7198 case offsetof(struct bpf_tcp_sock, bytes_received):
7199 BPF_TCP_SOCK_GET_COMMON(bytes_received);
7200 break;
7201 case offsetof(struct bpf_tcp_sock, bytes_acked):
7202 BPF_TCP_SOCK_GET_COMMON(bytes_acked);
7203 break;
7204 case offsetof(struct bpf_tcp_sock, dsack_dups):
7205 BPF_TCP_SOCK_GET_COMMON(dsack_dups);
7206 break;
7207 case offsetof(struct bpf_tcp_sock, delivered):
7208 BPF_TCP_SOCK_GET_COMMON(delivered);
7209 break;
7210 case offsetof(struct bpf_tcp_sock, delivered_ce):
7211 BPF_TCP_SOCK_GET_COMMON(delivered_ce);
7212 break;
7213 case offsetof(struct bpf_tcp_sock, icsk_retransmits):
7214 BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
7215 break;
7216 }
7217
7218 return insn - insn_buf;
7219 }
7220
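/* Return @sk as a tcp_sock pointer if it is a full socket using TCP,
 * otherwise NULL.
 */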
7221 BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
7222 {
7223 if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
7224 return (unsigned long)sk;
7225
7226 return (unsigned long)NULL;
7227 }
7228
7229 const struct bpf_func_proto bpf_tcp_sock_proto = {
7230 .func = bpf_tcp_sock,
7231 .gpl_only = false,
7232 .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
7233 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
7234 };
7235
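/* Map a request/timewait sock back to its full listener. The sock is only
 * returned if it is in TCP_LISTEN state and RCU protected (SOCK_RCU_FREE),
 * so the caller can use it without taking a reference.
 */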
7236 BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
7237 {
7238 sk = sk_to_full_sk(sk);
7239
7240 if (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
7241 return (unsigned long)sk;
7242
7243 return (unsigned long)NULL;
7244 }
7245
7246 static const struct bpf_func_proto bpf_get_listener_sock_proto = {
7247 .func = bpf_get_listener_sock,
7248 .gpl_only = false,
7249 .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
7250 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
7251 };
7252
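/* Set the ECN CE codepoint on an IPv4/IPv6 skb. Returns the result of
 * INET_ECN_set_ce(), or 0 if the IP header is not present or not writable.
 */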
7253 BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
7254 {
7255 unsigned int iphdr_len;
7256
7257 switch (skb_protocol(skb, true)) {
7258 case cpu_to_be16(ETH_P_IP):
7259 iphdr_len = sizeof(struct iphdr);
7260 break;
7261 case cpu_to_be16(ETH_P_IPV6):
7262 iphdr_len = sizeof(struct ipv6hdr);
7263 break;
7264 default:
7265 return 0;
7266 }
7267
7268 if (skb_headlen(skb) < iphdr_len)
7269 return 0;
7270
7271 if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
7272 return 0;
7273
7274 return INET_ECN_set_ce(skb);
7275 }
7276
7277 bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
7278 struct bpf_insn_access_aux *info)
7279 {
7280 if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
7281 return false;
7282
7283 if (off % size != 0)
7284 return false;
7285
7286 switch (off) {
7287 default:
7288 return size == sizeof(__u32);
7289 }
7290 }
7291
7292 u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
7293 const struct bpf_insn *si,
7294 struct bpf_insn *insn_buf,
7295 struct bpf_prog *prog, u32 *target_size)
7296 {
7297 struct bpf_insn *insn = insn_buf;
7298
7299 #define BPF_XDP_SOCK_GET(FIELD) \
7300 do { \
7301 BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \
7302 sizeof_field(struct bpf_xdp_sock, FIELD)); \
7303 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
7304 si->dst_reg, si->src_reg, \
7305 offsetof(struct xdp_sock, FIELD)); \
7306 } while (0)
7307
7308 switch (si->off) {
7309 case offsetof(struct bpf_xdp_sock, queue_id):
7310 BPF_XDP_SOCK_GET(queue_id);
7311 break;
7312 }
7313
7314 return insn - insn_buf;
7315 }
7316
7317 static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
7318 .func = bpf_skb_ecn_set_ce,
7319 .gpl_only = false,
7320 .ret_type = RET_INTEGER,
7321 .arg1_type = ARG_PTR_TO_CTX,
7322 };
7323
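/* Check whether an ACK arriving at a TCP listener carries a valid SYN
 * cookie in its acknowledgment number. Returns 0 if the cookie is valid,
 * -ENOENT if it is not (or no recent overflow was recorded), and another
 * negative errno on malformed input or when syncookies are unavailable.
 */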
7324 BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
7325 struct tcphdr *, th, u32, th_len)
7326 {
7327 #ifdef CONFIG_SYN_COOKIES
7328 u32 cookie;
7329 int ret;
7330
7331 if (unlikely(!sk || th_len < sizeof(*th)))
7332 return -EINVAL;
7333
7334 /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
7335 if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
7336 return -EINVAL;
7337
7338 if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
7339 return -EINVAL;
7340
7341 if (!th->ack || th->rst || th->syn)
7342 return -ENOENT;
7343
7344 if (unlikely(iph_len < sizeof(struct iphdr)))
7345 return -EINVAL;
7346
7347 if (tcp_synq_no_recent_overflow(sk))
7348 return -ENOENT;
7349
7350 cookie = ntohl(th->ack_seq) - 1;
7351
7352 /* Both struct iphdr and struct ipv6hdr have the version field at the
7353 * same offset so we can cast to the shorter header (struct iphdr).
7354 */
7355 switch (((struct iphdr *)iph)->version) {
7356 case 4:
7357 if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
7358 return -EINVAL;
7359
7360 ret = __cookie_v4_check((struct iphdr *)iph, th, cookie);
7361 break;
7362
7363 #if IS_BUILTIN(CONFIG_IPV6)
7364 case 6:
7365 if (unlikely(iph_len < sizeof(struct ipv6hdr)))
7366 return -EINVAL;
7367
7368 if (sk->sk_family != AF_INET6)
7369 return -EINVAL;
7370
7371 ret = __cookie_v6_check((struct ipv6hdr *)iph, th, cookie);
7372 break;
7373 #endif /* CONFIG_IPV6 */
7374
7375 default:
7376 return -EPROTONOSUPPORT;
7377 }
7378
7379 if (ret > 0)
7380 return 0;
7381
7382 return -ENOENT;
7383 #else
7384 return -ENOTSUPP;
7385 #endif
7386 }
7387
7388 static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
7389 .func = bpf_tcp_check_syncookie,
7390 .gpl_only = true,
7391 .pkt_access = true,
7392 .ret_type = RET_INTEGER,
7393 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7394 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7395 .arg3_type = ARG_CONST_SIZE,
7396 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7397 .arg5_type = ARG_CONST_SIZE,
7398 };
7399
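/* Generate a SYN cookie for a SYN targeted at a TCP listener. On success
 * the cookie is returned in the lower 32 bits and the MSS value in the
 * upper 32 bits of the s64 result; a negative errno indicates failure.
 */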
7400 BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
7401 struct tcphdr *, th, u32, th_len)
7402 {
7403 #ifdef CONFIG_SYN_COOKIES
7404 u32 cookie;
7405 u16 mss;
7406
7407 if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
7408 return -EINVAL;
7409
7410 if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
7411 return -EINVAL;
7412
7413 if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
7414 return -ENOENT;
7415
7416 if (!th->syn || th->ack || th->fin || th->rst)
7417 return -EINVAL;
7418
7419 if (unlikely(iph_len < sizeof(struct iphdr)))
7420 return -EINVAL;
7421
7422 /* Both struct iphdr and struct ipv6hdr have the version field at the
7423 * same offset so we can cast to the shorter header (struct iphdr).
7424 */
7425 switch (((struct iphdr *)iph)->version) {
7426 case 4:
7427 if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
7428 return -EINVAL;
7429
7430 mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
7431 break;
7432
7433 #if IS_BUILTIN(CONFIG_IPV6)
7434 case 6:
7435 if (unlikely(iph_len < sizeof(struct ipv6hdr)))
7436 return -EINVAL;
7437
7438 if (sk->sk_family != AF_INET6)
7439 return -EINVAL;
7440
7441 mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
7442 break;
7443 #endif /* CONFIG_IPV6 */
7444
7445 default:
7446 return -EPROTONOSUPPORT;
7447 }
7448 if (mss == 0)
7449 return -ENOENT;
7450
7451 return cookie | ((u64)mss << 32);
7452 #else
7453 return -EOPNOTSUPP;
7454 #endif /* CONFIG_SYN_COOKIES */
7455 }
7456
7457 static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
7458 .func = bpf_tcp_gen_syncookie,
7459 .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
7460 .pkt_access = true,
7461 .ret_type = RET_INTEGER,
7462 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7463 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7464 .arg3_type = ARG_CONST_SIZE,
7465 .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7466 .arg5_type = ARG_CONST_SIZE,
7467 };
7468
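/* Assign @sk to @skb at tc ingress so the stack's own socket lookup is
 * bypassed for this packet. A reference is taken on refcounted sockets and
 * sock_pfree is installed as the skb destructor to release it.
 */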
7469 BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
7470 {
7471 if (!sk || flags != 0)
7472 return -EINVAL;
7473 if (!skb_at_tc_ingress(skb))
7474 return -EOPNOTSUPP;
7475 if (unlikely(dev_net(skb->dev) != sock_net(sk)))
7476 return -ENETUNREACH;
7477 if (sk_unhashed(sk))
7478 return -EOPNOTSUPP;
7479 if (sk_is_refcounted(sk) &&
7480 unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
7481 return -ENOENT;
7482
7483 skb_orphan(skb);
7484 skb->sk = sk;
7485 skb->destructor = sock_pfree;
7486
7487 return 0;
7488 }
7489
7490 static const struct bpf_func_proto bpf_sk_assign_proto = {
7491 .func = bpf_sk_assign,
7492 .gpl_only = false,
7493 .ret_type = RET_INTEGER,
7494 .arg1_type = ARG_PTR_TO_CTX,
7495 .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
7496 .arg3_type = ARG_ANYTHING,
7497 };
7498
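/* Scan the TCP option bytes in [op, opend) for @search_kind, optionally
 * also matching @magic_len bytes of @magic right after the kind/len pair.
 * Returns a pointer to the matching option, ERR_PTR(-ENOMSG) if not found
 * (*eol is set when TCPOPT_EOL ended the search), or ERR_PTR(-EFAULT) on a
 * malformed option length.
 */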
7499 static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
7500 u8 search_kind, const u8 *magic,
7501 u8 magic_len, bool *eol)
7502 {
7503 u8 kind, kind_len;
7504
7505 *eol = false;
7506
7507 while (op < opend) {
7508 kind = op[0];
7509
7510 if (kind == TCPOPT_EOL) {
7511 *eol = true;
7512 return ERR_PTR(-ENOMSG);
7513 } else if (kind == TCPOPT_NOP) {
7514 op++;
7515 continue;
7516 }
7517
7518 if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
7519 /* Something is wrong in the received header.
7520 * Follow the TCP stack's tcp_parse_options()
7521 * and just bail here.
7522 */
7523 return ERR_PTR(-EFAULT);
7524
7525 kind_len = op[1];
7526 if (search_kind == kind) {
7527 if (!magic_len)
7528 return op;
7529
7530 if (magic_len > kind_len - 2)
7531 return ERR_PTR(-ENOMSG);
7532
7533 if (!memcmp(&op[2], magic, magic_len))
7534 return op;
7535 }
7536
7537 op += kind_len;
7538 }
7539
7540 return ERR_PTR(-ENOMSG);
7541 }
7542
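/* Load a TCP header option into @search_res. The first two bytes of
 * @search_res select the option kind (and, for experimental kinds, the
 * magic) to look for, either in the current skb or in the saved SYN when
 * BPF_LOAD_HDR_OPT_TCP_SYN is set. Returns the full option length, or
 * -ENOSPC if the copy was truncated to @len.
 */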
7543 BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7544 void *, search_res, u32, len, u64, flags)
7545 {
7546 bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
7547 const u8 *op, *opend, *magic, *search = search_res;
7548 u8 search_kind, search_len, copy_len, magic_len;
7549 int ret;
7550
7551 	/* 2 bytes is the minimal option length, except for TCPOPT_NOP and
7552 	 * TCPOPT_EOL, which are useless for the bpf prog to learn from,
7553 	 * so this helper also disallows loading them.
7554 	 */
7555 if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
7556 return -EINVAL;
7557
7558 search_kind = search[0];
7559 search_len = search[1];
7560
7561 if (search_len > len || search_kind == TCPOPT_NOP ||
7562 search_kind == TCPOPT_EOL)
7563 return -EINVAL;
7564
7565 if (search_kind == TCPOPT_EXP || search_kind == 253) {
7566 /* 16 or 32 bit magic. +2 for kind and kind length */
7567 if (search_len != 4 && search_len != 6)
7568 return -EINVAL;
7569 magic = &search[2];
7570 magic_len = search_len - 2;
7571 } else {
7572 if (search_len)
7573 return -EINVAL;
7574 magic = NULL;
7575 magic_len = 0;
7576 }
7577
7578 if (load_syn) {
7579 ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
7580 if (ret < 0)
7581 return ret;
7582
7583 opend = op + ret;
7584 op += sizeof(struct tcphdr);
7585 } else {
7586 if (!bpf_sock->skb ||
7587 bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
7588 /* This bpf_sock->op cannot call this helper */
7589 return -EPERM;
7590
7591 opend = bpf_sock->skb_data_end;
7592 op = bpf_sock->skb->data + sizeof(struct tcphdr);
7593 }
7594
7595 op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
7596 &eol);
7597 if (IS_ERR(op))
7598 return PTR_ERR(op);
7599
7600 copy_len = op[1];
7601 ret = copy_len;
7602 if (copy_len > len) {
7603 ret = -ENOSPC;
7604 copy_len = len;
7605 }
7606
7607 memcpy(search_res, op, copy_len);
7608 return ret;
7609 }
7610
7611 static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
7612 .func = bpf_sock_ops_load_hdr_opt,
7613 .gpl_only = false,
7614 .ret_type = RET_INTEGER,
7615 .arg1_type = ARG_PTR_TO_CTX,
7616 .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
7617 .arg3_type = ARG_CONST_SIZE,
7618 .arg4_type = ARG_ANYTHING,
7619 };
7620
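/* Write a new TCP header option from @from at BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
 * The option must fit into the remaining reserved option space and must not
 * duplicate an option (or experimental magic) that is already present.
 */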
7621 BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7622 const void *, from, u32, len, u64, flags)
7623 {
7624 u8 new_kind, new_kind_len, magic_len = 0, *opend;
7625 const u8 *op, *new_op, *magic = NULL;
7626 struct sk_buff *skb;
7627 bool eol;
7628
7629 if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
7630 return -EPERM;
7631
7632 if (len < 2 || flags)
7633 return -EINVAL;
7634
7635 new_op = from;
7636 new_kind = new_op[0];
7637 new_kind_len = new_op[1];
7638
7639 if (new_kind_len > len || new_kind == TCPOPT_NOP ||
7640 new_kind == TCPOPT_EOL)
7641 return -EINVAL;
7642
7643 if (new_kind_len > bpf_sock->remaining_opt_len)
7644 return -ENOSPC;
7645
7646 /* 253 is another experimental kind */
7647 if (new_kind == TCPOPT_EXP || new_kind == 253) {
7648 if (new_kind_len < 4)
7649 return -EINVAL;
7650 		/* Also match on the 2 byte magic.
7651 		 * RFC 6994: the magic could be 2 or 4 bytes,
7652 		 * so matching by only 2 bytes is on the
7653 		 * conservative side, but it is the right
7654 		 * thing to do for the 'search-for-duplication'
7655 		 * purpose.
7656 		 */
7657 magic = &new_op[2];
7658 magic_len = 2;
7659 }
7660
7661 /* Check for duplication */
7662 skb = bpf_sock->skb;
7663 op = skb->data + sizeof(struct tcphdr);
7664 opend = bpf_sock->skb_data_end;
7665
7666 op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
7667 &eol);
7668 if (!IS_ERR(op))
7669 return -EEXIST;
7670
7671 if (PTR_ERR(op) != -ENOMSG)
7672 return PTR_ERR(op);
7673
7674 if (eol)
7675 		/* The option list has already been ended. Treat it as if
7676 		 * no more header options can be written.
7677 		 */
7678 return -ENOSPC;
7679
7680 /* No duplication found. Store the header option. */
7681 memcpy(opend, from, new_kind_len);
7682
7683 bpf_sock->remaining_opt_len -= new_kind_len;
7684 bpf_sock->skb_data_end += new_kind_len;
7685
7686 return 0;
7687 }
7688
7689 static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
7690 .func = bpf_sock_ops_store_hdr_opt,
7691 .gpl_only = false,
7692 .ret_type = RET_INTEGER,
7693 .arg1_type = ARG_PTR_TO_CTX,
7694 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
7695 .arg3_type = ARG_CONST_SIZE,
7696 .arg4_type = ARG_ANYTHING,
7697 };
7698
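/* Reserve @len bytes of TCP header option space at
 * BPF_SOCK_OPS_HDR_OPT_LEN_CB so they can be written later at
 * BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
 */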
7699 BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
7700 u32, len, u64, flags)
7701 {
7702 if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
7703 return -EPERM;
7704
7705 if (flags || len < 2)
7706 return -EINVAL;
7707
7708 if (len > bpf_sock->remaining_opt_len)
7709 return -ENOSPC;
7710
7711 bpf_sock->remaining_opt_len -= len;
7712
7713 return 0;
7714 }
7715
7716 static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
7717 .func = bpf_sock_ops_reserve_hdr_opt,
7718 .gpl_only = false,
7719 .ret_type = RET_INTEGER,
7720 .arg1_type = ARG_PTR_TO_CTX,
7721 .arg2_type = ARG_ANYTHING,
7722 .arg3_type = ARG_ANYTHING,
7723 };
7724
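/* Set the skb delivery time: a non-zero monotonic timestamp for
 * BPF_SKB_TSTAMP_DELIVERY_MONO, or clear it for BPF_SKB_TSTAMP_UNSPEC.
 * Only IPv4/IPv6 packets are supported.
 */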
7725 BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
7726 u64, tstamp, u32, tstamp_type)
7727 {
7728 /* skb_clear_delivery_time() is done for inet protocol */
7729 if (skb->protocol != htons(ETH_P_IP) &&
7730 skb->protocol != htons(ETH_P_IPV6))
7731 return -EOPNOTSUPP;
7732
7733 switch (tstamp_type) {
7734 case BPF_SKB_TSTAMP_DELIVERY_MONO:
7735 if (!tstamp)
7736 return -EINVAL;
7737 skb->tstamp = tstamp;
7738 skb->tstamp_type = SKB_CLOCK_MONOTONIC;
7739 break;
7740 case BPF_SKB_TSTAMP_UNSPEC:
7741 if (tstamp)
7742 return -EINVAL;
7743 skb->tstamp = 0;
7744 skb->tstamp_type = SKB_CLOCK_REALTIME;
7745 break;
7746 default:
7747 return -EINVAL;
7748 }
7749
7750 return 0;
7751 }
7752
7753 static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
7754 .func = bpf_skb_set_tstamp,
7755 .gpl_only = false,
7756 .ret_type = RET_INTEGER,
7757 .arg1_type = ARG_PTR_TO_CTX,
7758 .arg2_type = ARG_ANYTHING,
7759 .arg3_type = ARG_ANYTHING,
7760 };
7761
7762 #ifdef CONFIG_SYN_COOKIES
7763 BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
7764 struct tcphdr *, th, u32, th_len)
7765 {
7766 u32 cookie;
7767 u16 mss;
7768
7769 if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
7770 return -EINVAL;
7771
7772 mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
7773 cookie = __cookie_v4_init_sequence(iph, th, &mss);
7774
7775 return cookie | ((u64)mss << 32);
7776 }
7777
7778 static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
7779 .func = bpf_tcp_raw_gen_syncookie_ipv4,
7780 .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */
7781 .pkt_access = true,
7782 .ret_type = RET_INTEGER,
7783 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7784 .arg1_size = sizeof(struct iphdr),
7785 .arg2_type = ARG_PTR_TO_MEM,
7786 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7787 };
7788
7789 BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
7790 struct tcphdr *, th, u32, th_len)
7791 {
7792 #if IS_BUILTIN(CONFIG_IPV6)
7793 const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
7794 sizeof(struct ipv6hdr);
7795 u32 cookie;
7796 u16 mss;
7797
7798 if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
7799 return -EINVAL;
7800
7801 mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
7802 cookie = __cookie_v6_init_sequence(iph, th, &mss);
7803
7804 return cookie | ((u64)mss << 32);
7805 #else
7806 return -EPROTONOSUPPORT;
7807 #endif
7808 }
7809
7810 static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
7811 .func = bpf_tcp_raw_gen_syncookie_ipv6,
7812 .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */
7813 .pkt_access = true,
7814 .ret_type = RET_INTEGER,
7815 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7816 .arg1_size = sizeof(struct ipv6hdr),
7817 .arg2_type = ARG_PTR_TO_MEM,
7818 .arg3_type = ARG_CONST_SIZE_OR_ZERO,
7819 };
7820
7821 BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
7822 struct tcphdr *, th)
7823 {
7824 u32 cookie = ntohl(th->ack_seq) - 1;
7825
7826 if (__cookie_v4_check(iph, th, cookie) > 0)
7827 return 0;
7828
7829 return -EACCES;
7830 }
7831
7832 static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
7833 .func = bpf_tcp_raw_check_syncookie_ipv4,
7834 .gpl_only = true, /* __cookie_v4_check is GPL */
7835 .pkt_access = true,
7836 .ret_type = RET_INTEGER,
7837 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7838 .arg1_size = sizeof(struct iphdr),
7839 .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7840 .arg2_size = sizeof(struct tcphdr),
7841 };
7842
7843 BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
7844 struct tcphdr *, th)
7845 {
7846 #if IS_BUILTIN(CONFIG_IPV6)
7847 u32 cookie = ntohl(th->ack_seq) - 1;
7848
7849 if (__cookie_v6_check(iph, th, cookie) > 0)
7850 return 0;
7851
7852 return -EACCES;
7853 #else
7854 return -EPROTONOSUPPORT;
7855 #endif
7856 }
7857
7858 static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
7859 .func = bpf_tcp_raw_check_syncookie_ipv6,
7860 .gpl_only = true, /* __cookie_v6_check is GPL */
7861 .pkt_access = true,
7862 .ret_type = RET_INTEGER,
7863 .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7864 .arg1_size = sizeof(struct ipv6hdr),
7865 .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
7866 .arg2_size = sizeof(struct tcphdr),
7867 };
7868 #endif /* CONFIG_SYN_COOKIES */
7869
7870 #endif /* CONFIG_INET */
7871
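/* Return true for helpers that may write or resize packet data (or tail
 * call into a program that could), so the verifier invalidates any packet
 * pointers the program holds across the call.
 */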
7872 bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
7873 {
7874 switch (func_id) {
7875 case BPF_FUNC_clone_redirect:
7876 case BPF_FUNC_l3_csum_replace:
7877 case BPF_FUNC_l4_csum_replace:
7878 case BPF_FUNC_lwt_push_encap:
7879 case BPF_FUNC_lwt_seg6_action:
7880 case BPF_FUNC_lwt_seg6_adjust_srh:
7881 case BPF_FUNC_lwt_seg6_store_bytes:
7882 case BPF_FUNC_msg_pop_data:
7883 case BPF_FUNC_msg_pull_data:
7884 case BPF_FUNC_msg_push_data:
7885 case BPF_FUNC_skb_adjust_room:
7886 case BPF_FUNC_skb_change_head:
7887 case BPF_FUNC_skb_change_proto:
7888 case BPF_FUNC_skb_change_tail:
7889 case BPF_FUNC_skb_pull_data:
7890 case BPF_FUNC_skb_store_bytes:
7891 case BPF_FUNC_skb_vlan_pop:
7892 case BPF_FUNC_skb_vlan_push:
7893 case BPF_FUNC_store_hdr_opt:
7894 case BPF_FUNC_xdp_adjust_head:
7895 case BPF_FUNC_xdp_adjust_meta:
7896 case BPF_FUNC_xdp_adjust_tail:
7897 /* tail-called program could call any of the above */
7898 case BPF_FUNC_tail_call:
7899 return true;
7900 default:
7901 return false;
7902 }
7903 }
7904
7905 const struct bpf_func_proto bpf_event_output_data_proto __weak;
7906 const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
7907
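/* The *_func_proto() callbacks below select the bpf_func_proto for a given
 * helper id per program/attach type, falling back to the shared cgroup and
 * base protos when the helper is not program-type specific.
 */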
7908 static const struct bpf_func_proto *
7909 sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
7910 {
7911 const struct bpf_func_proto *func_proto;
7912
7913 func_proto = cgroup_common_func_proto(func_id, prog);
7914 if (func_proto)
7915 return func_proto;
7916
7917 func_proto = cgroup_current_func_proto(func_id, prog);
7918 if (func_proto)
7919 return func_proto;
7920
7921 switch (func_id) {
7922 case BPF_FUNC_get_socket_cookie:
7923 return &bpf_get_socket_cookie_sock_proto;
7924 case BPF_FUNC_get_netns_cookie:
7925 return &bpf_get_netns_cookie_sock_proto;
7926 case BPF_FUNC_perf_event_output:
7927 return &bpf_event_output_data_proto;
7928 case BPF_FUNC_sk_storage_get:
7929 return &bpf_sk_storage_get_cg_sock_proto;
7930 case BPF_FUNC_ktime_get_coarse_ns:
7931 return &bpf_ktime_get_coarse_ns_proto;
7932 default:
7933 return bpf_base_func_proto(func_id);
7934 }
7935 }
7936
7937 static const struct bpf_func_proto *
7938 sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
7939 {
7940 const struct bpf_func_proto *func_proto;
7941
7942 func_proto = cgroup_common_func_proto(func_id, prog);
7943 if (func_proto)
7944 return func_proto;
7945
7946 func_proto = cgroup_current_func_proto(func_id, prog);
7947 if (func_proto)
7948 return func_proto;
7949
7950 switch (func_id) {
7951 case BPF_FUNC_bind:
7952 switch (prog->expected_attach_type) {
7953 case BPF_CGROUP_INET4_CONNECT:
7954 case BPF_CGROUP_INET6_CONNECT:
7955 return &bpf_bind_proto;
7956 default:
7957 return NULL;
7958 }
7959 case BPF_FUNC_get_socket_cookie:
7960 return &bpf_get_socket_cookie_sock_addr_proto;
7961 case BPF_FUNC_get_netns_cookie:
7962 return &bpf_get_netns_cookie_sock_addr_proto;
7963 case BPF_FUNC_perf_event_output:
7964 return &bpf_event_output_data_proto;
7965 #ifdef CONFIG_INET
7966 case BPF_FUNC_sk_lookup_tcp:
7967 return &bpf_sock_addr_sk_lookup_tcp_proto;
7968 case BPF_FUNC_sk_lookup_udp:
7969 return &bpf_sock_addr_sk_lookup_udp_proto;
7970 case BPF_FUNC_sk_release:
7971 return &bpf_sk_release_proto;
7972 case BPF_FUNC_skc_lookup_tcp:
7973 return &bpf_sock_addr_skc_lookup_tcp_proto;
7974 #endif /* CONFIG_INET */
7975 case BPF_FUNC_sk_storage_get:
7976 return &bpf_sk_storage_get_proto;
7977 case BPF_FUNC_sk_storage_delete:
7978 return &bpf_sk_storage_delete_proto;
7979 case BPF_FUNC_setsockopt:
7980 switch (prog->expected_attach_type) {
7981 case BPF_CGROUP_INET4_BIND:
7982 case BPF_CGROUP_INET6_BIND:
7983 case BPF_CGROUP_INET4_CONNECT:
7984 case BPF_CGROUP_INET6_CONNECT:
7985 case BPF_CGROUP_UDP4_RECVMSG:
7986 case BPF_CGROUP_UDP6_RECVMSG:
7987 case BPF_CGROUP_UDP4_SENDMSG:
7988 case BPF_CGROUP_UDP6_SENDMSG:
7989 case BPF_CGROUP_INET4_GETPEERNAME:
7990 case BPF_CGROUP_INET6_GETPEERNAME:
7991 case BPF_CGROUP_INET4_GETSOCKNAME:
7992 case BPF_CGROUP_INET6_GETSOCKNAME:
7993 return &bpf_sock_addr_setsockopt_proto;
7994 default:
7995 return NULL;
7996 }
7997 case BPF_FUNC_getsockopt:
7998 switch (prog->expected_attach_type) {
7999 case BPF_CGROUP_INET4_BIND:
8000 case BPF_CGROUP_INET6_BIND:
8001 case BPF_CGROUP_INET4_CONNECT:
8002 case BPF_CGROUP_INET6_CONNECT:
8003 case BPF_CGROUP_UDP4_RECVMSG:
8004 case BPF_CGROUP_UDP6_RECVMSG:
8005 case BPF_CGROUP_UDP4_SENDMSG:
8006 case BPF_CGROUP_UDP6_SENDMSG:
8007 case BPF_CGROUP_INET4_GETPEERNAME:
8008 case BPF_CGROUP_INET6_GETPEERNAME:
8009 case BPF_CGROUP_INET4_GETSOCKNAME:
8010 case BPF_CGROUP_INET6_GETSOCKNAME:
8011 return &bpf_sock_addr_getsockopt_proto;
8012 default:
8013 return NULL;
8014 }
8015 default:
8016 return bpf_sk_base_func_proto(func_id);
8017 }
8018 }
8019
8020 static const struct bpf_func_proto *
8021 sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8022 {
8023 switch (func_id) {
8024 case BPF_FUNC_skb_load_bytes:
8025 return &bpf_skb_load_bytes_proto;
8026 case BPF_FUNC_skb_load_bytes_relative:
8027 return &bpf_skb_load_bytes_relative_proto;
8028 case BPF_FUNC_get_socket_cookie:
8029 return &bpf_get_socket_cookie_proto;
8030 case BPF_FUNC_get_socket_uid:
8031 return &bpf_get_socket_uid_proto;
8032 case BPF_FUNC_perf_event_output:
8033 return &bpf_skb_event_output_proto;
8034 default:
8035 return bpf_sk_base_func_proto(func_id);
8036 }
8037 }
8038
8039 const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
8040 const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
8041
8042 static const struct bpf_func_proto *
8043 cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8044 {
8045 const struct bpf_func_proto *func_proto;
8046
8047 func_proto = cgroup_common_func_proto(func_id, prog);
8048 if (func_proto)
8049 return func_proto;
8050
8051 switch (func_id) {
8052 case BPF_FUNC_sk_fullsock:
8053 return &bpf_sk_fullsock_proto;
8054 case BPF_FUNC_sk_storage_get:
8055 return &bpf_sk_storage_get_proto;
8056 case BPF_FUNC_sk_storage_delete:
8057 return &bpf_sk_storage_delete_proto;
8058 case BPF_FUNC_perf_event_output:
8059 return &bpf_skb_event_output_proto;
8060 #ifdef CONFIG_SOCK_CGROUP_DATA
8061 case BPF_FUNC_skb_cgroup_id:
8062 return &bpf_skb_cgroup_id_proto;
8063 case BPF_FUNC_skb_ancestor_cgroup_id:
8064 return &bpf_skb_ancestor_cgroup_id_proto;
8065 case BPF_FUNC_sk_cgroup_id:
8066 return &bpf_sk_cgroup_id_proto;
8067 case BPF_FUNC_sk_ancestor_cgroup_id:
8068 return &bpf_sk_ancestor_cgroup_id_proto;
8069 #endif
8070 #ifdef CONFIG_INET
8071 case BPF_FUNC_sk_lookup_tcp:
8072 return &bpf_sk_lookup_tcp_proto;
8073 case BPF_FUNC_sk_lookup_udp:
8074 return &bpf_sk_lookup_udp_proto;
8075 case BPF_FUNC_sk_release:
8076 return &bpf_sk_release_proto;
8077 case BPF_FUNC_skc_lookup_tcp:
8078 return &bpf_skc_lookup_tcp_proto;
8079 case BPF_FUNC_tcp_sock:
8080 return &bpf_tcp_sock_proto;
8081 case BPF_FUNC_get_listener_sock:
8082 return &bpf_get_listener_sock_proto;
8083 case BPF_FUNC_skb_ecn_set_ce:
8084 return &bpf_skb_ecn_set_ce_proto;
8085 #endif
8086 default:
8087 return sk_filter_func_proto(func_id, prog);
8088 }
8089 }
8090
8091 static const struct bpf_func_proto *
8092 tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8093 {
8094 switch (func_id) {
8095 case BPF_FUNC_skb_store_bytes:
8096 return &bpf_skb_store_bytes_proto;
8097 case BPF_FUNC_skb_load_bytes:
8098 return &bpf_skb_load_bytes_proto;
8099 case BPF_FUNC_skb_load_bytes_relative:
8100 return &bpf_skb_load_bytes_relative_proto;
8101 case BPF_FUNC_skb_pull_data:
8102 return &bpf_skb_pull_data_proto;
8103 case BPF_FUNC_csum_diff:
8104 return &bpf_csum_diff_proto;
8105 case BPF_FUNC_csum_update:
8106 return &bpf_csum_update_proto;
8107 case BPF_FUNC_csum_level:
8108 return &bpf_csum_level_proto;
8109 case BPF_FUNC_l3_csum_replace:
8110 return &bpf_l3_csum_replace_proto;
8111 case BPF_FUNC_l4_csum_replace:
8112 return &bpf_l4_csum_replace_proto;
8113 case BPF_FUNC_clone_redirect:
8114 return &bpf_clone_redirect_proto;
8115 case BPF_FUNC_get_cgroup_classid:
8116 return &bpf_get_cgroup_classid_proto;
8117 case BPF_FUNC_skb_vlan_push:
8118 return &bpf_skb_vlan_push_proto;
8119 case BPF_FUNC_skb_vlan_pop:
8120 return &bpf_skb_vlan_pop_proto;
8121 case BPF_FUNC_skb_change_proto:
8122 return &bpf_skb_change_proto_proto;
8123 case BPF_FUNC_skb_change_type:
8124 return &bpf_skb_change_type_proto;
8125 case BPF_FUNC_skb_adjust_room:
8126 return &bpf_skb_adjust_room_proto;
8127 case BPF_FUNC_skb_change_tail:
8128 return &bpf_skb_change_tail_proto;
8129 case BPF_FUNC_skb_change_head:
8130 return &bpf_skb_change_head_proto;
8131 case BPF_FUNC_skb_get_tunnel_key:
8132 return &bpf_skb_get_tunnel_key_proto;
8133 case BPF_FUNC_skb_set_tunnel_key:
8134 return bpf_get_skb_set_tunnel_proto(func_id);
8135 case BPF_FUNC_skb_get_tunnel_opt:
8136 return &bpf_skb_get_tunnel_opt_proto;
8137 case BPF_FUNC_skb_set_tunnel_opt:
8138 return bpf_get_skb_set_tunnel_proto(func_id);
8139 case BPF_FUNC_redirect:
8140 return &bpf_redirect_proto;
8141 case BPF_FUNC_redirect_neigh:
8142 return &bpf_redirect_neigh_proto;
8143 case BPF_FUNC_redirect_peer:
8144 return &bpf_redirect_peer_proto;
8145 case BPF_FUNC_get_route_realm:
8146 return &bpf_get_route_realm_proto;
8147 case BPF_FUNC_get_hash_recalc:
8148 return &bpf_get_hash_recalc_proto;
8149 case BPF_FUNC_set_hash_invalid:
8150 return &bpf_set_hash_invalid_proto;
8151 case BPF_FUNC_set_hash:
8152 return &bpf_set_hash_proto;
8153 case BPF_FUNC_perf_event_output:
8154 return &bpf_skb_event_output_proto;
8155 case BPF_FUNC_get_smp_processor_id:
8156 return &bpf_get_smp_processor_id_proto;
8157 case BPF_FUNC_skb_under_cgroup:
8158 return &bpf_skb_under_cgroup_proto;
8159 case BPF_FUNC_get_socket_cookie:
8160 return &bpf_get_socket_cookie_proto;
8161 case BPF_FUNC_get_socket_uid:
8162 return &bpf_get_socket_uid_proto;
8163 case BPF_FUNC_fib_lookup:
8164 return &bpf_skb_fib_lookup_proto;
8165 case BPF_FUNC_check_mtu:
8166 return &bpf_skb_check_mtu_proto;
8167 case BPF_FUNC_sk_fullsock:
8168 return &bpf_sk_fullsock_proto;
8169 case BPF_FUNC_sk_storage_get:
8170 return &bpf_sk_storage_get_proto;
8171 case BPF_FUNC_sk_storage_delete:
8172 return &bpf_sk_storage_delete_proto;
8173 #ifdef CONFIG_XFRM
8174 case BPF_FUNC_skb_get_xfrm_state:
8175 return &bpf_skb_get_xfrm_state_proto;
8176 #endif
8177 #ifdef CONFIG_CGROUP_NET_CLASSID
8178 case BPF_FUNC_skb_cgroup_classid:
8179 return &bpf_skb_cgroup_classid_proto;
8180 #endif
8181 #ifdef CONFIG_SOCK_CGROUP_DATA
8182 case BPF_FUNC_skb_cgroup_id:
8183 return &bpf_skb_cgroup_id_proto;
8184 case BPF_FUNC_skb_ancestor_cgroup_id:
8185 return &bpf_skb_ancestor_cgroup_id_proto;
8186 #endif
8187 #ifdef CONFIG_INET
8188 case BPF_FUNC_sk_lookup_tcp:
8189 return &bpf_tc_sk_lookup_tcp_proto;
8190 case BPF_FUNC_sk_lookup_udp:
8191 return &bpf_tc_sk_lookup_udp_proto;
8192 case BPF_FUNC_sk_release:
8193 return &bpf_sk_release_proto;
8194 case BPF_FUNC_tcp_sock:
8195 return &bpf_tcp_sock_proto;
8196 case BPF_FUNC_get_listener_sock:
8197 return &bpf_get_listener_sock_proto;
8198 case BPF_FUNC_skc_lookup_tcp:
8199 return &bpf_tc_skc_lookup_tcp_proto;
8200 case BPF_FUNC_tcp_check_syncookie:
8201 return &bpf_tcp_check_syncookie_proto;
8202 case BPF_FUNC_skb_ecn_set_ce:
8203 return &bpf_skb_ecn_set_ce_proto;
8204 case BPF_FUNC_tcp_gen_syncookie:
8205 return &bpf_tcp_gen_syncookie_proto;
8206 case BPF_FUNC_sk_assign:
8207 return &bpf_sk_assign_proto;
8208 case BPF_FUNC_skb_set_tstamp:
8209 return &bpf_skb_set_tstamp_proto;
8210 #ifdef CONFIG_SYN_COOKIES
8211 case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
8212 return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
8213 case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
8214 return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
8215 case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
8216 return &bpf_tcp_raw_check_syncookie_ipv4_proto;
8217 case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
8218 return &bpf_tcp_raw_check_syncookie_ipv6_proto;
8219 #endif
8220 #endif
8221 default:
8222 return bpf_sk_base_func_proto(func_id);
8223 }
8224 }
8225
8226 static const struct bpf_func_proto *
8227 xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8228 {
8229 switch (func_id) {
8230 case BPF_FUNC_perf_event_output:
8231 return &bpf_xdp_event_output_proto;
8232 case BPF_FUNC_get_smp_processor_id:
8233 return &bpf_get_smp_processor_id_proto;
8234 case BPF_FUNC_csum_diff:
8235 return &bpf_csum_diff_proto;
8236 case BPF_FUNC_xdp_adjust_head:
8237 return &bpf_xdp_adjust_head_proto;
8238 case BPF_FUNC_xdp_adjust_meta:
8239 return &bpf_xdp_adjust_meta_proto;
8240 case BPF_FUNC_redirect:
8241 return &bpf_xdp_redirect_proto;
8242 case BPF_FUNC_redirect_map:
8243 return &bpf_xdp_redirect_map_proto;
8244 case BPF_FUNC_xdp_adjust_tail:
8245 return &bpf_xdp_adjust_tail_proto;
8246 case BPF_FUNC_xdp_get_buff_len:
8247 return &bpf_xdp_get_buff_len_proto;
8248 case BPF_FUNC_xdp_load_bytes:
8249 return &bpf_xdp_load_bytes_proto;
8250 case BPF_FUNC_xdp_store_bytes:
8251 return &bpf_xdp_store_bytes_proto;
8252 case BPF_FUNC_fib_lookup:
8253 return &bpf_xdp_fib_lookup_proto;
8254 case BPF_FUNC_check_mtu:
8255 return &bpf_xdp_check_mtu_proto;
8256 #ifdef CONFIG_INET
8257 case BPF_FUNC_sk_lookup_udp:
8258 return &bpf_xdp_sk_lookup_udp_proto;
8259 case BPF_FUNC_sk_lookup_tcp:
8260 return &bpf_xdp_sk_lookup_tcp_proto;
8261 case BPF_FUNC_sk_release:
8262 return &bpf_sk_release_proto;
8263 case BPF_FUNC_skc_lookup_tcp:
8264 return &bpf_xdp_skc_lookup_tcp_proto;
8265 case BPF_FUNC_tcp_check_syncookie:
8266 return &bpf_tcp_check_syncookie_proto;
8267 case BPF_FUNC_tcp_gen_syncookie:
8268 return &bpf_tcp_gen_syncookie_proto;
8269 #ifdef CONFIG_SYN_COOKIES
8270 case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
8271 return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
8272 case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
8273 return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
8274 case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
8275 return &bpf_tcp_raw_check_syncookie_ipv4_proto;
8276 case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
8277 return &bpf_tcp_raw_check_syncookie_ipv6_proto;
8278 #endif
8279 #endif
8280 default:
8281 return bpf_sk_base_func_proto(func_id);
8282 }
8283
8284 #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
8285 /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
8286 * kfuncs are defined in two different modules, and we want to be able
8287 	 * to use them interchangeably with the same BTF type ID. Because modules
8288 * can't de-duplicate BTF IDs between each other, we need the type to be
8289 * referenced in the vmlinux BTF or the verifier will get confused about
8290 * the different types. So we add this dummy type reference which will
8291 * be included in vmlinux BTF, allowing both modules to refer to the
8292 * same type ID.
8293 */
8294 BTF_TYPE_EMIT(struct nf_conn___init);
8295 #endif
8296 }
8297
8298 const struct bpf_func_proto bpf_sock_map_update_proto __weak;
8299 const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
8300
8301 static const struct bpf_func_proto *
8302 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8303 {
8304 const struct bpf_func_proto *func_proto;
8305
8306 func_proto = cgroup_common_func_proto(func_id, prog);
8307 if (func_proto)
8308 return func_proto;
8309
8310 switch (func_id) {
8311 case BPF_FUNC_setsockopt:
8312 return &bpf_sock_ops_setsockopt_proto;
8313 case BPF_FUNC_getsockopt:
8314 return &bpf_sock_ops_getsockopt_proto;
8315 case BPF_FUNC_sock_ops_cb_flags_set:
8316 return &bpf_sock_ops_cb_flags_set_proto;
8317 case BPF_FUNC_sock_map_update:
8318 return &bpf_sock_map_update_proto;
8319 case BPF_FUNC_sock_hash_update:
8320 return &bpf_sock_hash_update_proto;
8321 case BPF_FUNC_get_socket_cookie:
8322 return &bpf_get_socket_cookie_sock_ops_proto;
8323 case BPF_FUNC_perf_event_output:
8324 return &bpf_event_output_data_proto;
8325 case BPF_FUNC_sk_storage_get:
8326 return &bpf_sk_storage_get_proto;
8327 case BPF_FUNC_sk_storage_delete:
8328 return &bpf_sk_storage_delete_proto;
8329 case BPF_FUNC_get_netns_cookie:
8330 return &bpf_get_netns_cookie_sock_ops_proto;
8331 #ifdef CONFIG_INET
8332 case BPF_FUNC_load_hdr_opt:
8333 return &bpf_sock_ops_load_hdr_opt_proto;
8334 case BPF_FUNC_store_hdr_opt:
8335 return &bpf_sock_ops_store_hdr_opt_proto;
8336 case BPF_FUNC_reserve_hdr_opt:
8337 return &bpf_sock_ops_reserve_hdr_opt_proto;
8338 case BPF_FUNC_tcp_sock:
8339 return &bpf_tcp_sock_proto;
8340 #endif /* CONFIG_INET */
8341 default:
8342 return bpf_sk_base_func_proto(func_id);
8343 }
8344 }
8345
8346 const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
8347 const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
8348
8349 static const struct bpf_func_proto *
8350 sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8351 {
8352 switch (func_id) {
8353 case BPF_FUNC_msg_redirect_map:
8354 return &bpf_msg_redirect_map_proto;
8355 case BPF_FUNC_msg_redirect_hash:
8356 return &bpf_msg_redirect_hash_proto;
8357 case BPF_FUNC_msg_apply_bytes:
8358 return &bpf_msg_apply_bytes_proto;
8359 case BPF_FUNC_msg_cork_bytes:
8360 return &bpf_msg_cork_bytes_proto;
8361 case BPF_FUNC_msg_pull_data:
8362 return &bpf_msg_pull_data_proto;
8363 case BPF_FUNC_msg_push_data:
8364 return &bpf_msg_push_data_proto;
8365 case BPF_FUNC_msg_pop_data:
8366 return &bpf_msg_pop_data_proto;
8367 case BPF_FUNC_perf_event_output:
8368 return &bpf_event_output_data_proto;
8369 case BPF_FUNC_get_current_uid_gid:
8370 return &bpf_get_current_uid_gid_proto;
8371 case BPF_FUNC_get_current_pid_tgid:
8372 return &bpf_get_current_pid_tgid_proto;
8373 case BPF_FUNC_sk_storage_get:
8374 return &bpf_sk_storage_get_proto;
8375 case BPF_FUNC_sk_storage_delete:
8376 return &bpf_sk_storage_delete_proto;
8377 case BPF_FUNC_get_netns_cookie:
8378 return &bpf_get_netns_cookie_sk_msg_proto;
8379 #ifdef CONFIG_CGROUP_NET_CLASSID
8380 case BPF_FUNC_get_cgroup_classid:
8381 return &bpf_get_cgroup_classid_curr_proto;
8382 #endif
8383 default:
8384 return bpf_sk_base_func_proto(func_id);
8385 }
8386 }
8387
8388 const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
8389 const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
8390
8391 static const struct bpf_func_proto *
8392 sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8393 {
8394 switch (func_id) {
8395 case BPF_FUNC_skb_store_bytes:
8396 return &bpf_skb_store_bytes_proto;
8397 case BPF_FUNC_skb_load_bytes:
8398 return &bpf_skb_load_bytes_proto;
8399 case BPF_FUNC_skb_pull_data:
8400 return &sk_skb_pull_data_proto;
8401 case BPF_FUNC_skb_change_tail:
8402 return &sk_skb_change_tail_proto;
8403 case BPF_FUNC_skb_change_head:
8404 return &sk_skb_change_head_proto;
8405 case BPF_FUNC_skb_adjust_room:
8406 return &sk_skb_adjust_room_proto;
8407 case BPF_FUNC_get_socket_cookie:
8408 return &bpf_get_socket_cookie_proto;
8409 case BPF_FUNC_get_socket_uid:
8410 return &bpf_get_socket_uid_proto;
8411 case BPF_FUNC_sk_redirect_map:
8412 return &bpf_sk_redirect_map_proto;
8413 case BPF_FUNC_sk_redirect_hash:
8414 return &bpf_sk_redirect_hash_proto;
8415 case BPF_FUNC_perf_event_output:
8416 return &bpf_skb_event_output_proto;
8417 #ifdef CONFIG_INET
8418 case BPF_FUNC_sk_lookup_tcp:
8419 return &bpf_sk_lookup_tcp_proto;
8420 case BPF_FUNC_sk_lookup_udp:
8421 return &bpf_sk_lookup_udp_proto;
8422 case BPF_FUNC_sk_release:
8423 return &bpf_sk_release_proto;
8424 case BPF_FUNC_skc_lookup_tcp:
8425 return &bpf_skc_lookup_tcp_proto;
8426 #endif
8427 default:
8428 return bpf_sk_base_func_proto(func_id);
8429 }
8430 }
8431
8432 static const struct bpf_func_proto *
8433 flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8434 {
8435 switch (func_id) {
8436 case BPF_FUNC_skb_load_bytes:
8437 return &bpf_flow_dissector_load_bytes_proto;
8438 default:
8439 return bpf_sk_base_func_proto(func_id);
8440 }
8441 }
8442
8443 static const struct bpf_func_proto *
8444 lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8445 {
8446 switch (func_id) {
8447 case BPF_FUNC_skb_load_bytes:
8448 return &bpf_skb_load_bytes_proto;
8449 case BPF_FUNC_skb_pull_data:
8450 return &bpf_skb_pull_data_proto;
8451 case BPF_FUNC_csum_diff:
8452 return &bpf_csum_diff_proto;
8453 case BPF_FUNC_get_cgroup_classid:
8454 return &bpf_get_cgroup_classid_proto;
8455 case BPF_FUNC_get_route_realm:
8456 return &bpf_get_route_realm_proto;
8457 case BPF_FUNC_get_hash_recalc:
8458 return &bpf_get_hash_recalc_proto;
8459 case BPF_FUNC_perf_event_output:
8460 return &bpf_skb_event_output_proto;
8461 case BPF_FUNC_get_smp_processor_id:
8462 return &bpf_get_smp_processor_id_proto;
8463 case BPF_FUNC_skb_under_cgroup:
8464 return &bpf_skb_under_cgroup_proto;
8465 default:
8466 return bpf_sk_base_func_proto(func_id);
8467 }
8468 }
8469
8470 static const struct bpf_func_proto *
8471 lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8472 {
8473 switch (func_id) {
8474 case BPF_FUNC_lwt_push_encap:
8475 return &bpf_lwt_in_push_encap_proto;
8476 default:
8477 return lwt_out_func_proto(func_id, prog);
8478 }
8479 }
8480
8481 static const struct bpf_func_proto *
8482 lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8483 {
8484 switch (func_id) {
8485 case BPF_FUNC_skb_get_tunnel_key:
8486 return &bpf_skb_get_tunnel_key_proto;
8487 case BPF_FUNC_skb_set_tunnel_key:
8488 return bpf_get_skb_set_tunnel_proto(func_id);
8489 case BPF_FUNC_skb_get_tunnel_opt:
8490 return &bpf_skb_get_tunnel_opt_proto;
8491 case BPF_FUNC_skb_set_tunnel_opt:
8492 return bpf_get_skb_set_tunnel_proto(func_id);
8493 case BPF_FUNC_redirect:
8494 return &bpf_redirect_proto;
8495 case BPF_FUNC_clone_redirect:
8496 return &bpf_clone_redirect_proto;
8497 case BPF_FUNC_skb_change_tail:
8498 return &bpf_skb_change_tail_proto;
8499 case BPF_FUNC_skb_change_head:
8500 return &bpf_skb_change_head_proto;
8501 case BPF_FUNC_skb_store_bytes:
8502 return &bpf_skb_store_bytes_proto;
8503 case BPF_FUNC_csum_update:
8504 return &bpf_csum_update_proto;
8505 case BPF_FUNC_csum_level:
8506 return &bpf_csum_level_proto;
8507 case BPF_FUNC_l3_csum_replace:
8508 return &bpf_l3_csum_replace_proto;
8509 case BPF_FUNC_l4_csum_replace:
8510 return &bpf_l4_csum_replace_proto;
8511 case BPF_FUNC_set_hash_invalid:
8512 return &bpf_set_hash_invalid_proto;
8513 case BPF_FUNC_lwt_push_encap:
8514 return &bpf_lwt_xmit_push_encap_proto;
8515 default:
8516 return lwt_out_func_proto(func_id, prog);
8517 }
8518 }
8519
8520 static const struct bpf_func_proto *
8521 lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
8522 {
8523 switch (func_id) {
8524 #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
8525 case BPF_FUNC_lwt_seg6_store_bytes:
8526 return &bpf_lwt_seg6_store_bytes_proto;
8527 case BPF_FUNC_lwt_seg6_action:
8528 return &bpf_lwt_seg6_action_proto;
8529 case BPF_FUNC_lwt_seg6_adjust_srh:
8530 return &bpf_lwt_seg6_adjust_srh_proto;
8531 #endif
8532 default:
8533 return lwt_out_func_proto(func_id, prog);
8534 }
8535 }
8536
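/* Common __sk_buff access checks shared by the skb-based program types:
 * offsets must be aligned and within struct __sk_buff, flow_keys and the
 * padding after tstamp_type are never accessible, tstamp/hwtstamp/sk
 * require 64-bit access, and other fields default to 32-bit with only
 * narrow reads allowed.
 */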
8537 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
8538 const struct bpf_prog *prog,
8539 struct bpf_insn_access_aux *info)
8540 {
8541 const int size_default = sizeof(__u32);
8542
8543 if (off < 0 || off >= sizeof(struct __sk_buff))
8544 return false;
8545
8546 /* The verifier guarantees that size > 0. */
8547 if (off % size != 0)
8548 return false;
8549
8550 switch (off) {
8551 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8552 if (off + size > offsetofend(struct __sk_buff, cb[4]))
8553 return false;
8554 break;
8555 case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
8556 case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
8557 case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
8558 case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
8559 case bpf_ctx_range(struct __sk_buff, data):
8560 case bpf_ctx_range(struct __sk_buff, data_meta):
8561 case bpf_ctx_range(struct __sk_buff, data_end):
8562 if (size != size_default)
8563 return false;
8564 break;
8565 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
8566 return false;
8567 case bpf_ctx_range(struct __sk_buff, hwtstamp):
8568 if (type == BPF_WRITE || size != sizeof(__u64))
8569 return false;
8570 break;
8571 case bpf_ctx_range(struct __sk_buff, tstamp):
8572 if (size != sizeof(__u64))
8573 return false;
8574 break;
8575 case offsetof(struct __sk_buff, sk):
8576 if (type == BPF_WRITE || size != sizeof(__u64))
8577 return false;
8578 info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
8579 break;
8580 case offsetof(struct __sk_buff, tstamp_type):
8581 return false;
8582 case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
8583 /* Explicitly prohibit access to padding in __sk_buff. */
8584 return false;
8585 default:
8586 /* Only narrow read access allowed for now. */
8587 if (type == BPF_WRITE) {
8588 if (size != size_default)
8589 return false;
8590 } else {
8591 bpf_ctx_record_field_size(info, size_default);
8592 if (!bpf_ctx_narrow_access_ok(off, size, size_default))
8593 return false;
8594 }
8595 }
8596
8597 return true;
8598 }
8599
8600 static bool sk_filter_is_valid_access(int off, int size,
8601 enum bpf_access_type type,
8602 const struct bpf_prog *prog,
8603 struct bpf_insn_access_aux *info)
8604 {
8605 switch (off) {
8606 case bpf_ctx_range(struct __sk_buff, tc_classid):
8607 case bpf_ctx_range(struct __sk_buff, data):
8608 case bpf_ctx_range(struct __sk_buff, data_meta):
8609 case bpf_ctx_range(struct __sk_buff, data_end):
8610 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
8611 case bpf_ctx_range(struct __sk_buff, tstamp):
8612 case bpf_ctx_range(struct __sk_buff, wire_len):
8613 case bpf_ctx_range(struct __sk_buff, hwtstamp):
8614 return false;
8615 }
8616
8617 if (type == BPF_WRITE) {
8618 switch (off) {
8619 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8620 break;
8621 default:
8622 return false;
8623 }
8624 }
8625
8626 return bpf_skb_is_valid_access(off, size, type, prog, info);
8627 }
8628
8629 static bool cg_skb_is_valid_access(int off, int size,
8630 enum bpf_access_type type,
8631 const struct bpf_prog *prog,
8632 struct bpf_insn_access_aux *info)
8633 {
8634 switch (off) {
8635 case bpf_ctx_range(struct __sk_buff, tc_classid):
8636 case bpf_ctx_range(struct __sk_buff, data_meta):
8637 case bpf_ctx_range(struct __sk_buff, wire_len):
8638 return false;
8639 case bpf_ctx_range(struct __sk_buff, data):
8640 case bpf_ctx_range(struct __sk_buff, data_end):
8641 if (!bpf_capable())
8642 return false;
8643 break;
8644 }
8645
8646 if (type == BPF_WRITE) {
8647 switch (off) {
8648 case bpf_ctx_range(struct __sk_buff, mark):
8649 case bpf_ctx_range(struct __sk_buff, priority):
8650 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8651 break;
8652 case bpf_ctx_range(struct __sk_buff, tstamp):
8653 if (!bpf_capable())
8654 return false;
8655 break;
8656 default:
8657 return false;
8658 }
8659 }
8660
8661 switch (off) {
8662 case bpf_ctx_range(struct __sk_buff, data):
8663 info->reg_type = PTR_TO_PACKET;
8664 break;
8665 case bpf_ctx_range(struct __sk_buff, data_end):
8666 info->reg_type = PTR_TO_PACKET_END;
8667 break;
8668 }
8669
8670 return bpf_skb_is_valid_access(off, size, type, prog, info);
8671 }
8672
8673 static bool lwt_is_valid_access(int off, int size,
8674 enum bpf_access_type type,
8675 const struct bpf_prog *prog,
8676 struct bpf_insn_access_aux *info)
8677 {
8678 switch (off) {
8679 case bpf_ctx_range(struct __sk_buff, tc_classid):
8680 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
8681 case bpf_ctx_range(struct __sk_buff, data_meta):
8682 case bpf_ctx_range(struct __sk_buff, tstamp):
8683 case bpf_ctx_range(struct __sk_buff, wire_len):
8684 case bpf_ctx_range(struct __sk_buff, hwtstamp):
8685 return false;
8686 }
8687
8688 if (type == BPF_WRITE) {
8689 switch (off) {
8690 case bpf_ctx_range(struct __sk_buff, mark):
8691 case bpf_ctx_range(struct __sk_buff, priority):
8692 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8693 break;
8694 default:
8695 return false;
8696 }
8697 }
8698
8699 switch (off) {
8700 case bpf_ctx_range(struct __sk_buff, data):
8701 info->reg_type = PTR_TO_PACKET;
8702 break;
8703 case bpf_ctx_range(struct __sk_buff, data_end):
8704 info->reg_type = PTR_TO_PACKET_END;
8705 break;
8706 }
8707
8708 return bpf_skb_is_valid_access(off, size, type, prog, info);
8709 }
8710
8711 /* Attach type specific accesses */
8712 static bool __sock_filter_check_attach_type(int off,
8713 enum bpf_access_type access_type,
8714 enum bpf_attach_type attach_type)
8715 {
8716 switch (off) {
8717 case offsetof(struct bpf_sock, bound_dev_if):
8718 case offsetof(struct bpf_sock, mark):
8719 case offsetof(struct bpf_sock, priority):
8720 switch (attach_type) {
8721 case BPF_CGROUP_INET_SOCK_CREATE:
8722 case BPF_CGROUP_INET_SOCK_RELEASE:
8723 goto full_access;
8724 default:
8725 return false;
8726 }
8727 case bpf_ctx_range(struct bpf_sock, src_ip4):
8728 switch (attach_type) {
8729 case BPF_CGROUP_INET4_POST_BIND:
8730 goto read_only;
8731 default:
8732 return false;
8733 }
8734 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
8735 switch (attach_type) {
8736 case BPF_CGROUP_INET6_POST_BIND:
8737 goto read_only;
8738 default:
8739 return false;
8740 }
8741 case bpf_ctx_range(struct bpf_sock, src_port):
8742 switch (attach_type) {
8743 case BPF_CGROUP_INET4_POST_BIND:
8744 case BPF_CGROUP_INET6_POST_BIND:
8745 goto read_only;
8746 default:
8747 return false;
8748 }
8749 }
8750 read_only:
8751 return access_type == BPF_READ;
8752 full_access:
8753 return true;
8754 }
8755
8756 bool bpf_sock_common_is_valid_access(int off, int size,
8757 enum bpf_access_type type,
8758 struct bpf_insn_access_aux *info)
8759 {
8760 switch (off) {
8761 case bpf_ctx_range_till(struct bpf_sock, type, priority):
8762 return false;
8763 default:
8764 return bpf_sock_is_valid_access(off, size, type, info);
8765 }
8766 }
8767
8768 bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
8769 struct bpf_insn_access_aux *info)
8770 {
8771 const int size_default = sizeof(__u32);
8772 int field_size;
8773
8774 if (off < 0 || off >= sizeof(struct bpf_sock))
8775 return false;
8776 if (off % size != 0)
8777 return false;
8778
8779 switch (off) {
8780 case offsetof(struct bpf_sock, state):
8781 case offsetof(struct bpf_sock, family):
8782 case offsetof(struct bpf_sock, type):
8783 case offsetof(struct bpf_sock, protocol):
8784 case offsetof(struct bpf_sock, src_port):
8785 case offsetof(struct bpf_sock, rx_queue_mapping):
8786 case bpf_ctx_range(struct bpf_sock, src_ip4):
8787 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
8788 case bpf_ctx_range(struct bpf_sock, dst_ip4):
8789 case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
8790 bpf_ctx_record_field_size(info, size_default);
8791 return bpf_ctx_narrow_access_ok(off, size, size_default);
8792 case bpf_ctx_range(struct bpf_sock, dst_port):
8793 field_size = size == size_default ?
8794 size_default : sizeof_field(struct bpf_sock, dst_port);
8795 bpf_ctx_record_field_size(info, field_size);
8796 return bpf_ctx_narrow_access_ok(off, size, field_size);
8797 case offsetofend(struct bpf_sock, dst_port) ...
8798 offsetof(struct bpf_sock, dst_ip4) - 1:
8799 return false;
8800 }
8801
8802 return size == size_default;
8803 }
8804
8805 static bool sock_filter_is_valid_access(int off, int size,
8806 enum bpf_access_type type,
8807 const struct bpf_prog *prog,
8808 struct bpf_insn_access_aux *info)
8809 {
8810 if (!bpf_sock_is_valid_access(off, size, type, info))
8811 return false;
8812 return __sock_filter_check_attach_type(off, type,
8813 prog->expected_attach_type);
8814 }
8815
8816 static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
8817 const struct bpf_prog *prog)
8818 {
8819 /* Neither direct read nor direct write requires any preliminary
8820 * action.
8821 */
8822 return 0;
8823 }
8824
8825 static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
8826 const struct bpf_prog *prog, int drop_verdict)
8827 {
8828 struct bpf_insn *insn = insn_buf;
8829
8830 if (!direct_write)
8831 return 0;
8832
8833 /* if (!skb->cloned)
8834 * goto start;
8835 *
8836 	 * (Fast path; otherwise we approximate that the skb might be
8837 	 * a clone and do the rest in the helper.)
8838 */
8839 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
8840 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
8841 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
8842
8843 /* ret = bpf_skb_pull_data(skb, 0); */
8844 *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
8845 *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
8846 *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
8847 BPF_FUNC_skb_pull_data);
8848 /* if (!ret)
8849 * goto restore;
8850 * return TC_ACT_SHOT;
8851 */
8852 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
8853 *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
8854 *insn++ = BPF_EXIT_INSN();
8855
8856 /* restore: */
8857 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
8858 /* start: */
8859 *insn++ = prog->insnsi[0];
8860
8861 return insn - insn_buf;
8862 }
8863
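/* Expand the classic LD_ABS/LD_IND instructions into a call to the matching
 * bpf_skb_load_helper_*_no_cache() helper; a negative helper return zeroes
 * R0 and exits the program.
 */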
8864 static int bpf_gen_ld_abs(const struct bpf_insn *orig,
8865 struct bpf_insn *insn_buf)
8866 {
8867 bool indirect = BPF_MODE(orig->code) == BPF_IND;
8868 struct bpf_insn *insn = insn_buf;
8869
8870 if (!indirect) {
8871 *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
8872 } else {
8873 *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
8874 if (orig->imm)
8875 *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
8876 }
8877 /* We're guaranteed here that CTX is in R6. */
8878 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
8879
8880 switch (BPF_SIZE(orig->code)) {
8881 case BPF_B:
8882 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
8883 break;
8884 case BPF_H:
8885 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
8886 break;
8887 case BPF_W:
8888 *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
8889 break;
8890 }
8891
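/* A negative return from the load helper means the access failed:
 * clear R0 and exit, matching classic BPF semantics where a failed
 * LD_ABS/LD_IND aborts the program with a return value of 0.
 */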
8892 *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
8893 *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
8894 *insn++ = BPF_EXIT_INSN();
8895
8896 return insn - insn_buf;
8897 }
8898
8899 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
8900 const struct bpf_prog *prog)
8901 {
8902 return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
8903 }
8904
8905 static bool tc_cls_act_is_valid_access(int off, int size,
8906 enum bpf_access_type type,
8907 const struct bpf_prog *prog,
8908 struct bpf_insn_access_aux *info)
8909 {
8910 if (type == BPF_WRITE) {
8911 switch (off) {
8912 case bpf_ctx_range(struct __sk_buff, mark):
8913 case bpf_ctx_range(struct __sk_buff, tc_index):
8914 case bpf_ctx_range(struct __sk_buff, priority):
8915 case bpf_ctx_range(struct __sk_buff, tc_classid):
8916 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
8917 case bpf_ctx_range(struct __sk_buff, tstamp):
8918 case bpf_ctx_range(struct __sk_buff, queue_mapping):
8919 break;
8920 default:
8921 return false;
8922 }
8923 }
8924
8925 switch (off) {
8926 case bpf_ctx_range(struct __sk_buff, data):
8927 info->reg_type = PTR_TO_PACKET;
8928 break;
8929 case bpf_ctx_range(struct __sk_buff, data_meta):
8930 info->reg_type = PTR_TO_PACKET_META;
8931 break;
8932 case bpf_ctx_range(struct __sk_buff, data_end):
8933 info->reg_type = PTR_TO_PACKET_END;
8934 break;
8935 case bpf_ctx_range_till(struct __sk_buff, family, local_port):
8936 return false;
8937 case offsetof(struct __sk_buff, tstamp_type):
8938 /* The convert_ctx_access() on reading and writing
8939 * __sk_buff->tstamp depends on whether the bpf prog
8940 * has used __sk_buff->tstamp_type or not.
8941 * Thus, we need to set prog->tstamp_type_access
8942 * here, already during is_valid_access().
8943 */
8944 ((struct bpf_prog *)prog)->tstamp_type_access = 1;
8945 return size == sizeof(__u8);
8946 }
8947
8948 return bpf_skb_is_valid_access(off, size, type, prog, info);
8949 }
8950
8951 DEFINE_MUTEX(nf_conn_btf_access_lock);
8952 EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
8953
8954 int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
8955 const struct bpf_reg_state *reg,
8956 int off, int size);
8957 EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
8958
8959 static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
8960 const struct bpf_reg_state *reg,
8961 int off, int size)
8962 {
8963 int ret = -EACCES;
8964
8965 mutex_lock(&nf_conn_btf_access_lock);
8966 if (nfct_btf_struct_access)
8967 ret = nfct_btf_struct_access(log, reg, off, size);
8968 mutex_unlock(&nf_conn_btf_access_lock);
8969
8970 return ret;
8971 }
8972
8973 static bool __is_valid_xdp_access(int off, int size)
8974 {
8975 if (off < 0 || off >= sizeof(struct xdp_md))
8976 return false;
8977 if (off % size != 0)
8978 return false;
8979 if (size != sizeof(__u32))
8980 return false;
8981
8982 return true;
8983 }
8984
8985 static bool xdp_is_valid_access(int off, int size,
8986 enum bpf_access_type type,
8987 const struct bpf_prog *prog,
8988 struct bpf_insn_access_aux *info)
8989 {
8990 if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
8991 switch (off) {
8992 case offsetof(struct xdp_md, egress_ifindex):
8993 return false;
8994 }
8995 }
8996
8997 if (type == BPF_WRITE) {
8998 if (bpf_prog_is_offloaded(prog->aux)) {
8999 switch (off) {
9000 case offsetof(struct xdp_md, rx_queue_index):
9001 return __is_valid_xdp_access(off, size);
9002 }
9003 }
9004 return false;
9005 }
9006
9007 switch (off) {
9008 case offsetof(struct xdp_md, data):
9009 info->reg_type = PTR_TO_PACKET;
9010 break;
9011 case offsetof(struct xdp_md, data_meta):
9012 info->reg_type = PTR_TO_PACKET_META;
9013 break;
9014 case offsetof(struct xdp_md, data_end):
9015 info->reg_type = PTR_TO_PACKET_END;
9016 break;
9017 }
9018
9019 return __is_valid_xdp_access(off, size);
9020 }
9021
9022 void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act)
9023 {
9024 const u32 act_max = XDP_REDIRECT;
9025
9026 pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
9027 act > act_max ? "Illegal" : "Driver unsupported",
9028 act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A");
9029 }
9030 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
9031
9032 static int xdp_btf_struct_access(struct bpf_verifier_log *log,
9033 const struct bpf_reg_state *reg,
9034 int off, int size)
9035 {
9036 int ret = -EACCES;
9037
9038 mutex_lock(&nf_conn_btf_access_lock);
9039 if (nfct_btf_struct_access)
9040 ret = nfct_btf_struct_access(log, reg, off, size);
9041 mutex_unlock(&nf_conn_btf_access_lock);
9042
9043 return ret;
9044 }
9045
9046 static bool sock_addr_is_valid_access(int off, int size,
9047 enum bpf_access_type type,
9048 const struct bpf_prog *prog,
9049 struct bpf_insn_access_aux *info)
9050 {
9051 const int size_default = sizeof(__u32);
9052
9053 if (off < 0 || off >= sizeof(struct bpf_sock_addr))
9054 return false;
9055 if (off % size != 0)
9056 return false;
9057
9058 /* Disallow access to IPv6 fields from an IPv4 context and
9059 * vice versa.
9060 */
9061 switch (off) {
9062 case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
9063 switch (prog->expected_attach_type) {
9064 case BPF_CGROUP_INET4_BIND:
9065 case BPF_CGROUP_INET4_CONNECT:
9066 case BPF_CGROUP_INET4_GETPEERNAME:
9067 case BPF_CGROUP_INET4_GETSOCKNAME:
9068 case BPF_CGROUP_UDP4_SENDMSG:
9069 case BPF_CGROUP_UDP4_RECVMSG:
9070 break;
9071 default:
9072 return false;
9073 }
9074 break;
9075 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
9076 switch (prog->expected_attach_type) {
9077 case BPF_CGROUP_INET6_BIND:
9078 case BPF_CGROUP_INET6_CONNECT:
9079 case BPF_CGROUP_INET6_GETPEERNAME:
9080 case BPF_CGROUP_INET6_GETSOCKNAME:
9081 case BPF_CGROUP_UDP6_SENDMSG:
9082 case BPF_CGROUP_UDP6_RECVMSG:
9083 break;
9084 default:
9085 return false;
9086 }
9087 break;
9088 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
9089 switch (prog->expected_attach_type) {
9090 case BPF_CGROUP_UDP4_SENDMSG:
9091 break;
9092 default:
9093 return false;
9094 }
9095 break;
9096 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
9097 msg_src_ip6[3]):
9098 switch (prog->expected_attach_type) {
9099 case BPF_CGROUP_UDP6_SENDMSG:
9100 break;
9101 default:
9102 return false;
9103 }
9104 break;
9105 }
9106
9107 switch (off) {
9108 case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
9109 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
9110 case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
9111 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
9112 msg_src_ip6[3]):
9113 case bpf_ctx_range(struct bpf_sock_addr, user_port):
9114 if (type == BPF_READ) {
9115 bpf_ctx_record_field_size(info, size_default);
9116
9117 if (bpf_ctx_wide_access_ok(off, size,
9118 struct bpf_sock_addr,
9119 user_ip6))
9120 return true;
9121
9122 if (bpf_ctx_wide_access_ok(off, size,
9123 struct bpf_sock_addr,
9124 msg_src_ip6))
9125 return true;
9126
9127 if (!bpf_ctx_narrow_access_ok(off, size, size_default))
9128 return false;
9129 } else {
9130 if (bpf_ctx_wide_access_ok(off, size,
9131 struct bpf_sock_addr,
9132 user_ip6))
9133 return true;
9134
9135 if (bpf_ctx_wide_access_ok(off, size,
9136 struct bpf_sock_addr,
9137 msg_src_ip6))
9138 return true;
9139
9140 if (size != size_default)
9141 return false;
9142 }
9143 break;
9144 case offsetof(struct bpf_sock_addr, sk):
9145 if (type != BPF_READ)
9146 return false;
9147 if (size != sizeof(__u64))
9148 return false;
9149 info->reg_type = PTR_TO_SOCKET;
9150 break;
9151 default:
9152 if (type == BPF_READ) {
9153 if (size != size_default)
9154 return false;
9155 } else {
9156 return false;
9157 }
9158 }
9159
9160 return true;
9161 }
9162
9163 static bool sock_ops_is_valid_access(int off, int size,
9164 enum bpf_access_type type,
9165 const struct bpf_prog *prog,
9166 struct bpf_insn_access_aux *info)
9167 {
9168 const int size_default = sizeof(__u32);
9169
9170 if (off < 0 || off >= sizeof(struct bpf_sock_ops))
9171 return false;
9172
9173 /* The verifier guarantees that size > 0. */
9174 if (off % size != 0)
9175 return false;
9176
9177 if (type == BPF_WRITE) {
9178 switch (off) {
9179 case offsetof(struct bpf_sock_ops, reply):
9180 case offsetof(struct bpf_sock_ops, sk_txhash):
9181 if (size != size_default)
9182 return false;
9183 break;
9184 default:
9185 return false;
9186 }
9187 } else {
9188 switch (off) {
9189 case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
9190 bytes_acked):
9191 if (size != sizeof(__u64))
9192 return false;
9193 break;
9194 case offsetof(struct bpf_sock_ops, sk):
9195 if (size != sizeof(__u64))
9196 return false;
9197 info->reg_type = PTR_TO_SOCKET_OR_NULL;
9198 break;
9199 case offsetof(struct bpf_sock_ops, skb_data):
9200 if (size != sizeof(__u64))
9201 return false;
9202 info->reg_type = PTR_TO_PACKET;
9203 break;
9204 case offsetof(struct bpf_sock_ops, skb_data_end):
9205 if (size != sizeof(__u64))
9206 return false;
9207 info->reg_type = PTR_TO_PACKET_END;
9208 break;
9209 case offsetof(struct bpf_sock_ops, skb_tcp_flags):
9210 bpf_ctx_record_field_size(info, size_default);
9211 return bpf_ctx_narrow_access_ok(off, size,
9212 size_default);
9213 case offsetof(struct bpf_sock_ops, skb_hwtstamp):
9214 if (size != sizeof(__u64))
9215 return false;
9216 break;
9217 default:
9218 if (size != size_default)
9219 return false;
9220 break;
9221 }
9222 }
9223
9224 return true;
9225 }
9226
9227 static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
9228 const struct bpf_prog *prog)
9229 {
9230 return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
9231 }
9232
9233 static bool sk_skb_is_valid_access(int off, int size,
9234 enum bpf_access_type type,
9235 const struct bpf_prog *prog,
9236 struct bpf_insn_access_aux *info)
9237 {
9238 switch (off) {
9239 case bpf_ctx_range(struct __sk_buff, tc_classid):
9240 case bpf_ctx_range(struct __sk_buff, data_meta):
9241 case bpf_ctx_range(struct __sk_buff, tstamp):
9242 case bpf_ctx_range(struct __sk_buff, wire_len):
9243 case bpf_ctx_range(struct __sk_buff, hwtstamp):
9244 return false;
9245 }
9246
9247 if (type == BPF_WRITE) {
9248 switch (off) {
9249 case bpf_ctx_range(struct __sk_buff, tc_index):
9250 case bpf_ctx_range(struct __sk_buff, priority):
9251 break;
9252 default:
9253 return false;
9254 }
9255 }
9256
9257 switch (off) {
9258 case bpf_ctx_range(struct __sk_buff, mark):
9259 return false;
9260 case bpf_ctx_range(struct __sk_buff, data):
9261 info->reg_type = PTR_TO_PACKET;
9262 break;
9263 case bpf_ctx_range(struct __sk_buff, data_end):
9264 info->reg_type = PTR_TO_PACKET_END;
9265 break;
9266 }
9267
9268 return bpf_skb_is_valid_access(off, size, type, prog, info);
9269 }
9270
9271 static bool sk_msg_is_valid_access(int off, int size,
9272 enum bpf_access_type type,
9273 const struct bpf_prog *prog,
9274 struct bpf_insn_access_aux *info)
9275 {
9276 if (type == BPF_WRITE)
9277 return false;
9278
9279 if (off % size != 0)
9280 return false;
9281
9282 switch (off) {
9283 case offsetof(struct sk_msg_md, data):
9284 info->reg_type = PTR_TO_PACKET;
9285 if (size != sizeof(__u64))
9286 return false;
9287 break;
9288 case offsetof(struct sk_msg_md, data_end):
9289 info->reg_type = PTR_TO_PACKET_END;
9290 if (size != sizeof(__u64))
9291 return false;
9292 break;
9293 case offsetof(struct sk_msg_md, sk):
9294 if (size != sizeof(__u64))
9295 return false;
9296 info->reg_type = PTR_TO_SOCKET;
9297 break;
9298 case bpf_ctx_range(struct sk_msg_md, family):
9299 case bpf_ctx_range(struct sk_msg_md, remote_ip4):
9300 case bpf_ctx_range(struct sk_msg_md, local_ip4):
9301 case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
9302 case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
9303 case bpf_ctx_range(struct sk_msg_md, remote_port):
9304 case bpf_ctx_range(struct sk_msg_md, local_port):
9305 case bpf_ctx_range(struct sk_msg_md, size):
9306 if (size != sizeof(__u32))
9307 return false;
9308 break;
9309 default:
9310 return false;
9311 }
9312 return true;
9313 }
9314
9315 static bool flow_dissector_is_valid_access(int off, int size,
9316 enum bpf_access_type type,
9317 const struct bpf_prog *prog,
9318 struct bpf_insn_access_aux *info)
9319 {
9320 const int size_default = sizeof(__u32);
9321
9322 if (off < 0 || off >= sizeof(struct __sk_buff))
9323 return false;
9324
9325 if (type == BPF_WRITE)
9326 return false;
9327
9328 switch (off) {
9329 case bpf_ctx_range(struct __sk_buff, data):
9330 if (size != size_default)
9331 return false;
9332 info->reg_type = PTR_TO_PACKET;
9333 return true;
9334 case bpf_ctx_range(struct __sk_buff, data_end):
9335 if (size != size_default)
9336 return false;
9337 info->reg_type = PTR_TO_PACKET_END;
9338 return true;
9339 case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
9340 if (size != sizeof(__u64))
9341 return false;
9342 info->reg_type = PTR_TO_FLOW_KEYS;
9343 return true;
9344 default:
9345 return false;
9346 }
9347 }
9348
9349 static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
9350 const struct bpf_insn *si,
9351 struct bpf_insn *insn_buf,
9352 struct bpf_prog *prog,
9353 u32 *target_size)
9354
9355 {
9356 struct bpf_insn *insn = insn_buf;
9357
9358 switch (si->off) {
9359 case offsetof(struct __sk_buff, data):
9360 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
9361 si->dst_reg, si->src_reg,
9362 offsetof(struct bpf_flow_dissector, data));
9363 break;
9364
9365 case offsetof(struct __sk_buff, data_end):
9366 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
9367 si->dst_reg, si->src_reg,
9368 offsetof(struct bpf_flow_dissector, data_end));
9369 break;
9370
9371 case offsetof(struct __sk_buff, flow_keys):
9372 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
9373 si->dst_reg, si->src_reg,
9374 offsetof(struct bpf_flow_dissector, flow_keys));
9375 break;
9376 }
9377
9378 return insn - insn_buf;
9379 }
9380
9381 static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
9382 struct bpf_insn *insn)
9383 {
9384 __u8 value_reg = si->dst_reg;
9385 __u8 skb_reg = si->src_reg;
9386 /* AX is needed because src_reg and dst_reg could be the same */
9387 __u8 tmp_reg = BPF_REG_AX;
9388
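/* The emitted code reports BPF_SKB_TSTAMP_DELIVERY_MONO when the
 * SKB_MONO_DELIVERY_TIME_MASK bit is set in the skb, and
 * BPF_SKB_TSTAMP_UNSPEC otherwise.
 */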
9389 *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg,
9390 SKB_BF_MONO_TC_OFFSET);
9391 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg,
9392 SKB_MONO_DELIVERY_TIME_MASK, 2);
9393 *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC);
9394 *insn++ = BPF_JMP_A(1);
9395 *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO);
9396
9397 return insn;
9398 }
9399
9400 static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
9401 struct bpf_insn *insn)
9402 {
9403 /* si->dst_reg = skb_shinfo(SKB); */
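/* With NET_SKBUFF_DATA_USES_OFFSET, skb->end is an offset relative to
 * skb->head, so the shared info sits at head + end; otherwise skb->end
 * is already a pointer to it.
 */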
9404 #ifdef NET_SKBUFF_DATA_USES_OFFSET
9405 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
9406 BPF_REG_AX, skb_reg,
9407 offsetof(struct sk_buff, end));
9408 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
9409 dst_reg, skb_reg,
9410 offsetof(struct sk_buff, head));
9411 *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);
9412 #else
9413 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
9414 dst_reg, skb_reg,
9415 offsetof(struct sk_buff, end));
9416 #endif
9417
9418 return insn;
9419 }
9420
9421 static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
9422 const struct bpf_insn *si,
9423 struct bpf_insn *insn)
9424 {
9425 __u8 value_reg = si->dst_reg;
9426 __u8 skb_reg = si->src_reg;
9427
9428 #ifdef CONFIG_NET_XGRESS
9429 /* If the tstamp_type has been read,
9430 * the bpf prog is aware that the tstamp could carry a delivery time.
9431 * Thus, read skb->tstamp as-is if tstamp_type_access is true.
9432 */
9433 if (!prog->tstamp_type_access) {
9434 /* AX is needed because src_reg and dst_reg could be the same */
9435 __u8 tmp_reg = BPF_REG_AX;
9436
9437 *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
9438 *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg,
9439 TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK);
9440 *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg,
9441 TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2);
9442 /* If both skb->tc_at_ingress and skb->tstamp_type are set,
9443 * read 0 as the (rcv) timestamp.
9444 */
9445 *insn++ = BPF_MOV64_IMM(value_reg, 0);
9446 *insn++ = BPF_JMP_A(1);
9447 }
9448 #endif
9449
9450 *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
9451 offsetof(struct sk_buff, tstamp));
9452 return insn;
9453 }
9454
9455 static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
9456 const struct bpf_insn *si,
9457 struct bpf_insn *insn)
9458 {
9459 __u8 value_reg = si->src_reg;
9460 __u8 skb_reg = si->dst_reg;
9461
9462 #ifdef CONFIG_NET_XGRESS
9463 /* If the tstamp_type has been read,
9464 * the bpf prog is aware that the tstamp could carry a delivery time.
9465 * Thus, write skb->tstamp as-is if tstamp_type_access is true.
9466 * Otherwise, a write at ingress must also clear the
9467 * skb->tstamp_type bit.
9468 */
9469 if (!prog->tstamp_type_access) {
9470 __u8 tmp_reg = BPF_REG_AX;
9471
9472 *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
9473 /* Writing __sk_buff->tstamp at ingress, goto <clear> */
9474 *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
9475 /* goto <store> */
9476 *insn++ = BPF_JMP_A(2);
9477 /* <clear>: skb->tstamp_type */
9478 *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK);
9479 *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
9480 }
9481 #endif
9482
9483 /* <store>: skb->tstamp = tstamp */
9484 *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
9485 skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
9486 return insn;
9487 }
9488
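/* Emit a store that mirrors the class of the original instruction: a BPF_STX
 * store of si->src_reg or a BPF_ST store of the immediate si->imm, using
 * si->dst_reg as the base register.
 */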
9489 #define BPF_EMIT_STORE(size, si, off) \
9490 BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \
9491 (si)->dst_reg, (si)->src_reg, (off), (si)->imm)
9492
9493 static u32 bpf_convert_ctx_access(enum bpf_access_type type,
9494 const struct bpf_insn *si,
9495 struct bpf_insn *insn_buf,
9496 struct bpf_prog *prog, u32 *target_size)
9497 {
9498 struct bpf_insn *insn = insn_buf;
9499 int off;
9500
9501 switch (si->off) {
9502 case offsetof(struct __sk_buff, len):
9503 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9504 bpf_target_off(struct sk_buff, len, 4,
9505 target_size));
9506 break;
9507
9508 case offsetof(struct __sk_buff, protocol):
9509 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9510 bpf_target_off(struct sk_buff, protocol, 2,
9511 target_size));
9512 break;
9513
9514 case offsetof(struct __sk_buff, vlan_proto):
9515 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9516 bpf_target_off(struct sk_buff, vlan_proto, 2,
9517 target_size));
9518 break;
9519
9520 case offsetof(struct __sk_buff, priority):
9521 if (type == BPF_WRITE)
9522 *insn++ = BPF_EMIT_STORE(BPF_W, si,
9523 bpf_target_off(struct sk_buff, priority, 4,
9524 target_size));
9525 else
9526 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9527 bpf_target_off(struct sk_buff, priority, 4,
9528 target_size));
9529 break;
9530
9531 case offsetof(struct __sk_buff, ingress_ifindex):
9532 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9533 bpf_target_off(struct sk_buff, skb_iif, 4,
9534 target_size));
9535 break;
9536
9537 case offsetof(struct __sk_buff, ifindex):
9538 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
9539 si->dst_reg, si->src_reg,
9540 offsetof(struct sk_buff, dev));
9541 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
9542 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9543 bpf_target_off(struct net_device, ifindex, 4,
9544 target_size));
9545 break;
9546
9547 case offsetof(struct __sk_buff, hash):
9548 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9549 bpf_target_off(struct sk_buff, hash, 4,
9550 target_size));
9551 break;
9552
9553 case offsetof(struct __sk_buff, mark):
9554 if (type == BPF_WRITE)
9555 *insn++ = BPF_EMIT_STORE(BPF_W, si,
9556 bpf_target_off(struct sk_buff, mark, 4,
9557 target_size));
9558 else
9559 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9560 bpf_target_off(struct sk_buff, mark, 4,
9561 target_size));
9562 break;
9563
9564 case offsetof(struct __sk_buff, pkt_type):
9565 *target_size = 1;
9566 *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
9567 PKT_TYPE_OFFSET);
9568 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
9569 #ifdef __BIG_ENDIAN_BITFIELD
9570 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
9571 #endif
9572 break;
9573
9574 case offsetof(struct __sk_buff, queue_mapping):
9575 if (type == BPF_WRITE) {
9576 u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
9577
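/* Refuse to store NO_QUEUE_MAPPING or above: elide the store at
 * conversion time for a BPF_ST immediate, or guard it with a runtime
 * check for a BPF_STX register store.
 */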
9578 if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
9579 *insn++ = BPF_JMP_A(0); /* noop */
9580 break;
9581 }
9582
9583 if (BPF_CLASS(si->code) == BPF_STX)
9584 *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
9585 *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
9586 } else {
9587 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9588 bpf_target_off(struct sk_buff,
9589 queue_mapping,
9590 2, target_size));
9591 }
9592 break;
9593
9594 case offsetof(struct __sk_buff, vlan_present):
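/* vlan_present is synthesized as !!skb->vlan_all. */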
9595 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9596 bpf_target_off(struct sk_buff,
9597 vlan_all, 4, target_size));
9598 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
9599 *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
9600 break;
9601
9602 case offsetof(struct __sk_buff, vlan_tci):
9603 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9604 bpf_target_off(struct sk_buff, vlan_tci, 2,
9605 target_size));
9606 break;
9607
9608 case offsetof(struct __sk_buff, cb[0]) ...
9609 offsetofend(struct __sk_buff, cb[4]) - 1:
9610 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
9611 BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
9612 offsetof(struct qdisc_skb_cb, data)) %
9613 sizeof(__u64));
9614
9615 prog->cb_access = 1;
9616 off = si->off;
9617 off -= offsetof(struct __sk_buff, cb[0]);
9618 off += offsetof(struct sk_buff, cb);
9619 off += offsetof(struct qdisc_skb_cb, data);
9620 if (type == BPF_WRITE)
9621 *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
9622 else
9623 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
9624 si->src_reg, off);
9625 break;
9626
9627 case offsetof(struct __sk_buff, tc_classid):
9628 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);
9629
9630 off = si->off;
9631 off -= offsetof(struct __sk_buff, tc_classid);
9632 off += offsetof(struct sk_buff, cb);
9633 off += offsetof(struct qdisc_skb_cb, tc_classid);
9634 *target_size = 2;
9635 if (type == BPF_WRITE)
9636 *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
9637 else
9638 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
9639 si->src_reg, off);
9640 break;
9641
9642 case offsetof(struct __sk_buff, data):
9643 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
9644 si->dst_reg, si->src_reg,
9645 offsetof(struct sk_buff, data));
9646 break;
9647
9648 case offsetof(struct __sk_buff, data_meta):
9649 off = si->off;
9650 off -= offsetof(struct __sk_buff, data_meta);
9651 off += offsetof(struct sk_buff, cb);
9652 off += offsetof(struct bpf_skb_data_end, data_meta);
9653 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
9654 si->src_reg, off);
9655 break;
9656
9657 case offsetof(struct __sk_buff, data_end):
9658 off = si->off;
9659 off -= offsetof(struct __sk_buff, data_end);
9660 off += offsetof(struct sk_buff, cb);
9661 off += offsetof(struct bpf_skb_data_end, data_end);
9662 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
9663 si->src_reg, off);
9664 break;
9665
9666 case offsetof(struct __sk_buff, tc_index):
9667 #ifdef CONFIG_NET_SCHED
9668 if (type == BPF_WRITE)
9669 *insn++ = BPF_EMIT_STORE(BPF_H, si,
9670 bpf_target_off(struct sk_buff, tc_index, 2,
9671 target_size));
9672 else
9673 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
9674 bpf_target_off(struct sk_buff, tc_index, 2,
9675 target_size));
9676 #else
9677 *target_size = 2;
9678 if (type == BPF_WRITE)
9679 *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
9680 else
9681 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
9682 #endif
9683 break;
9684
9685 case offsetof(struct __sk_buff, napi_id):
9686 #if defined(CONFIG_NET_RX_BUSY_POLL)
9687 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9688 bpf_target_off(struct sk_buff, napi_id, 4,
9689 target_size));
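/* IDs below MIN_NAPI_ID are not real NAPI ids; report 0 instead. */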
9690 *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
9691 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
9692 #else
9693 *target_size = 4;
9694 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
9695 #endif
9696 break;
9697 case offsetof(struct __sk_buff, family):
9698 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
9699
9700 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9701 si->dst_reg, si->src_reg,
9702 offsetof(struct sk_buff, sk));
9703 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
9704 bpf_target_off(struct sock_common,
9705 skc_family,
9706 2, target_size));
9707 break;
9708 case offsetof(struct __sk_buff, remote_ip4):
9709 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
9710
9711 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9712 si->dst_reg, si->src_reg,
9713 offsetof(struct sk_buff, sk));
9714 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9715 bpf_target_off(struct sock_common,
9716 skc_daddr,
9717 4, target_size));
9718 break;
9719 case offsetof(struct __sk_buff, local_ip4):
9720 BUILD_BUG_ON(sizeof_field(struct sock_common,
9721 skc_rcv_saddr) != 4);
9722
9723 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9724 si->dst_reg, si->src_reg,
9725 offsetof(struct sk_buff, sk));
9726 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9727 bpf_target_off(struct sock_common,
9728 skc_rcv_saddr,
9729 4, target_size));
9730 break;
9731 case offsetof(struct __sk_buff, remote_ip6[0]) ...
9732 offsetof(struct __sk_buff, remote_ip6[3]):
9733 #if IS_ENABLED(CONFIG_IPV6)
9734 BUILD_BUG_ON(sizeof_field(struct sock_common,
9735 skc_v6_daddr.s6_addr32[0]) != 4);
9736
9737 off = si->off;
9738 off -= offsetof(struct __sk_buff, remote_ip6[0]);
9739
9740 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9741 si->dst_reg, si->src_reg,
9742 offsetof(struct sk_buff, sk));
9743 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9744 offsetof(struct sock_common,
9745 skc_v6_daddr.s6_addr32[0]) +
9746 off);
9747 #else
9748 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9749 #endif
9750 break;
9751 case offsetof(struct __sk_buff, local_ip6[0]) ...
9752 offsetof(struct __sk_buff, local_ip6[3]):
9753 #if IS_ENABLED(CONFIG_IPV6)
9754 BUILD_BUG_ON(sizeof_field(struct sock_common,
9755 skc_v6_rcv_saddr.s6_addr32[0]) != 4);
9756
9757 off = si->off;
9758 off -= offsetof(struct __sk_buff, local_ip6[0]);
9759
9760 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9761 si->dst_reg, si->src_reg,
9762 offsetof(struct sk_buff, sk));
9763 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
9764 offsetof(struct sock_common,
9765 skc_v6_rcv_saddr.s6_addr32[0]) +
9766 off);
9767 #else
9768 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9769 #endif
9770 break;
9771
9772 case offsetof(struct __sk_buff, remote_port):
9773 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
9774
9775 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9776 si->dst_reg, si->src_reg,
9777 offsetof(struct sk_buff, sk));
9778 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
9779 bpf_target_off(struct sock_common,
9780 skc_dport,
9781 2, target_size));
9782 #ifndef __BIG_ENDIAN_BITFIELD
9783 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
9784 #endif
9785 break;
9786
9787 case offsetof(struct __sk_buff, local_port):
9788 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
9789
9790 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9791 si->dst_reg, si->src_reg,
9792 offsetof(struct sk_buff, sk));
9793 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
9794 bpf_target_off(struct sock_common,
9795 skc_num, 2, target_size));
9796 break;
9797
9798 case offsetof(struct __sk_buff, tstamp):
9799 BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
9800
9801 if (type == BPF_WRITE)
9802 insn = bpf_convert_tstamp_write(prog, si, insn);
9803 else
9804 insn = bpf_convert_tstamp_read(prog, si, insn);
9805 break;
9806
9807 case offsetof(struct __sk_buff, tstamp_type):
9808 insn = bpf_convert_tstamp_type_read(si, insn);
9809 break;
9810
9811 case offsetof(struct __sk_buff, gso_segs):
9812 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
9813 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
9814 si->dst_reg, si->dst_reg,
9815 bpf_target_off(struct skb_shared_info,
9816 gso_segs, 2,
9817 target_size));
9818 break;
9819 case offsetof(struct __sk_buff, gso_size):
9820 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
9821 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
9822 si->dst_reg, si->dst_reg,
9823 bpf_target_off(struct skb_shared_info,
9824 gso_size, 2,
9825 target_size));
9826 break;
9827 case offsetof(struct __sk_buff, wire_len):
9828 BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
9829
9830 off = si->off;
9831 off -= offsetof(struct __sk_buff, wire_len);
9832 off += offsetof(struct sk_buff, cb);
9833 off += offsetof(struct qdisc_skb_cb, pkt_len);
9834 *target_size = 4;
9835 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
9836 break;
9837
9838 case offsetof(struct __sk_buff, sk):
9839 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
9840 si->dst_reg, si->src_reg,
9841 offsetof(struct sk_buff, sk));
9842 break;
9843 case offsetof(struct __sk_buff, hwtstamp):
9844 BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
9845 BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);
9846
9847 insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
9848 *insn++ = BPF_LDX_MEM(BPF_DW,
9849 si->dst_reg, si->dst_reg,
9850 bpf_target_off(struct skb_shared_info,
9851 hwtstamps, 8,
9852 target_size));
9853 break;
9854 }
9855
9856 return insn - insn_buf;
9857 }
9858
9859 u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
9860 const struct bpf_insn *si,
9861 struct bpf_insn *insn_buf,
9862 struct bpf_prog *prog, u32 *target_size)
9863 {
9864 struct bpf_insn *insn = insn_buf;
9865 int off;
9866
9867 switch (si->off) {
9868 case offsetof(struct bpf_sock, bound_dev_if):
9869 BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);
9870
9871 if (type == BPF_WRITE)
9872 *insn++ = BPF_EMIT_STORE(BPF_W, si,
9873 offsetof(struct sock, sk_bound_dev_if));
9874 else
9875 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9876 offsetof(struct sock, sk_bound_dev_if));
9877 break;
9878
9879 case offsetof(struct bpf_sock, mark):
9880 BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);
9881
9882 if (type == BPF_WRITE)
9883 *insn++ = BPF_EMIT_STORE(BPF_W, si,
9884 offsetof(struct sock, sk_mark));
9885 else
9886 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9887 offsetof(struct sock, sk_mark));
9888 break;
9889
9890 case offsetof(struct bpf_sock, priority):
9891 BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);
9892
9893 if (type == BPF_WRITE)
9894 *insn++ = BPF_EMIT_STORE(BPF_W, si,
9895 offsetof(struct sock, sk_priority));
9896 else
9897 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
9898 offsetof(struct sock, sk_priority));
9899 break;
9900
9901 case offsetof(struct bpf_sock, family):
9902 *insn++ = BPF_LDX_MEM(
9903 BPF_FIELD_SIZEOF(struct sock_common, skc_family),
9904 si->dst_reg, si->src_reg,
9905 bpf_target_off(struct sock_common,
9906 skc_family,
9907 sizeof_field(struct sock_common,
9908 skc_family),
9909 target_size));
9910 break;
9911
9912 case offsetof(struct bpf_sock, type):
9913 *insn++ = BPF_LDX_MEM(
9914 BPF_FIELD_SIZEOF(struct sock, sk_type),
9915 si->dst_reg, si->src_reg,
9916 bpf_target_off(struct sock, sk_type,
9917 sizeof_field(struct sock, sk_type),
9918 target_size));
9919 break;
9920
9921 case offsetof(struct bpf_sock, protocol):
9922 *insn++ = BPF_LDX_MEM(
9923 BPF_FIELD_SIZEOF(struct sock, sk_protocol),
9924 si->dst_reg, si->src_reg,
9925 bpf_target_off(struct sock, sk_protocol,
9926 sizeof_field(struct sock, sk_protocol),
9927 target_size));
9928 break;
9929
9930 case offsetof(struct bpf_sock, src_ip4):
9931 *insn++ = BPF_LDX_MEM(
9932 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
9933 bpf_target_off(struct sock_common, skc_rcv_saddr,
9934 sizeof_field(struct sock_common,
9935 skc_rcv_saddr),
9936 target_size));
9937 break;
9938
9939 case offsetof(struct bpf_sock, dst_ip4):
9940 *insn++ = BPF_LDX_MEM(
9941 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
9942 bpf_target_off(struct sock_common, skc_daddr,
9943 sizeof_field(struct sock_common,
9944 skc_daddr),
9945 target_size));
9946 break;
9947
9948 case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
9949 #if IS_ENABLED(CONFIG_IPV6)
9950 off = si->off;
9951 off -= offsetof(struct bpf_sock, src_ip6[0]);
9952 *insn++ = BPF_LDX_MEM(
9953 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
9954 bpf_target_off(
9955 struct sock_common,
9956 skc_v6_rcv_saddr.s6_addr32[0],
9957 sizeof_field(struct sock_common,
9958 skc_v6_rcv_saddr.s6_addr32[0]),
9959 target_size) + off);
9960 #else
9961 (void)off;
9962 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9963 #endif
9964 break;
9965
9966 case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
9967 #if IS_ENABLED(CONFIG_IPV6)
9968 off = si->off;
9969 off -= offsetof(struct bpf_sock, dst_ip6[0]);
9970 *insn++ = BPF_LDX_MEM(
9971 BPF_SIZE(si->code), si->dst_reg, si->src_reg,
9972 bpf_target_off(struct sock_common,
9973 skc_v6_daddr.s6_addr32[0],
9974 sizeof_field(struct sock_common,
9975 skc_v6_daddr.s6_addr32[0]),
9976 target_size) + off);
9977 #else
9978 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
9979 *target_size = 4;
9980 #endif
9981 break;
9982
9983 case offsetof(struct bpf_sock, src_port):
9984 *insn++ = BPF_LDX_MEM(
9985 BPF_FIELD_SIZEOF(struct sock_common, skc_num),
9986 si->dst_reg, si->src_reg,
9987 bpf_target_off(struct sock_common, skc_num,
9988 sizeof_field(struct sock_common,
9989 skc_num),
9990 target_size));
9991 break;
9992
9993 case offsetof(struct bpf_sock, dst_port):
9994 *insn++ = BPF_LDX_MEM(
9995 BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
9996 si->dst_reg, si->src_reg,
9997 bpf_target_off(struct sock_common, skc_dport,
9998 sizeof_field(struct sock_common,
9999 skc_dport),
10000 target_size));
10001 break;
10002
10003 case offsetof(struct bpf_sock, state):
10004 *insn++ = BPF_LDX_MEM(
10005 BPF_FIELD_SIZEOF(struct sock_common, skc_state),
10006 si->dst_reg, si->src_reg,
10007 bpf_target_off(struct sock_common, skc_state,
10008 sizeof_field(struct sock_common,
10009 skc_state),
10010 target_size));
10011 break;
10012 case offsetof(struct bpf_sock, rx_queue_mapping):
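/* Reads as -1 when no rx queue mapping is recorded or when
 * CONFIG_SOCK_RX_QUEUE_MAPPING is not set.
 */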
10013 #ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
10014 *insn++ = BPF_LDX_MEM(
10015 BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
10016 si->dst_reg, si->src_reg,
10017 bpf_target_off(struct sock, sk_rx_queue_mapping,
10018 sizeof_field(struct sock,
10019 sk_rx_queue_mapping),
10020 target_size));
10021 *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
10022 1);
10023 *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
10024 #else
10025 *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
10026 *target_size = 2;
10027 #endif
10028 break;
10029 }
10030
10031 return insn - insn_buf;
10032 }
10033
10034 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
10035 const struct bpf_insn *si,
10036 struct bpf_insn *insn_buf,
10037 struct bpf_prog *prog, u32 *target_size)
10038 {
10039 struct bpf_insn *insn = insn_buf;
10040
10041 switch (si->off) {
10042 case offsetof(struct __sk_buff, ifindex):
10043 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
10044 si->dst_reg, si->src_reg,
10045 offsetof(struct sk_buff, dev));
10046 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10047 bpf_target_off(struct net_device, ifindex, 4,
10048 target_size));
10049 break;
10050 default:
10051 return bpf_convert_ctx_access(type, si, insn_buf, prog,
10052 target_size);
10053 }
10054
10055 return insn - insn_buf;
10056 }
10057
10058 static u32 xdp_convert_ctx_access(enum bpf_access_type type,
10059 const struct bpf_insn *si,
10060 struct bpf_insn *insn_buf,
10061 struct bpf_prog *prog, u32 *target_size)
10062 {
10063 struct bpf_insn *insn = insn_buf;
10064
10065 switch (si->off) {
10066 case offsetof(struct xdp_md, data):
10067 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
10068 si->dst_reg, si->src_reg,
10069 offsetof(struct xdp_buff, data));
10070 break;
10071 case offsetof(struct xdp_md, data_meta):
10072 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
10073 si->dst_reg, si->src_reg,
10074 offsetof(struct xdp_buff, data_meta));
10075 break;
10076 case offsetof(struct xdp_md, data_end):
10077 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
10078 si->dst_reg, si->src_reg,
10079 offsetof(struct xdp_buff, data_end));
10080 break;
10081 case offsetof(struct xdp_md, ingress_ifindex):
10082 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
10083 si->dst_reg, si->src_reg,
10084 offsetof(struct xdp_buff, rxq));
10085 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
10086 si->dst_reg, si->dst_reg,
10087 offsetof(struct xdp_rxq_info, dev));
10088 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10089 offsetof(struct net_device, ifindex));
10090 break;
10091 case offsetof(struct xdp_md, rx_queue_index):
10092 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
10093 si->dst_reg, si->src_reg,
10094 offsetof(struct xdp_buff, rxq));
10095 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10096 offsetof(struct xdp_rxq_info,
10097 queue_index));
10098 break;
10099 case offsetof(struct xdp_md, egress_ifindex):
10100 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
10101 si->dst_reg, si->src_reg,
10102 offsetof(struct xdp_buff, txq));
10103 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
10104 si->dst_reg, si->dst_reg,
10105 offsetof(struct xdp_txq_info, dev));
10106 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10107 offsetof(struct net_device, ifindex));
10108 break;
10109 }
10110
10111 return insn - insn_buf;
10112 }
10113
10114 /* SOCK_ADDR_LOAD_NESTED_FIELD() loads the Nested Field S.F.NF, where S is the
10115 * type of the context Structure and F is the Field in the context structure
10116 * that contains a pointer to the Nested Structure of type NS that has the
10117 * field NF.
10118 *
10119 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to the caller to
10120 * make sure that SIZE is not greater than the actual size of S.F.NF.
10121 *
10122 * If an offset OFF is provided, the load happens at that offset relative to the offset of NF.
10123 */
10124 #define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \
10125 do { \
10126 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \
10127 si->src_reg, offsetof(S, F)); \
10128 *insn++ = BPF_LDX_MEM( \
10129 SIZE, si->dst_reg, si->dst_reg, \
10130 bpf_target_off(NS, NF, sizeof_field(NS, NF), \
10131 target_size) \
10132 + OFF); \
10133 } while (0)
10134
10135 #define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \
10136 SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \
10137 BPF_FIELD_SIZEOF(NS, NF), 0)
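/* For example, SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
 * struct sockaddr, uaddr, sa_family) (used below for user_family) expands
 * into two loads: one fetching the uaddr pointer from the context into
 * si->dst_reg, and one reading sa_family through that pointer.
 */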
10138
10139 /* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
10140 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for a store operation.
10141 *
10142 * In addition it uses the Temporary Field TF (a member of struct S) as a 3rd
10143 * "register", since the two registers available in convert_ctx_access are not
10144 * enough: we can override neither SRC, since it contains the value to store,
10145 * nor DST, since it contains the pointer to the context that may be used by
10146 * later instructions. But we need a temporary place to save the pointer to
10147 * the nested structure whose field we want to store to.
10148 */
10149 #define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \
10150 do { \
10151 int tmp_reg = BPF_REG_9; \
10152 if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
10153 --tmp_reg; \
10154 if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
10155 --tmp_reg; \
10156 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \
10157 offsetof(S, TF)); \
10158 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
10159 si->dst_reg, offsetof(S, F)); \
10160 *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \
10161 tmp_reg, si->src_reg, \
10162 bpf_target_off(NS, NF, sizeof_field(NS, NF), \
10163 target_size) \
10164 + OFF, \
10165 si->imm); \
10166 *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \
10167 offsetof(S, TF)); \
10168 } while (0)
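/* The store variant spills a scratch register (BPF_REG_9, or a lower one if
 * it collides with src_reg/dst_reg) into S.TF, loads the S.F pointer into it,
 * stores the value (or immediate) to NS.NF through it, and finally restores
 * the scratch register from S.TF.
 */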
10169
10170 #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
10171 TF) \
10172 do { \
10173 if (type == BPF_WRITE) { \
10174 SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \
10175 OFF, TF); \
10176 } else { \
10177 SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
10178 S, NS, F, NF, SIZE, OFF); \
10179 } \
10180 } while (0)
10181
10182 #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \
10183 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \
10184 S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
10185
10186 static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
10187 const struct bpf_insn *si,
10188 struct bpf_insn *insn_buf,
10189 struct bpf_prog *prog, u32 *target_size)
10190 {
10191 int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
10192 struct bpf_insn *insn = insn_buf;
10193
10194 switch (si->off) {
10195 case offsetof(struct bpf_sock_addr, user_family):
10196 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10197 struct sockaddr, uaddr, sa_family);
10198 break;
10199
10200 case offsetof(struct bpf_sock_addr, user_ip4):
10201 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10202 struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
10203 sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
10204 break;
10205
10206 case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
10207 off = si->off;
10208 off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
10209 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10210 struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
10211 sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
10212 tmp_reg);
10213 break;
10214
10215 case offsetof(struct bpf_sock_addr, user_port):
10216 /* To get the port we need to know sa_family first and then treat
10217 * sockaddr as either sockaddr_in or sockaddr_in6.
10218 * Though we can simplify since the port field has the same offset
10219 * and size in both structures.
10220 * Here we check this invariant (at build time) and use just one of
10221 * the structures.
10222 */
10223 BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
10224 offsetof(struct sockaddr_in6, sin6_port));
10225 BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
10226 sizeof_field(struct sockaddr_in6, sin6_port));
10227 /* Account for sin6_port being smaller than user_port. */
10228 port_size = min(port_size, BPF_LDST_BYTES(si));
10229 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10230 struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
10231 sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
10232 break;
10233
10234 case offsetof(struct bpf_sock_addr, family):
10235 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10236 struct sock, sk, sk_family);
10237 break;
10238
10239 case offsetof(struct bpf_sock_addr, type):
10240 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10241 struct sock, sk, sk_type);
10242 break;
10243
10244 case offsetof(struct bpf_sock_addr, protocol):
10245 SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
10246 struct sock, sk, sk_protocol);
10247 break;
10248
10249 case offsetof(struct bpf_sock_addr, msg_src_ip4):
10250 /* Treat t_ctx as struct in_addr for msg_src_ip4. */
10251 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10252 struct bpf_sock_addr_kern, struct in_addr, t_ctx,
10253 s_addr, BPF_SIZE(si->code), 0, tmp_reg);
10254 break;
10255
10256 case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
10257 msg_src_ip6[3]):
10258 off = si->off;
10259 off -= offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
10260 /* Treat t_ctx as struct in6_addr for msg_src_ip6. */
10261 SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
10262 struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
10263 s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
10264 break;
10265 case offsetof(struct bpf_sock_addr, sk):
10266 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
10267 si->dst_reg, si->src_reg,
10268 offsetof(struct bpf_sock_addr_kern, sk));
10269 break;
10270 }
10271
10272 return insn - insn_buf;
10273 }
10274
10275 static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
10276 const struct bpf_insn *si,
10277 struct bpf_insn *insn_buf,
10278 struct bpf_prog *prog,
10279 u32 *target_size)
10280 {
10281 struct bpf_insn *insn = insn_buf;
10282 int off;
10283
10284 /* Helper macro for adding read access to tcp_sock or sock fields. */
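/* The read is guarded by bpf_sock_ops_kern->is_fullsock so that tcp_sock or
 * sock fields are never dereferenced for a non-full socket; when dst_reg and
 * src_reg are the same register, a spare register is borrowed and preserved
 * via the temp field of bpf_sock_ops_kern, as in SOCK_OPS_SET_FIELD() below.
 */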
10285 #define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
10286 do { \
10287 int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \
10288 BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
10289 sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
10290 if (si->dst_reg == reg || si->src_reg == reg) \
10291 reg--; \
10292 if (si->dst_reg == reg || si->src_reg == reg) \
10293 reg--; \
10294 if (si->dst_reg == si->src_reg) { \
10295 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
10296 offsetof(struct bpf_sock_ops_kern, \
10297 temp)); \
10298 fullsock_reg = reg; \
10299 jmp += 2; \
10300 } \
10301 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10302 struct bpf_sock_ops_kern, \
10303 is_fullsock), \
10304 fullsock_reg, si->src_reg, \
10305 offsetof(struct bpf_sock_ops_kern, \
10306 is_fullsock)); \
10307 *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
10308 if (si->dst_reg == si->src_reg) \
10309 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10310 offsetof(struct bpf_sock_ops_kern, \
10311 temp)); \
10312 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10313 struct bpf_sock_ops_kern, sk),\
10314 si->dst_reg, si->src_reg, \
10315 offsetof(struct bpf_sock_ops_kern, sk));\
10316 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
10317 OBJ_FIELD), \
10318 si->dst_reg, si->dst_reg, \
10319 offsetof(OBJ, OBJ_FIELD)); \
10320 if (si->dst_reg == si->src_reg) { \
10321 *insn++ = BPF_JMP_A(1); \
10322 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10323 offsetof(struct bpf_sock_ops_kern, \
10324 temp)); \
10325 } \
10326 } while (0)
10327
10328 #define SOCK_OPS_GET_SK() \
10329 do { \
10330 int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \
10331 if (si->dst_reg == reg || si->src_reg == reg) \
10332 reg--; \
10333 if (si->dst_reg == reg || si->src_reg == reg) \
10334 reg--; \
10335 if (si->dst_reg == si->src_reg) { \
10336 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
10337 offsetof(struct bpf_sock_ops_kern, \
10338 temp)); \
10339 fullsock_reg = reg; \
10340 jmp += 2; \
10341 } \
10342 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10343 struct bpf_sock_ops_kern, \
10344 is_fullsock), \
10345 fullsock_reg, si->src_reg, \
10346 offsetof(struct bpf_sock_ops_kern, \
10347 is_fullsock)); \
10348 *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
10349 if (si->dst_reg == si->src_reg) \
10350 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10351 offsetof(struct bpf_sock_ops_kern, \
10352 temp)); \
10353 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10354 struct bpf_sock_ops_kern, sk),\
10355 si->dst_reg, si->src_reg, \
10356 offsetof(struct bpf_sock_ops_kern, sk));\
10357 if (si->dst_reg == si->src_reg) { \
10358 *insn++ = BPF_JMP_A(1); \
10359 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
10360 offsetof(struct bpf_sock_ops_kern, \
10361 temp)); \
10362 } \
10363 } while (0)
10364
10365 #define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
10366 SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)
10367
10368 /* Helper macro for adding write access to tcp_sock or sock fields.
10369 * The macro is called with two registers, dst_reg which contains a pointer
10370 * to ctx (context) and src_reg which contains the value that should be
10371 * stored. However, we need an additional register since we cannot overwrite
10372 * dst_reg because it may be used later in the program.
10373 * Instead we "borrow" one of the other registers. We first save its value
10374 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
10375 * it at the end of the macro.
10376 */
10377 #define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
10378 do { \
10379 int reg = BPF_REG_9; \
10380 BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
10381 sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
10382 if (si->dst_reg == reg || si->src_reg == reg) \
10383 reg--; \
10384 if (si->dst_reg == reg || si->src_reg == reg) \
10385 reg--; \
10386 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
10387 offsetof(struct bpf_sock_ops_kern, \
10388 temp)); \
10389 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10390 struct bpf_sock_ops_kern, \
10391 is_fullsock), \
10392 reg, si->dst_reg, \
10393 offsetof(struct bpf_sock_ops_kern, \
10394 is_fullsock)); \
10395 *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
10396 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
10397 struct bpf_sock_ops_kern, sk),\
10398 reg, si->dst_reg, \
10399 offsetof(struct bpf_sock_ops_kern, sk));\
10400 *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \
10401 BPF_MEM | BPF_CLASS(si->code), \
10402 reg, si->src_reg, \
10403 offsetof(OBJ, OBJ_FIELD), \
10404 si->imm); \
10405 *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
10406 offsetof(struct bpf_sock_ops_kern, \
10407 temp)); \
10408 } while (0)
10409
10410 #define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
10411 do { \
10412 if (TYPE == BPF_WRITE) \
10413 SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
10414 else \
10415 SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
10416 } while (0)
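/* For example, the sk_txhash case below uses
 * SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, struct sock, type)
 * so a single case handles both BPF_READ and BPF_WRITE of sk->sk_txhash.
 */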
10417
10418 switch (si->off) {
10419 case offsetof(struct bpf_sock_ops, op):
10420 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10421 op),
10422 si->dst_reg, si->src_reg,
10423 offsetof(struct bpf_sock_ops_kern, op));
10424 break;
10425
10426 case offsetof(struct bpf_sock_ops, replylong[0]) ...
10427 offsetof(struct bpf_sock_ops, replylong[3]):
10428 BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
10429 sizeof_field(struct bpf_sock_ops_kern, reply));
10430 BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
10431 sizeof_field(struct bpf_sock_ops_kern, replylong));
10432 off = si->off;
10433 off -= offsetof(struct bpf_sock_ops, replylong[0]);
10434 off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
10435 if (type == BPF_WRITE)
10436 *insn++ = BPF_EMIT_STORE(BPF_W, si, off);
10437 else
10438 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
10439 off);
10440 break;
10441
10442 case offsetof(struct bpf_sock_ops, family):
10443 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
10444
10445 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10446 struct bpf_sock_ops_kern, sk),
10447 si->dst_reg, si->src_reg,
10448 offsetof(struct bpf_sock_ops_kern, sk));
10449 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10450 offsetof(struct sock_common, skc_family));
10451 break;
10452
10453 case offsetof(struct bpf_sock_ops, remote_ip4):
10454 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
10455
10456 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10457 struct bpf_sock_ops_kern, sk),
10458 si->dst_reg, si->src_reg,
10459 offsetof(struct bpf_sock_ops_kern, sk));
10460 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10461 offsetof(struct sock_common, skc_daddr));
10462 break;
10463
10464 case offsetof(struct bpf_sock_ops, local_ip4):
10465 BUILD_BUG_ON(sizeof_field(struct sock_common,
10466 skc_rcv_saddr) != 4);
10467
10468 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10469 struct bpf_sock_ops_kern, sk),
10470 si->dst_reg, si->src_reg,
10471 offsetof(struct bpf_sock_ops_kern, sk));
10472 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10473 offsetof(struct sock_common,
10474 skc_rcv_saddr));
10475 break;
10476
10477 case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
10478 offsetof(struct bpf_sock_ops, remote_ip6[3]):
10479 #if IS_ENABLED(CONFIG_IPV6)
10480 BUILD_BUG_ON(sizeof_field(struct sock_common,
10481 skc_v6_daddr.s6_addr32[0]) != 4);
10482
10483 off = si->off;
10484 off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
10485 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10486 struct bpf_sock_ops_kern, sk),
10487 si->dst_reg, si->src_reg,
10488 offsetof(struct bpf_sock_ops_kern, sk));
10489 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10490 offsetof(struct sock_common,
10491 skc_v6_daddr.s6_addr32[0]) +
10492 off);
10493 #else
10494 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10495 #endif
10496 break;
10497
10498 case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
10499 offsetof(struct bpf_sock_ops, local_ip6[3]):
10500 #if IS_ENABLED(CONFIG_IPV6)
10501 BUILD_BUG_ON(sizeof_field(struct sock_common,
10502 skc_v6_rcv_saddr.s6_addr32[0]) != 4);
10503
10504 off = si->off;
10505 off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
10506 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10507 struct bpf_sock_ops_kern, sk),
10508 si->dst_reg, si->src_reg,
10509 offsetof(struct bpf_sock_ops_kern, sk));
10510 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10511 offsetof(struct sock_common,
10512 skc_v6_rcv_saddr.s6_addr32[0]) +
10513 off);
10514 #else
10515 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10516 #endif
10517 break;
10518
10519 case offsetof(struct bpf_sock_ops, remote_port):
10520 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
10521
10522 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10523 struct bpf_sock_ops_kern, sk),
10524 si->dst_reg, si->src_reg,
10525 offsetof(struct bpf_sock_ops_kern, sk));
10526 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10527 offsetof(struct sock_common, skc_dport));
10528 #ifndef __BIG_ENDIAN_BITFIELD
10529 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
10530 #endif
10531 break;
10532
10533 case offsetof(struct bpf_sock_ops, local_port):
10534 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
10535
10536 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10537 struct bpf_sock_ops_kern, sk),
10538 si->dst_reg, si->src_reg,
10539 offsetof(struct bpf_sock_ops_kern, sk));
10540 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10541 offsetof(struct sock_common, skc_num));
10542 break;
10543
10544 case offsetof(struct bpf_sock_ops, is_fullsock):
10545 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10546 struct bpf_sock_ops_kern,
10547 is_fullsock),
10548 si->dst_reg, si->src_reg,
10549 offsetof(struct bpf_sock_ops_kern,
10550 is_fullsock));
10551 break;
10552
10553 case offsetof(struct bpf_sock_ops, state):
10554 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);
10555
10556 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10557 struct bpf_sock_ops_kern, sk),
10558 si->dst_reg, si->src_reg,
10559 offsetof(struct bpf_sock_ops_kern, sk));
10560 *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
10561 offsetof(struct sock_common, skc_state));
10562 break;
10563
10564 case offsetof(struct bpf_sock_ops, rtt_min):
10565 BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
10566 sizeof(struct minmax));
10567 BUILD_BUG_ON(sizeof(struct minmax) <
10568 sizeof(struct minmax_sample));
10569
10570 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10571 struct bpf_sock_ops_kern, sk),
10572 si->dst_reg, si->src_reg,
10573 offsetof(struct bpf_sock_ops_kern, sk));
10574 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10575 offsetof(struct tcp_sock, rtt_min) +
10576 sizeof_field(struct minmax_sample, t));
10577 break;
10578
10579 case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
10580 SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
10581 struct tcp_sock);
10582 break;
10583
10584 case offsetof(struct bpf_sock_ops, sk_txhash):
10585 SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
10586 struct sock, type);
10587 break;
10588 case offsetof(struct bpf_sock_ops, snd_cwnd):
10589 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
10590 break;
10591 case offsetof(struct bpf_sock_ops, srtt_us):
10592 SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
10593 break;
10594 case offsetof(struct bpf_sock_ops, snd_ssthresh):
10595 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
10596 break;
10597 case offsetof(struct bpf_sock_ops, rcv_nxt):
10598 SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
10599 break;
10600 case offsetof(struct bpf_sock_ops, snd_nxt):
10601 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
10602 break;
10603 case offsetof(struct bpf_sock_ops, snd_una):
10604 SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
10605 break;
10606 case offsetof(struct bpf_sock_ops, mss_cache):
10607 SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
10608 break;
10609 case offsetof(struct bpf_sock_ops, ecn_flags):
10610 SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
10611 break;
10612 case offsetof(struct bpf_sock_ops, rate_delivered):
10613 SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
10614 break;
10615 case offsetof(struct bpf_sock_ops, rate_interval_us):
10616 SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
10617 break;
10618 case offsetof(struct bpf_sock_ops, packets_out):
10619 SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
10620 break;
10621 case offsetof(struct bpf_sock_ops, retrans_out):
10622 SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
10623 break;
10624 case offsetof(struct bpf_sock_ops, total_retrans):
10625 SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
10626 break;
10627 case offsetof(struct bpf_sock_ops, segs_in):
10628 SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
10629 break;
10630 case offsetof(struct bpf_sock_ops, data_segs_in):
10631 SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
10632 break;
10633 case offsetof(struct bpf_sock_ops, segs_out):
10634 SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
10635 break;
10636 case offsetof(struct bpf_sock_ops, data_segs_out):
10637 SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
10638 break;
10639 case offsetof(struct bpf_sock_ops, lost_out):
10640 SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
10641 break;
10642 case offsetof(struct bpf_sock_ops, sacked_out):
10643 SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
10644 break;
10645 case offsetof(struct bpf_sock_ops, bytes_received):
10646 SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
10647 break;
10648 case offsetof(struct bpf_sock_ops, bytes_acked):
10649 SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
10650 break;
10651 case offsetof(struct bpf_sock_ops, sk):
10652 SOCK_OPS_GET_SK();
10653 break;
10654 case offsetof(struct bpf_sock_ops, skb_data_end):
10655 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10656 skb_data_end),
10657 si->dst_reg, si->src_reg,
10658 offsetof(struct bpf_sock_ops_kern,
10659 skb_data_end));
10660 break;
10661 case offsetof(struct bpf_sock_ops, skb_data):
10662 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10663 skb),
10664 si->dst_reg, si->src_reg,
10665 offsetof(struct bpf_sock_ops_kern,
10666 skb));
10667 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
10668 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
10669 si->dst_reg, si->dst_reg,
10670 offsetof(struct sk_buff, data));
10671 break;
10672 case offsetof(struct bpf_sock_ops, skb_len):
10673 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10674 skb),
10675 si->dst_reg, si->src_reg,
10676 offsetof(struct bpf_sock_ops_kern,
10677 skb));
10678 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
10679 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
10680 si->dst_reg, si->dst_reg,
10681 offsetof(struct sk_buff, len));
10682 break;
10683 case offsetof(struct bpf_sock_ops, skb_tcp_flags):
10684 off = offsetof(struct sk_buff, cb);
10685 off += offsetof(struct tcp_skb_cb, tcp_flags);
10686 *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
10687 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10688 skb),
10689 si->dst_reg, si->src_reg,
10690 offsetof(struct bpf_sock_ops_kern,
10691 skb));
10692 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
10693 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
10694 tcp_flags),
10695 si->dst_reg, si->dst_reg, off);
10696 break;
10697 case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
10698 struct bpf_insn *jmp_on_null_skb;
10699
10700 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
10701 skb),
10702 si->dst_reg, si->src_reg,
10703 offsetof(struct bpf_sock_ops_kern,
10704 skb));
10705 /* Reserve one insn to test skb == NULL */
10706 jmp_on_null_skb = insn++;
10707 insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
10708 *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
10709 bpf_target_off(struct skb_shared_info,
10710 hwtstamps, 8,
10711 target_size));
10712 *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
10713 insn - jmp_on_null_skb - 1);
10714 break;
10715 }
10716 }
10717 return insn - insn_buf;
10718 }
10719
10720 /* data_end = skb->data + skb_headlen() */
10721 static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
10722 struct bpf_insn *insn)
10723 {
10724 int reg;
10725 int temp_reg_off = offsetof(struct sk_buff, cb) +
10726 offsetof(struct sk_skb_cb, temp_reg);
10727
10728 if (si->src_reg == si->dst_reg) {
10729 /* We need an extra register; choose and save one. */
10730 reg = BPF_REG_9;
10731 if (si->src_reg == reg || si->dst_reg == reg)
10732 reg--;
10733 if (si->src_reg == reg || si->dst_reg == reg)
10734 reg--;
10735 *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
10736 } else {
10737 reg = si->dst_reg;
10738 }
10739
10740 /* reg = skb->data */
10741 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
10742 reg, si->src_reg,
10743 offsetof(struct sk_buff, data));
10744 /* AX = skb->len */
10745 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
10746 BPF_REG_AX, si->src_reg,
10747 offsetof(struct sk_buff, len));
10748 /* reg = skb->data + skb->len */
10749 *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
10750 /* AX = skb->data_len */
10751 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
10752 BPF_REG_AX, si->src_reg,
10753 offsetof(struct sk_buff, data_len));
10754
10755 /* reg = skb->data + skb->len - skb->data_len */
10756 *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);
10757
10758 if (si->src_reg == si->dst_reg) {
10759 /* Restore the saved register */
10760 *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
10761 *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
10762 *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
10763 }
10764
10765 return insn;
10766 }
10767
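/* Translate SK_SKB program accesses to struct __sk_buff fields into loads and
 * stores on the underlying struct sk_buff; offsets not handled here fall back
 * to the generic bpf_convert_ctx_access().
 */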
10768 static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
10769 const struct bpf_insn *si,
10770 struct bpf_insn *insn_buf,
10771 struct bpf_prog *prog, u32 *target_size)
10772 {
10773 struct bpf_insn *insn = insn_buf;
10774 int off;
10775
10776 switch (si->off) {
10777 case offsetof(struct __sk_buff, data_end):
10778 insn = bpf_convert_data_end_access(si, insn);
10779 break;
10780 case offsetof(struct __sk_buff, cb[0]) ...
10781 offsetofend(struct __sk_buff, cb[4]) - 1:
10782 BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
10783 BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
10784 offsetof(struct sk_skb_cb, data)) %
10785 sizeof(__u64));
10786
10787 prog->cb_access = 1;
10788 off = si->off;
10789 off -= offsetof(struct __sk_buff, cb[0]);
10790 off += offsetof(struct sk_buff, cb);
10791 off += offsetof(struct sk_skb_cb, data);
10792 if (type == BPF_WRITE)
10793 *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
10794 else
10795 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
10796 si->src_reg, off);
10797 break;
10798
10799
10800 default:
10801 return bpf_convert_ctx_access(type, si, insn_buf, prog,
10802 target_size);
10803 }
10804
10805 return insn - insn_buf;
10806 }
10807
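/* Translate SK_MSG program accesses to struct sk_msg_md fields into loads
 * from the kernel-side struct sk_msg and its socket's struct sock_common.
 */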
10808 static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
10809 const struct bpf_insn *si,
10810 struct bpf_insn *insn_buf,
10811 struct bpf_prog *prog, u32 *target_size)
10812 {
10813 struct bpf_insn *insn = insn_buf;
10814 #if IS_ENABLED(CONFIG_IPV6)
10815 int off;
10816 #endif
10817
10818 /* The ctx conversion relies on the sg element being first in struct sk_msg */
10819 BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
10820
10821 switch (si->off) {
10822 case offsetof(struct sk_msg_md, data):
10823 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
10824 si->dst_reg, si->src_reg,
10825 offsetof(struct sk_msg, data));
10826 break;
10827 case offsetof(struct sk_msg_md, data_end):
10828 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
10829 si->dst_reg, si->src_reg,
10830 offsetof(struct sk_msg, data_end));
10831 break;
10832 case offsetof(struct sk_msg_md, family):
10833 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
10834
10835 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10836 struct sk_msg, sk),
10837 si->dst_reg, si->src_reg,
10838 offsetof(struct sk_msg, sk));
10839 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10840 offsetof(struct sock_common, skc_family));
10841 break;
10842
10843 case offsetof(struct sk_msg_md, remote_ip4):
10844 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
10845
10846 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10847 struct sk_msg, sk),
10848 si->dst_reg, si->src_reg,
10849 offsetof(struct sk_msg, sk));
10850 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10851 offsetof(struct sock_common, skc_daddr));
10852 break;
10853
10854 case offsetof(struct sk_msg_md, local_ip4):
10855 BUILD_BUG_ON(sizeof_field(struct sock_common,
10856 skc_rcv_saddr) != 4);
10857
10858 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10859 struct sk_msg, sk),
10860 si->dst_reg, si->src_reg,
10861 offsetof(struct sk_msg, sk));
10862 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10863 offsetof(struct sock_common,
10864 skc_rcv_saddr));
10865 break;
10866
10867 case offsetof(struct sk_msg_md, remote_ip6[0]) ...
10868 offsetof(struct sk_msg_md, remote_ip6[3]):
10869 #if IS_ENABLED(CONFIG_IPV6)
10870 BUILD_BUG_ON(sizeof_field(struct sock_common,
10871 skc_v6_daddr.s6_addr32[0]) != 4);
10872
10873 off = si->off;
10874 off -= offsetof(struct sk_msg_md, remote_ip6[0]);
10875 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10876 struct sk_msg, sk),
10877 si->dst_reg, si->src_reg,
10878 offsetof(struct sk_msg, sk));
10879 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10880 offsetof(struct sock_common,
10881 skc_v6_daddr.s6_addr32[0]) +
10882 off);
10883 #else
10884 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10885 #endif
10886 break;
10887
10888 case offsetof(struct sk_msg_md, local_ip6[0]) ...
10889 offsetof(struct sk_msg_md, local_ip6[3]):
10890 #if IS_ENABLED(CONFIG_IPV6)
10891 BUILD_BUG_ON(sizeof_field(struct sock_common,
10892 skc_v6_rcv_saddr.s6_addr32[0]) != 4);
10893
10894 off = si->off;
10895 off -= offsetof(struct sk_msg_md, local_ip6[0]);
10896 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10897 struct sk_msg, sk),
10898 si->dst_reg, si->src_reg,
10899 offsetof(struct sk_msg, sk));
10900 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
10901 offsetof(struct sock_common,
10902 skc_v6_rcv_saddr.s6_addr32[0]) +
10903 off);
10904 #else
10905 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
10906 #endif
10907 break;
10908
10909 case offsetof(struct sk_msg_md, remote_port):
10910 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
10911
10912 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10913 struct sk_msg, sk),
10914 si->dst_reg, si->src_reg,
10915 offsetof(struct sk_msg, sk));
10916 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10917 offsetof(struct sock_common, skc_dport));
10918 #ifndef __BIG_ENDIAN_BITFIELD
10919 *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
10920 #endif
10921 break;
10922
10923 case offsetof(struct sk_msg_md, local_port):
10924 BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
10925
10926 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
10927 struct sk_msg, sk),
10928 si->dst_reg, si->src_reg,
10929 offsetof(struct sk_msg, sk));
10930 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
10931 offsetof(struct sock_common, skc_num));
10932 break;
10933
10934 case offsetof(struct sk_msg_md, size):
10935 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
10936 si->dst_reg, si->src_reg,
10937 offsetof(struct sk_msg_sg, size));
10938 break;
10939
10940 case offsetof(struct sk_msg_md, sk):
10941 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
10942 si->dst_reg, si->src_reg,
10943 offsetof(struct sk_msg, sk));
10944 break;
10945 }
10946
10947 return insn - insn_buf;
10948 }
10949
10950 const struct bpf_verifier_ops sk_filter_verifier_ops = {
10951 .get_func_proto = sk_filter_func_proto,
10952 .is_valid_access = sk_filter_is_valid_access,
10953 .convert_ctx_access = bpf_convert_ctx_access,
10954 .gen_ld_abs = bpf_gen_ld_abs,
10955 };
10956
10957 const struct bpf_prog_ops sk_filter_prog_ops = {
10958 .test_run = bpf_prog_test_run_skb,
10959 };
10960
10961 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
10962 .get_func_proto = tc_cls_act_func_proto,
10963 .is_valid_access = tc_cls_act_is_valid_access,
10964 .convert_ctx_access = tc_cls_act_convert_ctx_access,
10965 .gen_prologue = tc_cls_act_prologue,
10966 .gen_ld_abs = bpf_gen_ld_abs,
10967 .btf_struct_access = tc_cls_act_btf_struct_access,
10968 };
10969
10970 const struct bpf_prog_ops tc_cls_act_prog_ops = {
10971 .test_run = bpf_prog_test_run_skb,
10972 };
10973
10974 const struct bpf_verifier_ops xdp_verifier_ops = {
10975 .get_func_proto = xdp_func_proto,
10976 .is_valid_access = xdp_is_valid_access,
10977 .convert_ctx_access = xdp_convert_ctx_access,
10978 .gen_prologue = bpf_noop_prologue,
10979 .btf_struct_access = xdp_btf_struct_access,
10980 };
10981
10982 const struct bpf_prog_ops xdp_prog_ops = {
10983 .test_run = bpf_prog_test_run_xdp,
10984 };
10985
10986 const struct bpf_verifier_ops cg_skb_verifier_ops = {
10987 .get_func_proto = cg_skb_func_proto,
10988 .is_valid_access = cg_skb_is_valid_access,
10989 .convert_ctx_access = bpf_convert_ctx_access,
10990 };
10991
10992 const struct bpf_prog_ops cg_skb_prog_ops = {
10993 .test_run = bpf_prog_test_run_skb,
10994 };
10995
10996 const struct bpf_verifier_ops lwt_in_verifier_ops = {
10997 .get_func_proto = lwt_in_func_proto,
10998 .is_valid_access = lwt_is_valid_access,
10999 .convert_ctx_access = bpf_convert_ctx_access,
11000 };
11001
11002 const struct bpf_prog_ops lwt_in_prog_ops = {
11003 .test_run = bpf_prog_test_run_skb,
11004 };
11005
11006 const struct bpf_verifier_ops lwt_out_verifier_ops = {
11007 .get_func_proto = lwt_out_func_proto,
11008 .is_valid_access = lwt_is_valid_access,
11009 .convert_ctx_access = bpf_convert_ctx_access,
11010 };
11011
11012 const struct bpf_prog_ops lwt_out_prog_ops = {
11013 .test_run = bpf_prog_test_run_skb,
11014 };
11015
11016 const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
11017 .get_func_proto = lwt_xmit_func_proto,
11018 .is_valid_access = lwt_is_valid_access,
11019 .convert_ctx_access = bpf_convert_ctx_access,
11020 .gen_prologue = tc_cls_act_prologue,
11021 };
11022
11023 const struct bpf_prog_ops lwt_xmit_prog_ops = {
11024 .test_run = bpf_prog_test_run_skb,
11025 };
11026
11027 const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
11028 .get_func_proto = lwt_seg6local_func_proto,
11029 .is_valid_access = lwt_is_valid_access,
11030 .convert_ctx_access = bpf_convert_ctx_access,
11031 };
11032
11033 const struct bpf_prog_ops lwt_seg6local_prog_ops = {
11034 .test_run = bpf_prog_test_run_skb,
11035 };
11036
11037 const struct bpf_verifier_ops cg_sock_verifier_ops = {
11038 .get_func_proto = sock_filter_func_proto,
11039 .is_valid_access = sock_filter_is_valid_access,
11040 .convert_ctx_access = bpf_sock_convert_ctx_access,
11041 };
11042
11043 const struct bpf_prog_ops cg_sock_prog_ops = {
11044 };
11045
11046 const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
11047 .get_func_proto = sock_addr_func_proto,
11048 .is_valid_access = sock_addr_is_valid_access,
11049 .convert_ctx_access = sock_addr_convert_ctx_access,
11050 };
11051
11052 const struct bpf_prog_ops cg_sock_addr_prog_ops = {
11053 };
11054
11055 const struct bpf_verifier_ops sock_ops_verifier_ops = {
11056 .get_func_proto = sock_ops_func_proto,
11057 .is_valid_access = sock_ops_is_valid_access,
11058 .convert_ctx_access = sock_ops_convert_ctx_access,
11059 };
11060
11061 const struct bpf_prog_ops sock_ops_prog_ops = {
11062 };
11063
11064 const struct bpf_verifier_ops sk_skb_verifier_ops = {
11065 .get_func_proto = sk_skb_func_proto,
11066 .is_valid_access = sk_skb_is_valid_access,
11067 .convert_ctx_access = sk_skb_convert_ctx_access,
11068 .gen_prologue = sk_skb_prologue,
11069 };
11070
11071 const struct bpf_prog_ops sk_skb_prog_ops = {
11072 };
11073
11074 const struct bpf_verifier_ops sk_msg_verifier_ops = {
11075 .get_func_proto = sk_msg_func_proto,
11076 .is_valid_access = sk_msg_is_valid_access,
11077 .convert_ctx_access = sk_msg_convert_ctx_access,
11078 .gen_prologue = bpf_noop_prologue,
11079 };
11080
11081 const struct bpf_prog_ops sk_msg_prog_ops = {
11082 };
11083
11084 const struct bpf_verifier_ops flow_dissector_verifier_ops = {
11085 .get_func_proto = flow_dissector_func_proto,
11086 .is_valid_access = flow_dissector_is_valid_access,
11087 .convert_ctx_access = flow_dissector_convert_ctx_access,
11088 };
11089
11090 const struct bpf_prog_ops flow_dissector_prog_ops = {
11091 .test_run = bpf_prog_test_run_flow_dissector,
11092 };
11093
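/* Detach the socket filter currently attached to @sk, if any. Fails with
 * -EPERM when the filter has been locked with SO_LOCK_FILTER and with
 * -ENOENT when no filter is attached.
 */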
11094 int sk_detach_filter(struct sock *sk)
11095 {
11096 int ret = -ENOENT;
11097 struct sk_filter *filter;
11098
11099 if (sock_flag(sk, SOCK_FILTER_LOCKED))
11100 return -EPERM;
11101
11102 filter = rcu_dereference_protected(sk->sk_filter,
11103 lockdep_sock_is_held(sk));
11104 if (filter) {
11105 RCU_INIT_POINTER(sk->sk_filter, NULL);
11106 sk_filter_uncharge(sk, filter);
11107 ret = 0;
11108 }
11109
11110 return ret;
11111 }
11112 EXPORT_SYMBOL_GPL(sk_detach_filter);
11113
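/* Copy the classic BPF program originally attached to @sk back to user space.
 * When @len is zero only the number of filter blocks is reported; programs
 * without an original classic image cannot be dumped and yield -EACCES.
 */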
11114 int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
11115 {
11116 struct sock_fprog_kern *fprog;
11117 struct sk_filter *filter;
11118 int ret = 0;
11119
11120 sockopt_lock_sock(sk);
11121 filter = rcu_dereference_protected(sk->sk_filter,
11122 lockdep_sock_is_held(sk));
11123 if (!filter)
11124 goto out;
11125
11126 /* We're copying the filter that was originally attached,
11127 * so no conversion/decode is needed anymore. eBPF programs that
11128 * have no original program cannot be dumped through this.
11129 */
11130 ret = -EACCES;
11131 fprog = filter->prog->orig_prog;
11132 if (!fprog)
11133 goto out;
11134
11135 ret = fprog->len;
11136 if (!len)
11137 /* User space only enquires about the number of filter blocks. */
11138 goto out;
11139
11140 ret = -EINVAL;
11141 if (len < fprog->len)
11142 goto out;
11143
11144 ret = -EFAULT;
11145 if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
11146 goto out;
11147
11148 /* Instead of bytes, the API requires the number of filter
11149 * blocks to be returned.
11150 */
11151 ret = fprog->len;
11152 out:
11153 sockopt_release_sock(sk);
11154 return ret;
11155 }
11156
11157 #ifdef CONFIG_INET
11158 static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
11159 struct sock_reuseport *reuse,
11160 struct sock *sk, struct sk_buff *skb,
11161 struct sock *migrating_sk,
11162 u32 hash)
11163 {
11164 reuse_kern->skb = skb;
11165 reuse_kern->sk = sk;
11166 reuse_kern->selected_sk = NULL;
11167 reuse_kern->migrating_sk = migrating_sk;
11168 reuse_kern->data_end = skb->data + skb_headlen(skb);
11169 reuse_kern->hash = hash;
11170 reuse_kern->reuseport_id = reuse->reuseport_id;
11171 reuse_kern->bind_inany = reuse->bind_inany;
11172 }
11173
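/* Run the SK_REUSEPORT program for @reuse and return the socket it selected
 * on SK_PASS (possibly NULL when none was selected), or
 * ERR_PTR(-ECONNREFUSED) when the program returned anything else.
 */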
11174 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
11175 struct bpf_prog *prog, struct sk_buff *skb,
11176 struct sock *migrating_sk,
11177 u32 hash)
11178 {
11179 struct sk_reuseport_kern reuse_kern;
11180 enum sk_action action;
11181
11182 bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
11183 action = bpf_prog_run(prog, &reuse_kern);
11184
11185 if (action == SK_PASS)
11186 return reuse_kern.selected_sk;
11187 else
11188 return ERR_PTR(-ECONNREFUSED);
11189 }
11190
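/* bpf_sk_select_reuseport(): select the socket stored at @key in @map as the
 * one that will receive the packet; the chosen socket must belong to the same
 * reuseport group as the socket the program runs on.
 */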
11191 BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
11192 struct bpf_map *, map, void *, key, u32, flags)
11193 {
11194 bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
11195 struct sock_reuseport *reuse;
11196 struct sock *selected_sk;
11197 int err;
11198
11199 selected_sk = map->ops->map_lookup_elem(map, key);
11200 if (!selected_sk)
11201 return -ENOENT;
11202
11203 reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
11204 if (!reuse) {
11205 /* A reuseport_array only contains sockets with a non-NULL sk_reuseport_cb.
11206 * The only (!reuse) case here is that the sk has already been
11207 * unhashed (e.g. by close()), so treat it as -ENOENT.
11208 *
11209 * Other maps (e.g. sock_map) do not provide this guarantee and
11210 * the sk may never be in the reuseport group to begin with.
11211 */
11212 err = is_sockarray ? -ENOENT : -EINVAL;
11213 goto error;
11214 }
11215
11216 if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
11217 struct sock *sk = reuse_kern->sk;
11218
11219 if (sk->sk_protocol != selected_sk->sk_protocol) {
11220 err = -EPROTOTYPE;
11221 } else if (sk->sk_family != selected_sk->sk_family) {
11222 err = -EAFNOSUPPORT;
11223 } else {
11224 /* Catch all. Likely bound to a different sockaddr. */
11225 err = -EBADFD;
11226 }
11227 goto error;
11228 }
11229
11230 reuse_kern->selected_sk = selected_sk;
11231
11232 return 0;
11233 error:
11234 /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
11235 if (sk_is_refcounted(selected_sk))
11236 sock_put(selected_sk);
11237
11238 return err;
11239 }
11240
11241 static const struct bpf_func_proto sk_select_reuseport_proto = {
11242 .func = sk_select_reuseport,
11243 .gpl_only = false,
11244 .ret_type = RET_INTEGER,
11245 .arg1_type = ARG_PTR_TO_CTX,
11246 .arg2_type = ARG_CONST_MAP_PTR,
11247 .arg3_type = ARG_PTR_TO_MAP_KEY,
11248 .arg4_type = ARG_ANYTHING,
11249 };
11250
11251 BPF_CALL_4(sk_reuseport_load_bytes,
11252 const struct sk_reuseport_kern *, reuse_kern, u32, offset,
11253 void *, to, u32, len)
11254 {
11255 return ____bpf_skb_load_bytes(reuse_kern->skb, offset, to, len);
11256 }
11257
11258 static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
11259 .func = sk_reuseport_load_bytes,
11260 .gpl_only = false,
11261 .ret_type = RET_INTEGER,
11262 .arg1_type = ARG_PTR_TO_CTX,
11263 .arg2_type = ARG_ANYTHING,
11264 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
11265 .arg4_type = ARG_CONST_SIZE,
11266 };
11267
11268 BPF_CALL_5(sk_reuseport_load_bytes_relative,
11269 const struct sk_reuseport_kern *, reuse_kern, u32, offset,
11270 void *, to, u32, len, u32, start_header)
11271 {
11272 return ____bpf_skb_load_bytes_relative(reuse_kern->skb, offset, to,
11273 len, start_header);
11274 }
11275
11276 static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
11277 .func = sk_reuseport_load_bytes_relative,
11278 .gpl_only = false,
11279 .ret_type = RET_INTEGER,
11280 .arg1_type = ARG_PTR_TO_CTX,
11281 .arg2_type = ARG_ANYTHING,
11282 .arg3_type = ARG_PTR_TO_UNINIT_MEM,
11283 .arg4_type = ARG_CONST_SIZE,
11284 .arg5_type = ARG_ANYTHING,
11285 };
11286
11287 static const struct bpf_func_proto *
11288 sk_reuseport_func_proto(enum bpf_func_id func_id,
11289 const struct bpf_prog *prog)
11290 {
11291 switch (func_id) {
11292 case BPF_FUNC_sk_select_reuseport:
11293 return &sk_select_reuseport_proto;
11294 case BPF_FUNC_skb_load_bytes:
11295 return &sk_reuseport_load_bytes_proto;
11296 case BPF_FUNC_skb_load_bytes_relative:
11297 return &sk_reuseport_load_bytes_relative_proto;
11298 case BPF_FUNC_get_socket_cookie:
11299 return &bpf_get_socket_ptr_cookie_proto;
11300 case BPF_FUNC_ktime_get_coarse_ns:
11301 return &bpf_ktime_get_coarse_ns_proto;
11302 default:
11303 return bpf_base_func_proto(func_id);
11304 }
11305 }
11306
11307 static bool
11308 sk_reuseport_is_valid_access(int off, int size,
11309 enum bpf_access_type type,
11310 const struct bpf_prog *prog,
11311 struct bpf_insn_access_aux *info)
11312 {
11313 const u32 size_default = sizeof(__u32);
11314
11315 if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
11316 off % size || type != BPF_READ)
11317 return false;
11318
11319 switch (off) {
11320 case offsetof(struct sk_reuseport_md, data):
11321 info->reg_type = PTR_TO_PACKET;
11322 return size == sizeof(__u64);
11323
11324 case offsetof(struct sk_reuseport_md, data_end):
11325 info->reg_type = PTR_TO_PACKET_END;
11326 return size == sizeof(__u64);
11327
11328 case offsetof(struct sk_reuseport_md, hash):
11329 return size == size_default;
11330
11331 case offsetof(struct sk_reuseport_md, sk):
11332 info->reg_type = PTR_TO_SOCKET;
11333 return size == sizeof(__u64);
11334
11335 case offsetof(struct sk_reuseport_md, migrating_sk):
11336 info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
11337 return size == sizeof(__u64);
11338
11339 /* Fields that allow narrowing */
11340 case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
11341 if (size < sizeof_field(struct sk_buff, protocol))
11342 return false;
11343 fallthrough;
11344 case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
11345 case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
11346 case bpf_ctx_range(struct sk_reuseport_md, len):
11347 bpf_ctx_record_field_size(info, size_default);
11348 return bpf_ctx_narrow_access_ok(off, size, size_default);
11349
11350 default:
11351 return false;
11352 }
11353 }
11354
11355 #define SK_REUSEPORT_LOAD_FIELD(F) ({ \
11356 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
11357 si->dst_reg, si->src_reg, \
11358 bpf_target_off(struct sk_reuseport_kern, F, \
11359 sizeof_field(struct sk_reuseport_kern, F), \
11360 target_size)); \
11361 })
11362
11363 #define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD) \
11364 SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
11365 struct sk_buff, \
11366 skb, \
11367 SKB_FIELD)
11368
11369 #define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \
11370 SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
11371 struct sock, \
11372 sk, \
11373 SK_FIELD)
11374
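/* Translate sk_reuseport_md accesses into loads from sk_reuseport_kern and
 * its nested skb/sk structures using the helpers above.
 */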
11375 static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
11376 const struct bpf_insn *si,
11377 struct bpf_insn *insn_buf,
11378 struct bpf_prog *prog,
11379 u32 *target_size)
11380 {
11381 struct bpf_insn *insn = insn_buf;
11382
11383 switch (si->off) {
11384 case offsetof(struct sk_reuseport_md, data):
11385 SK_REUSEPORT_LOAD_SKB_FIELD(data);
11386 break;
11387
11388 case offsetof(struct sk_reuseport_md, len):
11389 SK_REUSEPORT_LOAD_SKB_FIELD(len);
11390 break;
11391
11392 case offsetof(struct sk_reuseport_md, eth_protocol):
11393 SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
11394 break;
11395
11396 case offsetof(struct sk_reuseport_md, ip_protocol):
11397 SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
11398 break;
11399
11400 case offsetof(struct sk_reuseport_md, data_end):
11401 SK_REUSEPORT_LOAD_FIELD(data_end);
11402 break;
11403
11404 case offsetof(struct sk_reuseport_md, hash):
11405 SK_REUSEPORT_LOAD_FIELD(hash);
11406 break;
11407
11408 case offsetof(struct sk_reuseport_md, bind_inany):
11409 SK_REUSEPORT_LOAD_FIELD(bind_inany);
11410 break;
11411
11412 case offsetof(struct sk_reuseport_md, sk):
11413 SK_REUSEPORT_LOAD_FIELD(sk);
11414 break;
11415
11416 case offsetof(struct sk_reuseport_md, migrating_sk):
11417 SK_REUSEPORT_LOAD_FIELD(migrating_sk);
11418 break;
11419 }
11420
11421 return insn - insn_buf;
11422 }
11423
11424 const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
11425 .get_func_proto = sk_reuseport_func_proto,
11426 .is_valid_access = sk_reuseport_is_valid_access,
11427 .convert_ctx_access = sk_reuseport_convert_ctx_access,
11428 };
11429
11430 const struct bpf_prog_ops sk_reuseport_prog_ops = {
11431 };
11432
11433 DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
11434 EXPORT_SYMBOL(bpf_sk_lookup_enabled);
11435
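/* Select @sk as the result of the BPF sk_lookup after validating that it is
 * RCU-freed, in an acceptable TCP/UDP state and compatible with the packet's
 * L3/L4 protocol.
 */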
11436 BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
11437 struct sock *, sk, u64, flags)
11438 {
11439 if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
11440 BPF_SK_LOOKUP_F_NO_REUSEPORT)))
11441 return -EINVAL;
11442 if (unlikely(sk && sk_is_refcounted(sk)))
11443 return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
11444 if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
11445 return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */
11446 if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
11447 return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */
11448
11449 /* Check if socket is suitable for packet L3/L4 protocol */
11450 if (sk && sk->sk_protocol != ctx->protocol)
11451 return -EPROTOTYPE;
11452 if (sk && sk->sk_family != ctx->family &&
11453 (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
11454 return -EAFNOSUPPORT;
11455
11456 if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
11457 return -EEXIST;
11458
11459 /* Select socket as lookup result */
11460 ctx->selected_sk = sk;
11461 ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
11462 return 0;
11463 }
11464
11465 static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
11466 .func = bpf_sk_lookup_assign,
11467 .gpl_only = false,
11468 .ret_type = RET_INTEGER,
11469 .arg1_type = ARG_PTR_TO_CTX,
11470 .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL,
11471 .arg3_type = ARG_ANYTHING,
11472 };
11473
11474 static const struct bpf_func_proto *
11475 sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
11476 {
11477 switch (func_id) {
11478 case BPF_FUNC_perf_event_output:
11479 return &bpf_event_output_data_proto;
11480 case BPF_FUNC_sk_assign:
11481 return &bpf_sk_lookup_assign_proto;
11482 case BPF_FUNC_sk_release:
11483 return &bpf_sk_release_proto;
11484 default:
11485 return bpf_sk_base_func_proto(func_id);
11486 }
11487 }
11488
11489 static bool sk_lookup_is_valid_access(int off, int size,
11490 enum bpf_access_type type,
11491 const struct bpf_prog *prog,
11492 struct bpf_insn_access_aux *info)
11493 {
11494 if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
11495 return false;
11496 if (off % size != 0)
11497 return false;
11498 if (type != BPF_READ)
11499 return false;
11500
11501 switch (off) {
11502 case offsetof(struct bpf_sk_lookup, sk):
11503 info->reg_type = PTR_TO_SOCKET_OR_NULL;
11504 return size == sizeof(__u64);
11505
11506 case bpf_ctx_range(struct bpf_sk_lookup, family):
11507 case bpf_ctx_range(struct bpf_sk_lookup, protocol):
11508 case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
11509 case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
11510 case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
11511 case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
11512 case bpf_ctx_range(struct bpf_sk_lookup, local_port):
11513 case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
11514 bpf_ctx_record_field_size(info, sizeof(__u32));
11515 return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
11516
11517 case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
11518 /* Allow 4-byte access to 2-byte field for backward compatibility */
11519 if (size == sizeof(__u32))
11520 return true;
11521 bpf_ctx_record_field_size(info, sizeof(__be16));
11522 return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
11523
11524 case offsetofend(struct bpf_sk_lookup, remote_port) ...
11525 offsetof(struct bpf_sk_lookup, local_ip4) - 1:
11526 /* Allow access to zero padding for backward compatibility */
11527 bpf_ctx_record_field_size(info, sizeof(__u16));
11528 return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
11529
11530 default:
11531 return false;
11532 }
11533 }
11534
11535 static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
11536 const struct bpf_insn *si,
11537 struct bpf_insn *insn_buf,
11538 struct bpf_prog *prog,
11539 u32 *target_size)
11540 {
11541 struct bpf_insn *insn = insn_buf;
11542
11543 switch (si->off) {
11544 case offsetof(struct bpf_sk_lookup, sk):
11545 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
11546 offsetof(struct bpf_sk_lookup_kern, selected_sk));
11547 break;
11548
11549 case offsetof(struct bpf_sk_lookup, family):
11550 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11551 bpf_target_off(struct bpf_sk_lookup_kern,
11552 family, 2, target_size));
11553 break;
11554
11555 case offsetof(struct bpf_sk_lookup, protocol):
11556 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11557 bpf_target_off(struct bpf_sk_lookup_kern,
11558 protocol, 2, target_size));
11559 break;
11560
11561 case offsetof(struct bpf_sk_lookup, remote_ip4):
11562 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
11563 bpf_target_off(struct bpf_sk_lookup_kern,
11564 v4.saddr, 4, target_size));
11565 break;
11566
11567 case offsetof(struct bpf_sk_lookup, local_ip4):
11568 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
11569 bpf_target_off(struct bpf_sk_lookup_kern,
11570 v4.daddr, 4, target_size));
11571 break;
11572
11573 case bpf_ctx_range_till(struct bpf_sk_lookup,
11574 remote_ip6[0], remote_ip6[3]): {
11575 #if IS_ENABLED(CONFIG_IPV6)
11576 int off = si->off;
11577
11578 off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
11579 off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
11580 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
11581 offsetof(struct bpf_sk_lookup_kern, v6.saddr));
11582 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
11583 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
11584 #else
11585 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11586 #endif
11587 break;
11588 }
11589 case bpf_ctx_range_till(struct bpf_sk_lookup,
11590 local_ip6[0], local_ip6[3]): {
11591 #if IS_ENABLED(CONFIG_IPV6)
11592 int off = si->off;
11593
11594 off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
11595 off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
11596 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
11597 offsetof(struct bpf_sk_lookup_kern, v6.daddr));
11598 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
11599 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
11600 #else
11601 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11602 #endif
11603 break;
11604 }
11605 case offsetof(struct bpf_sk_lookup, remote_port):
11606 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11607 bpf_target_off(struct bpf_sk_lookup_kern,
11608 sport, 2, target_size));
11609 break;
11610
11611 case offsetofend(struct bpf_sk_lookup, remote_port):
11612 *target_size = 2;
11613 *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
11614 break;
11615
11616 case offsetof(struct bpf_sk_lookup, local_port):
11617 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
11618 bpf_target_off(struct bpf_sk_lookup_kern,
11619 dport, 2, target_size));
11620 break;
11621
11622 case offsetof(struct bpf_sk_lookup, ingress_ifindex):
11623 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
11624 bpf_target_off(struct bpf_sk_lookup_kern,
11625 ingress_ifindex, 4, target_size));
11626 break;
11627 }
11628
11629 return insn - insn_buf;
11630 }
11631
11632 const struct bpf_prog_ops sk_lookup_prog_ops = {
11633 .test_run = bpf_prog_test_run_sk_lookup,
11634 };
11635
11636 const struct bpf_verifier_ops sk_lookup_verifier_ops = {
11637 .get_func_proto = sk_lookup_func_proto,
11638 .is_valid_access = sk_lookup_is_valid_access,
11639 .convert_ctx_access = sk_lookup_convert_ctx_access,
11640 };
11641
11642 #endif /* CONFIG_INET */
11643
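/* The XDP dispatcher replaces the indirect call into the attached XDP program
 * with a direct call; bpf_prog_change_xdp() re-patches it whenever the
 * program attached to a device changes.
 */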
11644 DEFINE_BPF_DISPATCHER(xdp)
11645
11646 void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
11647 {
11648 bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
11649 }
11650
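/* Resolve the BTF IDs of every socket type listed in BTF_SOCK_TYPE_xxx into
 * the global btf_sock_ids[] table used by the skc_to_*() casts below.
 */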
11651 BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
11652 #define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
11653 BTF_SOCK_TYPE_xxx
11654 #undef BTF_SOCK_TYPE
11655
11656 BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
11657 {
11658 /* The tcp6_sock type is not generated in DWARF, and hence not in BTF;
11659 * trigger an explicit type generation here.
11660 */
11661 BTF_TYPE_EMIT(struct tcp6_sock);
11662 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
11663 sk->sk_family == AF_INET6)
11664 return (unsigned long)sk;
11665
11666 return (unsigned long)NULL;
11667 }
11668
11669 const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
11670 .func = bpf_skc_to_tcp6_sock,
11671 .gpl_only = false,
11672 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11673 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11674 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
11675 };
11676
11677 BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
11678 {
11679 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
11680 return (unsigned long)sk;
11681
11682 return (unsigned long)NULL;
11683 }
11684
11685 const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
11686 .func = bpf_skc_to_tcp_sock,
11687 .gpl_only = false,
11688 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11689 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11690 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
11691 };
11692
11693 BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
11694 {
11695 /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
11696 * generated if CONFIG_INET=n. Trigger an explicit generation here.
11697 */
11698 BTF_TYPE_EMIT(struct inet_timewait_sock);
11699 BTF_TYPE_EMIT(struct tcp_timewait_sock);
11700
11701 #ifdef CONFIG_INET
11702 if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
11703 return (unsigned long)sk;
11704 #endif
11705
11706 #if IS_BUILTIN(CONFIG_IPV6)
11707 if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
11708 return (unsigned long)sk;
11709 #endif
11710
11711 return (unsigned long)NULL;
11712 }
11713
11714 const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
11715 .func = bpf_skc_to_tcp_timewait_sock,
11716 .gpl_only = false,
11717 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11718 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11719 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
11720 };
11721
11722 BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
11723 {
11724 #ifdef CONFIG_INET
11725 if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
11726 return (unsigned long)sk;
11727 #endif
11728
11729 #if IS_BUILTIN(CONFIG_IPV6)
11730 if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
11731 return (unsigned long)sk;
11732 #endif
11733
11734 return (unsigned long)NULL;
11735 }
11736
11737 const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
11738 .func = bpf_skc_to_tcp_request_sock,
11739 .gpl_only = false,
11740 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11741 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11742 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
11743 };
11744
11745 BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
11746 {
11747 /* The udp6_sock type is not generated in DWARF, and hence not in BTF;
11748 * trigger an explicit type generation here.
11749 */
11750 BTF_TYPE_EMIT(struct udp6_sock);
11751 if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
11752 sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
11753 return (unsigned long)sk;
11754
11755 return (unsigned long)NULL;
11756 }
11757
11758 const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
11759 .func = bpf_skc_to_udp6_sock,
11760 .gpl_only = false,
11761 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11762 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11763 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
11764 };
11765
11766 BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
11767 {
11768 /* The unix_sock type is not generated in DWARF, and hence not in BTF;
11769 * trigger an explicit type generation here.
11770 */
11771 BTF_TYPE_EMIT(struct unix_sock);
11772 if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX)
11773 return (unsigned long)sk;
11774
11775 return (unsigned long)NULL;
11776 }
11777
11778 const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
11779 .func = bpf_skc_to_unix_sock,
11780 .gpl_only = false,
11781 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11782 .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
11783 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
11784 };
11785
11786 BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
11787 {
11788 BTF_TYPE_EMIT(struct mptcp_sock);
11789 return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
11790 }
11791
11792 const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
11793 .func = bpf_skc_to_mptcp_sock,
11794 .gpl_only = false,
11795 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11796 .arg1_type = ARG_PTR_TO_SOCK_COMMON,
11797 .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
11798 };
11799
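/* Return the struct socket backing @file, or NULL if @file is not a socket. */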
11800 BPF_CALL_1(bpf_sock_from_file, struct file *, file)
11801 {
11802 return (unsigned long)sock_from_file(file);
11803 }
11804
11805 BTF_ID_LIST(bpf_sock_from_file_btf_ids)
11806 BTF_ID(struct, socket)
11807 BTF_ID(struct, file)
11808
11809 const struct bpf_func_proto bpf_sock_from_file_proto = {
11810 .func = bpf_sock_from_file,
11811 .gpl_only = false,
11812 .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
11813 .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
11814 .arg1_type = ARG_PTR_TO_BTF_ID,
11815 .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
11816 };
11817
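/* Helpers shared by the socket-related program types; the skc_to_*() casts
 * additionally require perfmon_capable().
 */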
11818 static const struct bpf_func_proto *
11819 bpf_sk_base_func_proto(enum bpf_func_id func_id)
11820 {
11821 const struct bpf_func_proto *func;
11822
11823 switch (func_id) {
11824 case BPF_FUNC_skc_to_tcp6_sock:
11825 func = &bpf_skc_to_tcp6_sock_proto;
11826 break;
11827 case BPF_FUNC_skc_to_tcp_sock:
11828 func = &bpf_skc_to_tcp_sock_proto;
11829 break;
11830 case BPF_FUNC_skc_to_tcp_timewait_sock:
11831 func = &bpf_skc_to_tcp_timewait_sock_proto;
11832 break;
11833 case BPF_FUNC_skc_to_tcp_request_sock:
11834 func = &bpf_skc_to_tcp_request_sock_proto;
11835 break;
11836 case BPF_FUNC_skc_to_udp6_sock:
11837 func = &bpf_skc_to_udp6_sock_proto;
11838 break;
11839 case BPF_FUNC_skc_to_unix_sock:
11840 func = &bpf_skc_to_unix_sock_proto;
11841 break;
11842 case BPF_FUNC_skc_to_mptcp_sock:
11843 func = &bpf_skc_to_mptcp_sock_proto;
11844 break;
11845 case BPF_FUNC_ktime_get_coarse_ns:
11846 return &bpf_ktime_get_coarse_ns_proto;
11847 default:
11848 return bpf_base_func_proto(func_id);
11849 }
11850
11851 if (!perfmon_capable())
11852 return NULL;
11853
11854 return func;
11855 }
11856
11857 __diag_push();
11858 __diag_ignore_all("-Wmissing-prototypes",
11859 "Global functions as their definitions will be in vmlinux BTF");
11860 __bpf_kfunc int bpf_dynptr_from_skb(struct sk_buff *skb, u64 flags,
11861 struct bpf_dynptr_kern *ptr__uninit)
11862 {
11863 if (flags) {
11864 bpf_dynptr_set_null(ptr__uninit);
11865 return -EINVAL;
11866 }
11867
11868 bpf_dynptr_init(ptr__uninit, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
11869
11870 return 0;
11871 }
11872
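/* Initialize @ptr__uninit as a dynptr covering the xdp_buff, including any
 * fragments; no @flags are currently supported.
 */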
11873 __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
11874 struct bpf_dynptr_kern *ptr__uninit)
11875 {
11876 if (flags) {
11877 bpf_dynptr_set_null(ptr__uninit);
11878 return -EINVAL;
11879 }
11880
11881 bpf_dynptr_init(ptr__uninit, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
11882
11883 return 0;
11884 }
11885
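/* Rewrite the AF_UNIX address a sock_addr hook is operating on to the given
 * sun_path, adjusting uaddrlen accordingly.
 */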
11886 __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
11887 const u8 *sun_path, u32 sun_path__sz)
11888 {
11889 struct sockaddr_un *un;
11890
11891 if (sa_kern->sk->sk_family != AF_UNIX)
11892 return -EINVAL;
11893
11894 /* We do not allow changing the address to an unnamed one, or to one
11895 * larger than the maximum allowed address size for a unix sockaddr.
11896 */
11897 if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
11898 return -EINVAL;
11899
11900 un = (struct sockaddr_un *)sa_kern->uaddr;
11901 memcpy(un->sun_path, sun_path, sun_path__sz);
11902 sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
11903
11904 return 0;
11905 }
11906 __diag_pop();
11907
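/* Same as bpf_dynptr_from_skb() but the resulting dynptr is read-only. */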
11908 int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
11909 struct bpf_dynptr_kern *ptr__uninit)
11910 {
11911 int err;
11912
11913 err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
11914 if (err)
11915 return err;
11916
11917 bpf_dynptr_set_rdonly(ptr__uninit);
11918
11919 return 0;
11920 }
11921
11922 BTF_SET8_START(bpf_kfunc_check_set_skb)
11923 BTF_ID_FLAGS(func, bpf_dynptr_from_skb)
11924 BTF_SET8_END(bpf_kfunc_check_set_skb)
11925
11926 BTF_SET8_START(bpf_kfunc_check_set_xdp)
11927 BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
11928 BTF_SET8_END(bpf_kfunc_check_set_xdp)
11929
11930 BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
11931 BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
11932 BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
11933
11934 static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
11935 .owner = THIS_MODULE,
11936 .set = &bpf_kfunc_check_set_skb,
11937 };
11938
11939 static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
11940 .owner = THIS_MODULE,
11941 .set = &bpf_kfunc_check_set_xdp,
11942 };
11943
11944 static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
11945 .owner = THIS_MODULE,
11946 .set = &bpf_kfunc_check_set_sock_addr,
11947 };
11948
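/* Register the skb, xdp and sock_addr kfunc sets with every program type that
 * may call into them.
 */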
11949 static int __init bpf_kfunc_init(void)
11950 {
11951 int ret;
11952
11953 ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
11954 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
11955 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
11956 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
11957 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
11958 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
11959 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
11960 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
11961 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
11962 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
11963 ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
11964 return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
11965 &bpf_kfunc_set_sock_addr);
11966 }
11967 late_initcall(bpf_kfunc_init);
11968
11969 /* Disables missing prototype warnings */
11970 __diag_push();
11971 __diag_ignore_all("-Wmissing-prototypes",
11972 "Global functions as their definitions will be in vmlinux BTF");
11973
11974 /* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
11975 *
11976 * The function expects a non-NULL pointer to a socket, and invokes the
11977 * protocol specific socket destroy handlers.
11978 *
11979 * The helper can only be called from BPF contexts that have acquired
11980 * the socket lock.
11981 *
11982 * Parameters:
11983 * @sock: Pointer to socket to be destroyed
11984 *
11985 * Return:
11986 * On error, may return EOPNOTSUPP.
11987 * EOPNOTSUPP if the protocol specific destroy handler is not supported.
11988 * 0 otherwise
11989 */
11990 __bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
11991 {
11992 struct sock *sk = (struct sock *)sock;
11993
11994 /* The locking semantics that allow for synchronous execution of the
11995 * destroy handlers are only supported for TCP and UDP.
11996 * Supporting protocols will need to acquire the sock lock in the BPF
11997 * context prior to invoking this kfunc.
11998 */
11999 if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
12000 sk->sk_protocol != IPPROTO_UDP))
12001 return -EOPNOTSUPP;
12002
12003 return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
12004 }
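/* An illustrative sketch (not part of the kernel sources) of how a
 * BPF_TRACE_ITER program might invoke this kfunc, assuming the usual
 * bpf_iter__tcp context layout exposed via vmlinux BTF:
 *
 *	SEC("iter/tcp")
 *	int destroy_sockets(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (skc)
 *			bpf_sock_destroy(skc);
 *		return 0;
 *	}
 */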
12005
12006 __diag_pop();
12007
12008 BTF_SET8_START(bpf_sk_iter_kfunc_ids)
12009 BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
12010 BTF_SET8_END(bpf_sk_iter_kfunc_ids)
12011
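/* Restrict bpf_sock_destroy() to BPF_TRACE_ITER programs. */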
12012 static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
12013 {
12014 if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
12015 prog->expected_attach_type != BPF_TRACE_ITER)
12016 return -EACCES;
12017 return 0;
12018 }
12019
12020 static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
12021 .owner = THIS_MODULE,
12022 .set = &bpf_sk_iter_kfunc_ids,
12023 .filter = tracing_iter_filter,
12024 };
12025
12026 static int init_subsystem(void)
12027 {
12028 return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
12029 }
12030 late_initcall(init_subsystem);
12031