/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/unaligned.h>
#include <asm/cmpxchg.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <linux/bpf_trace.h>

/**
 *	sk_filter_trim_cap - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *	@cap: limit on how short the eBPF program may trim the packet
 *
 * Run the eBPF program and then cut skb->data to the correct size returned
 * by the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
 * than pkt_len we keep the whole skb->data. This is the socket level
 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 *
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
		return -ENOMEM;
	}
	err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
	if (err)
		return err;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		struct sock *save_sk = skb->sk;
		unsigned int pkt_len;

		skb->sk = sk;
		pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
		skb->sk = save_sk;
		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);
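
/* Example (illustrative sketch, not part of the original file): the
 * classic userspace path that ends up in sk_filter_trim_cap(). The
 * one-instruction filter below accepts at most 96 bytes of every
 * packet; a filter returning 0 would drop the packet instead. Error
 * handling is omitted for brevity.
 *
 *	#include <linux/filter.h>
 *	#include <sys/socket.h>
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 96),	// accept, trim to 96 bytes
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 */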

BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb)
{
	return skb_get_poff(skb);
}

BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (nla->nla_len > skb->len - a)
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

BPF_CALL_0(__get_raw_cpu_id)
{
	return raw_smp_processor_id();
}

static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
	.func		= __get_raw_cpu_id,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
};

static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
			      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (skb_field) {
	case SKF_AD_MARK:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, mark));
		break;

	case SKF_AD_PKTTYPE:
		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
		break;

	case SKF_AD_QUEUE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, queue_mapping));
		break;

	case SKF_AD_VLAN_TAG:
	case SKF_AD_VLAN_TAG_PRESENT:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);

		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_tci));
		if (skb_field == SKF_AD_VLAN_TAG) {
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
						~VLAN_TAG_PRESENT);
		} else {
			/* dst_reg >>= 12 */
			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
			/* dst_reg &= 1 */
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
		}
		break;
	}

	return insn - insn_buf;
}

static bool convert_bpf_extensions(struct sock_filter *fp,
				   struct bpf_insn **insnp)
{
	struct bpf_insn *insn = *insnp;
	u32 cnt;

	switch (fp->k) {
	case SKF_AD_OFF + SKF_AD_PROTOCOL:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		/* A = *(u16 *) (CTX + offsetof(protocol)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, protocol));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PKTTYPE:
		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A,
					 BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_IFINDEX:
	case SKF_AD_OFF + SKF_AD_HATYPE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      BPF_REG_TMP, BPF_REG_CTX,
				      offsetof(struct sk_buff, dev));
		/* if (tmp != 0) goto pc + 1 */
		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
		*insn++ = BPF_EXIT_INSN();
		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, ifindex));
		else
			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, type));
		break;

	case SKF_AD_OFF + SKF_AD_MARK:
		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_RXHASH:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
				    offsetof(struct sk_buff, hash));
		break;

	case SKF_AD_OFF + SKF_AD_QUEUE:
		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, vlan_proto));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
	case SKF_AD_OFF + SKF_AD_NLATTR:
	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
	case SKF_AD_OFF + SKF_AD_CPU:
	case SKF_AD_OFF + SKF_AD_RANDOM:
		/* arg1 = CTX */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
		/* arg2 = A */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
		/* arg3 = X */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
		switch (fp->k) {
		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
			break;
		case SKF_AD_OFF + SKF_AD_CPU:
			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
			break;
		case SKF_AD_OFF + SKF_AD_RANDOM:
			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
			bpf_user_rnd_init_once();
			break;
		}
		break;

	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;

	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as an optimization. Placed here
		 * where no-one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}

/**
 *	bpf_convert_filter - convert filter program
 *	@prog: the user passed filter program
 *	@len: the length of the user passed filter program
 *	@new_prog: allocated 'struct bpf_prog' or NULL
 *	@new_len: pointer to store length of converted program
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_prog *new_prog, int *new_len)
{
	int new_flen = 0, pass = 0, target, i, stack_off;
	struct bpf_insn *new_insn, *first_insn = NULL;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	if (new_prog) {
		first_insn = new_prog->insnsi;
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = first_insn;
	fp = prog;

	/* Classic BPF related prologue emission. */
	if (new_prog) {
		/* Classic BPF expects A and X to be reset first. These need
		 * to be guaranteed to be the first two instructions.
		 */
		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
		*new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

		/* All programs must keep CTX in callee saved BPF_REG_CTX.
		 * In eBPF case it's done by the compiler, here we need to
		 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
		 */
		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
	} else {
		new_insn += 3;
	}

	for (i = 0; i < len; fp++, i++) {
		struct bpf_insn tmp_insns[6] = { };
		struct bpf_insn *insn = tmp_insns;

		if (addrs)
			addrs[i] = new_insn - first_insn;

		switch (fp->code) {
		/* All arithmetic insns and skb loads map as-is. */
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_X:
		case BPF_ALU | BPF_MOD | BPF_K:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_ABS | BPF_W:
		case BPF_LD | BPF_ABS | BPF_H:
		case BPF_LD | BPF_ABS | BPF_B:
		case BPF_LD | BPF_IND | BPF_W:
		case BPF_LD | BPF_IND | BPF_H:
		case BPF_LD | BPF_IND | BPF_B:
			/* Check for overloaded BPF extension and
			 * directly convert it if found, otherwise
			 * just move on with mapping.
			 */
			if (BPF_CLASS(fp->code) == BPF_LD &&
			    BPF_MODE(fp->code) == BPF_ABS &&
			    convert_bpf_extensions(fp, &insn))
				break;

			if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
			    fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
				*insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
				/* Error with exception code on div/mod by 0.
				 * For cBPF programs, this was always return 0.
				 */
				*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
				*insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
				*insn++ = BPF_EXIT_INSN();
			}

			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
			break;

		/* Jump transformation cannot use BPF block macros
		 * everywhere as offset calculation and target updates
		 * require a bit more work than the rest, i.e. jump
		 * opcodes map as-is, but offsets need adjustment.
		 */

#define BPF_EMIT_JMP							\
	do {								\
		if (target >= len || target < 0)			\
			goto err;					\
		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
		insn->off -= insn - tmp_insns;				\
	} while (0)

		case BPF_JMP | BPF_JA:
			target = i + fp->k + 1;
			insn->code = fp->code;
			BPF_EMIT_JMP;
			break;

		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
				/* BPF immediates are signed, zero extend
				 * immediate into tmp register and use it
				 * in compare insn.
				 */
				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

				insn->dst_reg = BPF_REG_A;
				insn->src_reg = BPF_REG_TMP;
				bpf_src = BPF_X;
			} else {
				insn->dst_reg = BPF_REG_A;
				insn->imm = fp->k;
				bpf_src = BPF_SRC(fp->code);
				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
			}

			/* Common case where 'jump_false' is next insn. */
			if (fp->jf == 0) {
				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
				target = i + fp->jt + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Convert some jumps when 'jump_true' is next insn. */
			if (fp->jt == 0) {
				switch (BPF_OP(fp->code)) {
				case BPF_JEQ:
					insn->code = BPF_JMP | BPF_JNE | bpf_src;
					break;
				case BPF_JGT:
					insn->code = BPF_JMP | BPF_JLE | bpf_src;
					break;
				case BPF_JGE:
					insn->code = BPF_JMP | BPF_JLT | bpf_src;
					break;
				default:
					goto jmp_rest;
				}

				target = i + fp->jf + 1;
				BPF_EMIT_JMP;
				break;
			}
jmp_rest:
			/* Other jumps are mapped into two insns: Jxx and JA. */
			target = i + fp->jt + 1;
			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
			BPF_EMIT_JMP;
			insn++;

			insn->code = BPF_JMP | BPF_JA;
			target = i + fp->jf + 1;
			BPF_EMIT_JMP;
			break;

		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
		case BPF_LDX | BPF_MSH | BPF_B:
			/* tmp = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
			/* A &= 0xf */
			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
			/* A <<= 2 */
			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
			/* X = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			/* A = tmp */
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
			break;

		/* RET_K is remapped into 2 insns. RET_A case doesn't need an
		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
		 */
		case BPF_RET | BPF_A:
		case BPF_RET | BPF_K:
			if (BPF_RVAL(fp->code) == BPF_K)
				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
							0, fp->k);
			*insn = BPF_EXIT_INSN();
			break;

		/* Store to stack. */
		case BPF_ST:
		case BPF_STX:
			stack_off = fp->k * 4 + 4;
			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
					    BPF_ST ? BPF_REG_A : BPF_REG_X,
					    -stack_off);
			/* check_load_and_stores() verifies that classic BPF can
			 * load from stack only after write, so tracking
			 * stack_depth for ST|STX insns is enough
			 */
			if (new_prog && new_prog->aux->stack_depth < stack_off)
				new_prog->aux->stack_depth = stack_off;
			break;

		/* Load from stack. */
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			stack_off = fp->k * 4 + 4;
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
					    -stack_off);
			break;

		/* A = K or X = K */
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
					      BPF_REG_A : BPF_REG_X, fp->k);
			break;

		/* X = A */
		case BPF_MISC | BPF_TAX:
			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			break;

		/* A = X */
		case BPF_MISC | BPF_TXA:
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
			break;

		/* A = skb->len or X = skb->len */
		case BPF_LD | BPF_W | BPF_LEN:
		case BPF_LDX | BPF_W | BPF_LEN:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
					    offsetof(struct sk_buff, len));
			break;

		/* Access seccomp_data fields. */
		case BPF_LDX | BPF_ABS | BPF_W:
			/* A = *(u32 *) (ctx + K) */
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
			break;

		/* Unknown instruction. */
		default:
			goto err;
		}

		insn++;
		if (new_prog)
			memcpy(new_insn, tmp_insns,
			       sizeof(*insn) * (insn - tmp_insns));
		new_insn += insn - tmp_insns;
	}

	if (!new_prog) {
		/* Only calculating new length. */
		*new_len = new_insn - first_insn;
		return 0;
	}

	pass++;
	if (new_flen != new_insn - first_insn) {
		new_flen = new_insn - first_insn;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	return -EINVAL;
}
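
/* Example (illustrative sketch, not part of the original file): a
 * classic two-instruction filter and the two-pass call pattern that
 * bpf_migrate_filter() below uses. The first call only sizes the eBPF
 * image, the second call emits it:
 *
 *	struct sock_filter old_prog[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_LEN, 0),	// A = skb->len
 *		BPF_STMT(BPF_RET | BPF_A, 0),		// return A
 *	};
 *	int new_len;
 *
 *	bpf_convert_filter(old_prog, 2, NULL, &new_len);     // 1st pass
 *	// ... allocate new_prog with room for new_len insns ...
 *	bpf_convert_filter(old_prog, 2, new_prog, &new_len); // 2nd pass
 */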

/* Security:
 *
 * As we don't want to clear the mem[] array for each packet going through
 * __bpf_prog_run(), we check that filters loaded by the user never try to
 * read a cell that was not previously written, and we check all branches
 * to be sure a malicious user doesn't try to abuse us.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);

	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;

	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_ST:
		case BPF_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_JMP | BPF_JA:
			/* A jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* A jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}
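
/* Example (illustrative sketch, not part of the original file): a
 * filter that check_load_and_stores() rejects with -EINVAL, because
 * M[0] is read before any instruction has stored to it:
 *
 *	struct sock_filter bad[] = {
 *		BPF_STMT(BPF_LD | BPF_MEM, 0),	// A = M[0], never written
 *		BPF_STMT(BPF_RET | BPF_A, 0),
 *	};
 *
 * Storing first, e.g. BPF_STMT(BPF_ST, 0) ahead of the load, makes the
 * same program pass the check.
 */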

static bool chk_code_allowed(u16 code_to_probe)
{
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	if (code_to_probe >= ARRAY_SIZE(codes))
		return false;

	return codes[code_to_probe];
}

static bool bpf_check_basics_ok(const struct sock_filter *filter,
				unsigned int flen)
{
	if (filter == NULL)
		return false;
	if (flen == 0 || flen > BPF_MAXINSNS)
		return false;

	return true;
}

/**
 *	bpf_check_classic - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
			     unsigned int flen)
{
	bool anc_found;
	int pc;

	/* Check the filter code now */
	for (pc = 0; pc < flen; pc++) {
		const struct sock_filter *ftest = &filter[pc];

		/* May we actually operate on this code? */
		if (!chk_code_allowed(ftest->code))
			return -EINVAL;

		/* Some instructions need special checks */
		switch (ftest->code) {
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_K:
			/* Check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_K:
			if (ftest->k >= 32)
				return -EINVAL;
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
			/* Check for invalid memory addresses */
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JA:
			/* Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned int)(flen - pc - 1))
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* Both conditionals must be safe */
			if (pc + ftest->jt + 1 >= flen ||
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
			break;
		case BPF_LD | BPF_W | BPF_ABS:
		case BPF_LD | BPF_H | BPF_ABS:
		case BPF_LD | BPF_B | BPF_ABS:
			anc_found = false;
			if (bpf_anc_helper(ftest) & BPF_ANC)
				anc_found = true;
			/* Ancillary operation unknown or unsupported */
			if (anc_found == false && ftest->k >= SKF_AD_OFF)
				return -EINVAL;
		}
	}

	/* Last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_RET | BPF_K:
	case BPF_RET | BPF_A:
		return check_load_and_stores(filter, flen);
	}

	return -EINVAL;
}
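
/* Example (illustrative sketch, not part of the original file): the
 * smallest filter that passes bpf_check_classic() is a single RET that
 * accepts the whole packet:
 *
 *	struct sock_filter accept_all[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *
 * Dropping the RET, jumping past the end, or dividing by a zero
 * constant would each make the check return -EINVAL instead.
 */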

static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
				      const struct sock_fprog *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct sock_fprog_kern *fkprog;

	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
	if (!fp->orig_prog)
		return -ENOMEM;

	fkprog = fp->orig_prog;
	fkprog->len = fprog->len;

	fkprog->filter = kmemdup(fp->insns, fsize,
				 GFP_KERNEL | __GFP_NOWARN);
	if (!fkprog->filter) {
		kfree(fp->orig_prog);
		return -ENOMEM;
	}

	return 0;
}

static void bpf_release_orig_filter(struct bpf_prog *fp)
{
	struct sock_fprog_kern *fprog = fp->orig_prog;

	if (fprog) {
		kfree(fprog->filter);
		kfree(fprog);
	}
}

static void __bpf_prog_release(struct bpf_prog *prog)
{
	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
	} else {
		bpf_release_orig_filter(prog);
		bpf_prog_free(prog);
	}
}

static void __sk_filter_release(struct sk_filter *fp)
{
	__bpf_prog_release(fp->prog);
	kfree(fp);
}

/**
 *	sk_filter_release_rcu - Release a socket filter by rcu_head
 *	@rcu: rcu_head that contains the sk_filter to free
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

	__sk_filter_release(fp);
}

/**
 *	sk_filter_release - release a socket filter
 *	@fp: filter to remove
 *
 *	Remove a filter from a socket and release its resources.
 */
static void sk_filter_release(struct sk_filter *fp)
{
	if (refcount_dec_and_test(&fp->refcnt))
		call_rcu(&fp->rcu, sk_filter_release_rcu);
}

void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	atomic_sub(filter_size, &sk->sk_omem_alloc);
	sk_filter_release(fp);
}

/* try to charge the socket memory if there is space available
 * return true on success
 */
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	/* same check as in sock_kmalloc() */
	if (filter_size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
		atomic_add(filter_size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}

bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	if (!refcount_inc_not_zero(&fp->refcnt))
		return false;

	if (!__sk_filter_charge(sk, fp)) {
		sk_filter_release(fp);
		return false;
	}
	return true;
}

static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
	struct sock_filter *old_prog;
	struct bpf_prog *old_fp;
	int err, new_len, old_len = fp->len;

	/* We are free to overwrite insns et al right here as it
	 * won't be used at this point in time anymore internally
	 * after the migration to the internal BPF instruction
	 * representation.
	 */
	BUILD_BUG_ON(sizeof(struct sock_filter) !=
		     sizeof(struct bpf_insn));

	/* Conversion cannot happen on overlapping memory areas,
	 * so we need to keep the user BPF around until the 2nd
	 * pass. At this time, the user BPF is stored in fp->insns.
	 */
	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
			   GFP_KERNEL | __GFP_NOWARN);
	if (!old_prog) {
		err = -ENOMEM;
		goto out_err;
	}

	/* 1st pass: calculate the new program length. */
	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
	if (err)
		goto out_err_free;

	/* Expand fp for appending the new filter representation. */
	old_fp = fp;
	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
	if (!fp) {
		/* The old_fp is still around in case we couldn't
		 * allocate new memory, so uncharge on that one.
		 */
		fp = old_fp;
		err = -ENOMEM;
		goto out_err_free;
	}

	fp->len = new_len;

	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
	err = bpf_convert_filter(old_prog, old_len, fp, &new_len);
	if (err)
		/* 2nd bpf_convert_filter() can fail only if it fails
		 * to allocate memory, remapping must succeed. Note,
		 * that at this time old_fp has already been released
		 * by krealloc().
		 */
		goto out_err_free;

	fp = bpf_prog_select_runtime(fp, &err);
	if (err)
		goto out_err_free;

	kfree(old_prog);
	return fp;

out_err_free:
	kfree(old_prog);
out_err:
	__bpf_prog_release(fp);
	return ERR_PTR(err);
}

static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
					   bpf_aux_classic_check_t trans)
{
	int err;

	fp->bpf_func = NULL;
	fp->jited = 0;

	err = bpf_check_classic(fp->insns, fp->len);
	if (err) {
		__bpf_prog_release(fp);
		return ERR_PTR(err);
	}

	/* There might be additional checks and transformations
	 * needed on classic filters, f.e. in case of seccomp.
	 */
	if (trans) {
		err = trans(fp->insns, fp->len);
		if (err) {
			__bpf_prog_release(fp);
			return ERR_PTR(err);
		}
	}

	/* Probe if we can JIT compile the filter and if so, do
	 * the compilation of the filter.
	 */
	bpf_jit_compile(fp);

	/* JIT compiler couldn't process this filter, so do the
	 * internal BPF translation for the optimized interpreter.
	 */
	if (!fp->jited)
		fp = bpf_migrate_filter(fp);

	return fp;
}

/**
 *	bpf_prog_create - create an unattached filter
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	memcpy(fp->insns, fprog->filter, fsize);

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, NULL);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);
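
/* Example (illustrative sketch, not part of the original file):
 * in-kernel users such as drivers can build an unattached filter
 * directly from a sock_fprog_kern. This sketch accepts packets
 * shorter than 256 bytes and drops the rest; the resulting program
 * is run via BPF_PROG_RUN() and freed with bpf_prog_destroy():
 *
 *	static struct sock_filter short_only[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_LEN, 0),
 *		BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, 256, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, 0),		// len >= 256: drop
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),	// else: accept
 *	};
 *	static struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(short_only),
 *		.filter	= short_only,
 *	};
 *	struct bpf_prog *prog;
 *
 *	if (bpf_prog_create(&prog, &fprog))
 *		return -EINVAL;
 */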

/**
 *	bpf_prog_create_from_user - create an unattached filter from user buffer
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *	@trans: post-classic verifier transformation handler
 *	@save_orig: save classic BPF program
 *
 * This function effectively does the same as bpf_prog_create(), only
 * that it builds up its insns buffer from a user space provided buffer.
 * It also allows for passing a bpf_aux_classic_check_t handler.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
			      bpf_aux_classic_check_t trans, bool save_orig)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;
	int err;

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	fp->orig_prog = NULL;

	if (save_orig) {
		err = bpf_prog_store_orig_filter(fp, fprog);
		if (err) {
			__bpf_prog_free(fp);
			return -ENOMEM;
		}
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;

	if (!__sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}
	refcount_set(&fp->refcnt, 1);

	old_fp = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}

static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct bpf_prog *old_prog;
	int err;

	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
		return -ENOMEM;

	if (sk_unhashed(sk) && sk->sk_reuseport) {
		err = reuseport_alloc(sk);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	old_prog = reuseport_attach_prog(sk, prog);
	if (old_prog)
		bpf_prog_destroy(old_prog);

	return 0;
}

static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	/* Make sure new filter is there and in the right amounts. */
	if (!bpf_check_basics_ok(fprog->filter, fprog->len))
		return ERR_PTR(-EINVAL);

	prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return ERR_PTR(-EFAULT);
	}

	prog->len = fprog->len;

	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return ERR_PTR(-ENOMEM);
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	return bpf_prepare_filter(prog, NULL);
}

/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}

static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
}

int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

struct bpf_scratchpad {
	union {
		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
		u8     buff[MAX_BPF_STACK];
	};
};

static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);

static inline int __bpf_try_make_writable(struct sk_buff *skb,
					  unsigned int write_len)
{
	return skb_ensure_writable(skb, write_len);
}

static inline int bpf_try_make_writable(struct sk_buff *skb,
					unsigned int write_len)
{
	int err = __bpf_try_make_writable(skb, write_len);

	bpf_compute_data_pointers(skb);
	return err;
}

static int bpf_try_make_head_writable(struct sk_buff *skb)
{
	return bpf_try_make_writable(skb, skb_headlen(skb));
}

static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
	if (skb_at_tc_ingress(skb))
		skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
	   const void *, from, u32, len, u64, flags)
{
	void *ptr;

	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
		return -EINVAL;
	if (unlikely(offset > 0xffff))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + len)))
		return -EFAULT;

	ptr = skb->data + offset;
	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpull_rcsum(skb, ptr, len, offset);

	memcpy(ptr, from, len);

	if (flags & BPF_F_RECOMPUTE_CSUM)
		__skb_postpush_rcsum(skb, ptr, len, offset);
	if (flags & BPF_F_INVALIDATE_HASH)
		skb_clear_hash(skb);

	return 0;
}

static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
	.func		= bpf_skb_store_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_MEM,
	.arg4_type	= ARG_CONST_SIZE,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
	   void *, to, u32, len)
{
	void *ptr;

	if (unlikely(offset > 0xffff))
		goto err_clear;

	ptr = skb_header_pointer(skb, offset, len, to);
	if (unlikely(!ptr))
		goto err_clear;
	if (ptr != to)
		memcpy(to, ptr, len);

	return 0;
err_clear:
	memset(to, 0, len);
	return -EFAULT;
}

static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
	.func		= bpf_skb_load_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};
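
/* Example (illustrative sketch, not part of the original file): from
 * the BPF program side these helpers are exposed as
 * bpf_skb_load_bytes() and bpf_skb_store_bytes(). A socket filter
 * could copy the IPv4 protocol byte out of a linear or non-linear skb
 * like this, written against the uapi helper prototypes:
 *
 *	__u8 proto;
 *
 *	if (bpf_skb_load_bytes(skb, ETH_HLEN +
 *			       offsetof(struct iphdr, protocol),
 *			       &proto, sizeof(proto)) < 0)
 *		return 0;	// helper failed, e.g. offset out of bounds
 *
 * On failure the destination buffer has been zeroed by the err_clear
 * path above, so the caller never sees uninitialized stack data.
 */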

BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
	/* Idea is the following: should the needed direct read/write
	 * test fail during runtime, we can pull in more data and redo
	 * again, since implicitly, we invalidate previous checks here.
	 *
	 * Or, since we know how much we need to make read/writeable,
	 * this can be done once at the program beginning for direct
	 * access case. By this we overcome limitations of only current
	 * headroom being accessible.
	 */
	return bpf_try_make_writable(skb, len ? : skb_headlen(skb));
}

static const struct bpf_func_proto bpf_skb_pull_data_proto = {
	.func		= bpf_skb_pull_data,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
	   u64, from, u64, to, u64, flags)
{
	__sum16 *ptr;

	if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely(offset > 0xffff || offset & 1))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
		return -EFAULT;

	ptr = (__sum16 *)(skb->data + offset);
	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 0:
		if (unlikely(from != 0))
			return -EINVAL;

		csum_replace_by_diff(ptr, to);
		break;
	case 2:
		csum_replace2(ptr, from, to);
		break;
	case 4:
		csum_replace4(ptr, from, to);
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
	.func		= bpf_l3_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
	   u64, from, u64, to, u64, flags)
{
	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
	bool do_mforce = flags & BPF_F_MARK_ENFORCE;
	__sum16 *ptr;

	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
			       BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely(offset > 0xffff || offset & 1))
		return -EFAULT;
	if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr))))
		return -EFAULT;

	ptr = (__sum16 *)(skb->data + offset);
	if (is_mmzero && !do_mforce && !*ptr)
		return 0;

	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 0:
		if (unlikely(from != 0))
			return -EINVAL;

		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
		break;
	case 2:
		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
		break;
	case 4:
		inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
		break;
	default:
		return -EINVAL;
	}

	if (is_mmzero && !*ptr)
		*ptr = CSUM_MANGLED_0;
	return 0;
}

static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
	.func		= bpf_l4_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
	   __be32 *, to, u32, to_size, __wsum, seed)
{
	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
	u32 diff_size = from_size + to_size;
	int i, j = 0;

	/* This is quite flexible, some examples:
	 *
	 *   from_size == 0, to_size > 0,  seed := csum --> pushing data
	 *   from_size > 0,  to_size == 0, seed := csum --> pulling data
	 *   from_size > 0,  to_size > 0,  seed := 0    --> diffing data
	 *
	 * Even for diffing, from_size and to_size don't need to be equal.
	 */
	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
		     diff_size > sizeof(sp->diff)))
		return -EINVAL;

	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
		sp->diff[j] = ~from[i];
	for (i = 0; i < to_size / sizeof(__be32); i++, j++)
		sp->diff[j] = to[i];

	return csum_partial(sp->diff, diff_size, seed);
}

static const struct bpf_func_proto bpf_csum_diff_proto = {
	.func		= bpf_csum_diff,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_MEM_OR_NULL,
	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
	/* The interface is to be used in combination with bpf_csum_diff()
	 * for direct packet writes. csum rotation for alignment as well
	 * as emulating csum_sub() can be done from the eBPF program.
	 */
	if (skb->ip_summed == CHECKSUM_COMPLETE)
		return (skb->csum = csum_add(skb->csum, csum));

	return -ENOTSUPP;
}

static const struct bpf_func_proto bpf_csum_update_proto = {
	.func		= bpf_csum_update,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};
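
/* Example (illustrative sketch, not part of the original file): the
 * intended pairing of the two helpers from a BPF program doing a
 * direct packet write, where old_ip and new_ip hold the 4-byte field
 * before and after the rewrite:
 *
 *	__wsum diff = bpf_csum_diff(&old_ip, 4, &new_ip, 4, 0);
 *
 *	bpf_skb_store_bytes(skb, off, &new_ip, 4, 0);	// mangle packet
 *	bpf_csum_update(skb, diff);	// fix skb->csum, CHECKSUM_COMPLETE
 *	bpf_l3_csum_replace(skb, csum_off, 0, diff, 0);	// fix header csum
 */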

static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
	return dev_forward_skb(dev, skb);
}

static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
				      struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->dev = dev;
		ret = netif_rx(skb);
	}

	return ret;
}

static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret;

	if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
		net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
		kfree_skb(skb);
		return -ENETDOWN;
	}

	skb->dev = dev;

	__this_cpu_inc(xmit_recursion);
	ret = dev_queue_xmit(skb);
	__this_cpu_dec(xmit_recursion);

	return ret;
}

static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	/* skb->mac_len is not set on normal egress */
	unsigned int mlen = skb->network_header - skb->mac_header;

	__skb_pull(skb, mlen);

	/* At ingress, the mac header has already been pulled once.
	 * At egress, skb_postpull_rcsum has to be done in case that
	 * the skb is originated from ingress (i.e. a forwarded skb)
	 * to ensure that rcsum starts at net header.
	 */
	if (!skb_at_tc_ingress(skb))
		skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
	skb_pop_mac_header(skb);
	skb_reset_mac_len(skb);
	return flags & BPF_F_INGRESS ?
	       __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb);
}

static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
				 u32 flags)
{
	/* Verify that a link layer header is carried */
	if (unlikely(skb->mac_header >= skb->network_header)) {
		kfree_skb(skb);
		return -ERANGE;
	}

	bpf_push_mac_rcsum(skb);
	return flags & BPF_F_INGRESS ?
	       __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb);
}

static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
			  u32 flags)
{
	if (dev_is_mac_header_xmit(dev))
		return __bpf_redirect_common(skb, dev, flags);
	else
		return __bpf_redirect_no_mac(skb, dev, flags);
}

BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
	struct net_device *dev;
	struct sk_buff *clone;
	int ret;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return -EINVAL;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
	if (unlikely(!dev))
		return -EINVAL;

	clone = skb_clone(skb, GFP_ATOMIC);
	if (unlikely(!clone))
		return -ENOMEM;

	/* For direct write, we need to keep the invariant that the skbs
	 * we're dealing with need to be uncloned. Should uncloning fail
	 * here, we need to free the just generated clone to unclone once
	 * again.
	 */
	ret = bpf_try_make_head_writable(skb);
	if (unlikely(ret)) {
		kfree_skb(clone);
		return -ENOMEM;
	}

	return __bpf_redirect(clone, dev, flags);
}

static const struct bpf_func_proto bpf_clone_redirect_proto = {
	.func		= bpf_clone_redirect,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

struct redirect_info {
	u32 ifindex;
	u32 flags;
	struct bpf_map *map;
	struct bpf_map *map_to_flush;
	unsigned long map_owner;
};

static DEFINE_PER_CPU(struct redirect_info, redirect_info);

BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return TC_ACT_SHOT;

	ri->ifindex = ifindex;
	ri->flags = flags;

	return TC_ACT_REDIRECT;
}

int skb_do_redirect(struct sk_buff *skb)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	struct net_device *dev;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
	ri->ifindex = 0;
	if (unlikely(!dev)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return __bpf_redirect(skb, dev, ri->flags);
}

static const struct bpf_func_proto bpf_redirect_proto = {
	.func		= bpf_redirect,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_ANYTHING,
};
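
/* Example (illustrative sketch, not part of the original file): a tc
 * classifier program using the non-cloning redirect. The helper only
 * records ifindex/flags in the per-CPU redirect_info and returns
 * TC_ACT_REDIRECT; the actual forwarding happens later when the
 * caller invokes skb_do_redirect(). SEC() is the usual libbpf
 * convention and 42 is a made-up target ifindex:
 *
 *	SEC("classifier")
 *	int redir(struct __sk_buff *skb)
 *	{
 *		// BPF_F_INGRESS instead of 0 would deliver to the
 *		// target device's ingress path.
 *		return bpf_redirect(42, 0);
 *	}
 */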

BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
	   struct bpf_map *, map, u32, key, u64, flags)
{
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	/* If user passes invalid input drop the packet. */
	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return SK_DROP;

	tcb->bpf.key = key;
	tcb->bpf.flags = flags;
	tcb->bpf.map = map;

	return SK_PASS;
}

struct sock *do_sk_redirect_map(struct sk_buff *skb)
{
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
	struct sock *sk = NULL;

	if (tcb->bpf.map) {
		sk = __sock_map_lookup_elem(tcb->bpf.map, tcb->bpf.key);

		tcb->bpf.key = 0;
		tcb->bpf.map = NULL;
	}

	return sk;
}

static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
	.func		= bpf_sk_redirect_map,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
	   struct bpf_map *, map, u32, key, u64, flags)
{
	/* If user passes invalid input drop the packet. */
	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return SK_DROP;

	msg->key = key;
	msg->flags = flags;
	msg->map = map;

	return SK_PASS;
}

struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
{
	struct sock *sk = NULL;

	if (msg->map) {
		sk = __sock_map_lookup_elem(msg->map, msg->key);

		msg->key = 0;
		msg->map = NULL;
	}

	return sk;
}

static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
	.func		= bpf_msg_redirect_map,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
{
	msg->apply_bytes = bytes;
	return 0;
}

static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
	.func		= bpf_msg_apply_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
{
	msg->cork_bytes = bytes;
	return 0;
}

static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
	.func		= bpf_msg_cork_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};

BPF_CALL_4(bpf_msg_pull_data,
	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
{
	unsigned int len = 0, offset = 0, copy = 0;
	struct scatterlist *sg = msg->sg_data;
	int first_sg, last_sg, i, shift;
	unsigned char *p, *to, *from;
	int bytes = end - start;
	struct page *page;

	if (unlikely(flags || end <= start))
		return -EINVAL;

	/* First find the starting scatterlist element */
	i = msg->sg_start;
	do {
		len = sg[i].length;
		offset += len;
		if (start < offset + len)
			break;
		i++;
		if (i == MAX_SKB_FRAGS)
			i = 0;
	} while (i != msg->sg_end);

	if (unlikely(start >= offset + len))
		return -EINVAL;

	if (!msg->sg_copy[i] && bytes <= len)
		goto out;

	first_sg = i;

	/* At this point we need to linearize multiple scatterlist
	 * elements or a single shared page. Either way we need to
	 * copy into a linear buffer exclusively owned by BPF. Then
	 * place the buffer in the scatterlist and fixup the original
	 * entries by removing the entries now in the linear buffer
	 * and shifting the remaining entries. For now we do not try
	 * to copy partial entries to avoid complexity of running out
	 * of sg_entry slots. The downside is reading a single byte
	 * will copy the entire sg entry.
	 */
	do {
		copy += sg[i].length;
		i++;
		if (i == MAX_SKB_FRAGS)
			i = 0;
		if (bytes < copy)
			break;
	} while (i != msg->sg_end);
	last_sg = i;

	if (unlikely(copy < end - start))
		return -EINVAL;

	page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC, get_order(copy));
	if (unlikely(!page))
		return -ENOMEM;
	p = page_address(page);
	offset = 0;

	i = first_sg;
	do {
		from = sg_virt(&sg[i]);
		len = sg[i].length;
		to = p + offset;

		memcpy(to, from, len);
		offset += len;
		sg[i].length = 0;
		put_page(sg_page(&sg[i]));

		i++;
		if (i == MAX_SKB_FRAGS)
			i = 0;
	} while (i != last_sg);

	sg[first_sg].length = copy;
	sg_set_page(&sg[first_sg], page, copy, 0);

	/* To repair sg ring we need to shift entries. If we only
	 * had a single entry though we can just replace it and
	 * be done. Otherwise walk the ring and shift the entries.
	 */
	shift = last_sg - first_sg - 1;
	if (!shift)
		goto out;

	i = first_sg + 1;
	do {
		int move_from;

		if (i + shift >= MAX_SKB_FRAGS)
			move_from = i + shift - MAX_SKB_FRAGS;
		else
			move_from = i + shift;

		if (move_from == msg->sg_end)
			break;

		sg[i] = sg[move_from];
		sg[move_from].length = 0;
		sg[move_from].page_link = 0;
		sg[move_from].offset = 0;

		i++;
		if (i == MAX_SKB_FRAGS)
			i = 0;
	} while (1);
	msg->sg_end -= shift;
	if (msg->sg_end < 0)
		msg->sg_end += MAX_SKB_FRAGS;
out:
	msg->data = sg_virt(&sg[i]) + start - offset;
	msg->data_end = msg->data + bytes;

	return 0;
}

static const struct bpf_func_proto bpf_msg_pull_data_proto = {
	.func		= bpf_msg_pull_data,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
};

BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
	return task_get_classid(skb);
}

static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
	.func		= bpf_get_cgroup_classid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
	return dst_tclassid(skb);
}

static const struct bpf_func_proto bpf_get_route_realm_proto = {
	.func		= bpf_get_route_realm,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
	/* If skb_clear_hash() was called due to mangling, we can
	 * trigger SW recalculation here. Later access to hash
	 * can then use the inline skb->hash via context directly
	 * instead of calling this helper again.
	 */
	return skb_get_hash(skb);
}

static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
	.func		= bpf_get_hash_recalc,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
	/* After all direct packet writes, this can be used once for
	 * triggering a lazy recalc on next skb_get_hash() invocation.
2135 */ 2136 skb_clear_hash(skb); 2137 return 0; 2138 } 2139 2140 static const struct bpf_func_proto bpf_set_hash_invalid_proto = { 2141 .func = bpf_set_hash_invalid, 2142 .gpl_only = false, 2143 .ret_type = RET_INTEGER, 2144 .arg1_type = ARG_PTR_TO_CTX, 2145 }; 2146 2147 BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) 2148 { 2149 /* Set user specified hash as L4(+), so that it gets returned 2150 * on skb_get_hash() call unless BPF prog later on triggers a 2151 * skb_clear_hash(). 2152 */ 2153 __skb_set_sw_hash(skb, hash, true); 2154 return 0; 2155 } 2156 2157 static const struct bpf_func_proto bpf_set_hash_proto = { 2158 .func = bpf_set_hash, 2159 .gpl_only = false, 2160 .ret_type = RET_INTEGER, 2161 .arg1_type = ARG_PTR_TO_CTX, 2162 .arg2_type = ARG_ANYTHING, 2163 }; 2164 2165 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, 2166 u16, vlan_tci) 2167 { 2168 int ret; 2169 2170 if (unlikely(vlan_proto != htons(ETH_P_8021Q) && 2171 vlan_proto != htons(ETH_P_8021AD))) 2172 vlan_proto = htons(ETH_P_8021Q); 2173 2174 bpf_push_mac_rcsum(skb); 2175 ret = skb_vlan_push(skb, vlan_proto, vlan_tci); 2176 bpf_pull_mac_rcsum(skb); 2177 2178 bpf_compute_data_pointers(skb); 2179 return ret; 2180 } 2181 2182 const struct bpf_func_proto bpf_skb_vlan_push_proto = { 2183 .func = bpf_skb_vlan_push, 2184 .gpl_only = false, 2185 .ret_type = RET_INTEGER, 2186 .arg1_type = ARG_PTR_TO_CTX, 2187 .arg2_type = ARG_ANYTHING, 2188 .arg3_type = ARG_ANYTHING, 2189 }; 2190 EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); 2191 2192 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) 2193 { 2194 int ret; 2195 2196 bpf_push_mac_rcsum(skb); 2197 ret = skb_vlan_pop(skb); 2198 bpf_pull_mac_rcsum(skb); 2199 2200 bpf_compute_data_pointers(skb); 2201 return ret; 2202 } 2203 2204 const struct bpf_func_proto bpf_skb_vlan_pop_proto = { 2205 .func = bpf_skb_vlan_pop, 2206 .gpl_only = false, 2207 .ret_type = RET_INTEGER, 2208 .arg1_type = ARG_PTR_TO_CTX, 2209 }; 2210 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); 2211 2212 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 2213 { 2214 /* Caller already did skb_cow() with len as headroom, 2215 * so no need to do it here. 2216 */ 2217 skb_push(skb, len); 2218 memmove(skb->data, skb->data + len, off); 2219 memset(skb->data + off, 0, len); 2220 2221 /* No skb_postpush_rcsum(skb, skb->data + off, len) 2222 * needed here as it does not change the skb->csum 2223 * result for checksum complete when summing over 2224 * zeroed blocks. 2225 */ 2226 return 0; 2227 } 2228 2229 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) 2230 { 2231 /* skb_ensure_writable() is not needed here, as we're 2232 * already working on an uncloned skb. 2233 */ 2234 if (unlikely(!pskb_may_pull(skb, off + len))) 2235 return -ENOMEM; 2236 2237 skb_postpull_rcsum(skb, skb->data + off, len); 2238 memmove(skb->data + len, skb->data, off); 2239 __skb_pull(skb, len); 2240 2241 return 0; 2242 } 2243 2244 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) 2245 { 2246 bool trans_same = skb->transport_header == skb->network_header; 2247 int ret; 2248 2249 /* There's no need for __skb_push()/__skb_pull() pair to 2250 * get to the start of the mac header as we're guaranteed 2251 * to always start from here under eBPF. 
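	 *
	 * (bpf_skb_generic_push() opens a len byte gap at offset off by
	 * growing headroom and moving the first off bytes, i.e. the mac
	 * header, to the new start, roughly:
	 *
	 *	before: | mac | network hdr | payload |
	 *	after:  | mac | zeroed gap  | network hdr | payload |
	 *
	 * so only the header offsets need fixing up afterwards.)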
2252 */ 2253 ret = bpf_skb_generic_push(skb, off, len); 2254 if (likely(!ret)) { 2255 skb->mac_header -= len; 2256 skb->network_header -= len; 2257 if (trans_same) 2258 skb->transport_header = skb->network_header; 2259 } 2260 2261 return ret; 2262 } 2263 2264 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) 2265 { 2266 bool trans_same = skb->transport_header == skb->network_header; 2267 int ret; 2268 2269 /* Same here, __skb_push()/__skb_pull() pair not needed. */ 2270 ret = bpf_skb_generic_pop(skb, off, len); 2271 if (likely(!ret)) { 2272 skb->mac_header += len; 2273 skb->network_header += len; 2274 if (trans_same) 2275 skb->transport_header = skb->network_header; 2276 } 2277 2278 return ret; 2279 } 2280 2281 static int bpf_skb_proto_4_to_6(struct sk_buff *skb) 2282 { 2283 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 2284 u32 off = skb_mac_header_len(skb); 2285 int ret; 2286 2287 /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ 2288 if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) 2289 return -ENOTSUPP; 2290 2291 ret = skb_cow(skb, len_diff); 2292 if (unlikely(ret < 0)) 2293 return ret; 2294 2295 ret = bpf_skb_net_hdr_push(skb, off, len_diff); 2296 if (unlikely(ret < 0)) 2297 return ret; 2298 2299 if (skb_is_gso(skb)) { 2300 struct skb_shared_info *shinfo = skb_shinfo(skb); 2301 2302 /* SKB_GSO_TCPV4 needs to be changed into 2303 * SKB_GSO_TCPV6. 2304 */ 2305 if (shinfo->gso_type & SKB_GSO_TCPV4) { 2306 shinfo->gso_type &= ~SKB_GSO_TCPV4; 2307 shinfo->gso_type |= SKB_GSO_TCPV6; 2308 } 2309 2310 /* Due to IPv6 header, MSS needs to be downgraded. */ 2311 skb_decrease_gso_size(shinfo, len_diff); 2312 /* Header must be checked, and gso_segs recomputed. */ 2313 shinfo->gso_type |= SKB_GSO_DODGY; 2314 shinfo->gso_segs = 0; 2315 } 2316 2317 skb->protocol = htons(ETH_P_IPV6); 2318 skb_clear_hash(skb); 2319 2320 return 0; 2321 } 2322 2323 static int bpf_skb_proto_6_to_4(struct sk_buff *skb) 2324 { 2325 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 2326 u32 off = skb_mac_header_len(skb); 2327 int ret; 2328 2329 /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ 2330 if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) 2331 return -ENOTSUPP; 2332 2333 ret = skb_unclone(skb, GFP_ATOMIC); 2334 if (unlikely(ret < 0)) 2335 return ret; 2336 2337 ret = bpf_skb_net_hdr_pop(skb, off, len_diff); 2338 if (unlikely(ret < 0)) 2339 return ret; 2340 2341 if (skb_is_gso(skb)) { 2342 struct skb_shared_info *shinfo = skb_shinfo(skb); 2343 2344 /* SKB_GSO_TCPV6 needs to be changed into 2345 * SKB_GSO_TCPV4. 2346 */ 2347 if (shinfo->gso_type & SKB_GSO_TCPV6) { 2348 shinfo->gso_type &= ~SKB_GSO_TCPV6; 2349 shinfo->gso_type |= SKB_GSO_TCPV4; 2350 } 2351 2352 /* Due to IPv4 header, MSS can be upgraded. */ 2353 skb_increase_gso_size(shinfo, len_diff); 2354 /* Header must be checked, and gso_segs recomputed. 
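	 * Setting gso_segs to zero defers the segment recount to the
	 * GSO layer, and SKB_GSO_DODGY makes the stack revalidate the
	 * rewritten headers before segmentation.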
*/ 2355 shinfo->gso_type |= SKB_GSO_DODGY; 2356 shinfo->gso_segs = 0; 2357 } 2358 2359 skb->protocol = htons(ETH_P_IP); 2360 skb_clear_hash(skb); 2361 2362 return 0; 2363 } 2364 2365 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) 2366 { 2367 __be16 from_proto = skb->protocol; 2368 2369 if (from_proto == htons(ETH_P_IP) && 2370 to_proto == htons(ETH_P_IPV6)) 2371 return bpf_skb_proto_4_to_6(skb); 2372 2373 if (from_proto == htons(ETH_P_IPV6) && 2374 to_proto == htons(ETH_P_IP)) 2375 return bpf_skb_proto_6_to_4(skb); 2376 2377 return -ENOTSUPP; 2378 } 2379 2380 BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, 2381 u64, flags) 2382 { 2383 int ret; 2384 2385 if (unlikely(flags)) 2386 return -EINVAL; 2387 2388 /* General idea is that this helper does the basic groundwork 2389 * needed for changing the protocol, and eBPF program fills the 2390 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() 2391 * and other helpers, rather than passing a raw buffer here. 2392 * 2393 * The rationale is to keep this minimal and without a need to 2394 * deal with raw packet data. F.e. even if we would pass buffers 2395 * here, the program still needs to call the bpf_lX_csum_replace() 2396 * helpers anyway. Plus, this way we keep also separation of 2397 * concerns, since f.e. bpf_skb_store_bytes() should only take 2398 * care of stores. 2399 * 2400 * Currently, additional options and extension header space are 2401 * not supported, but flags register is reserved so we can adapt 2402 * that. For offloads, we mark packet as dodgy, so that headers 2403 * need to be verified first. 2404 */ 2405 ret = bpf_skb_proto_xlat(skb, proto); 2406 bpf_compute_data_pointers(skb); 2407 return ret; 2408 } 2409 2410 static const struct bpf_func_proto bpf_skb_change_proto_proto = { 2411 .func = bpf_skb_change_proto, 2412 .gpl_only = false, 2413 .ret_type = RET_INTEGER, 2414 .arg1_type = ARG_PTR_TO_CTX, 2415 .arg2_type = ARG_ANYTHING, 2416 .arg3_type = ARG_ANYTHING, 2417 }; 2418 2419 BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) 2420 { 2421 /* We only allow a restricted subset to be changed for now. */ 2422 if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || 2423 !skb_pkt_type_ok(pkt_type))) 2424 return -EINVAL; 2425 2426 skb->pkt_type = pkt_type; 2427 return 0; 2428 } 2429 2430 static const struct bpf_func_proto bpf_skb_change_type_proto = { 2431 .func = bpf_skb_change_type, 2432 .gpl_only = false, 2433 .ret_type = RET_INTEGER, 2434 .arg1_type = ARG_PTR_TO_CTX, 2435 .arg2_type = ARG_ANYTHING, 2436 }; 2437 2438 static u32 bpf_skb_net_base_len(const struct sk_buff *skb) 2439 { 2440 switch (skb->protocol) { 2441 case htons(ETH_P_IP): 2442 return sizeof(struct iphdr); 2443 case htons(ETH_P_IPV6): 2444 return sizeof(struct ipv6hdr); 2445 default: 2446 return ~0U; 2447 } 2448 } 2449 2450 static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) 2451 { 2452 u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); 2453 int ret; 2454 2455 /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ 2456 if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) 2457 return -ENOTSUPP; 2458 2459 ret = skb_cow(skb, len_diff); 2460 if (unlikely(ret < 0)) 2461 return ret; 2462 2463 ret = bpf_skb_net_hdr_push(skb, off, len_diff); 2464 if (unlikely(ret < 0)) 2465 return ret; 2466 2467 if (skb_is_gso(skb)) { 2468 struct skb_shared_info *shinfo = skb_shinfo(skb); 2469 2470 /* Due to header grow, MSS needs to be downgraded. 
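	 * Otherwise each resulting segment would regrow by len_diff
	 * bytes of header and could exceed the egress MTU.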
*/ 2471 skb_decrease_gso_size(shinfo, len_diff); 2472 /* Header must be checked, and gso_segs recomputed. */ 2473 shinfo->gso_type |= SKB_GSO_DODGY; 2474 shinfo->gso_segs = 0; 2475 } 2476 2477 return 0; 2478 } 2479 2480 static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) 2481 { 2482 u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); 2483 int ret; 2484 2485 /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ 2486 if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) 2487 return -ENOTSUPP; 2488 2489 ret = skb_unclone(skb, GFP_ATOMIC); 2490 if (unlikely(ret < 0)) 2491 return ret; 2492 2493 ret = bpf_skb_net_hdr_pop(skb, off, len_diff); 2494 if (unlikely(ret < 0)) 2495 return ret; 2496 2497 if (skb_is_gso(skb)) { 2498 struct skb_shared_info *shinfo = skb_shinfo(skb); 2499 2500 /* Due to header shrink, MSS can be upgraded. */ 2501 skb_increase_gso_size(shinfo, len_diff); 2502 /* Header must be checked, and gso_segs recomputed. */ 2503 shinfo->gso_type |= SKB_GSO_DODGY; 2504 shinfo->gso_segs = 0; 2505 } 2506 2507 return 0; 2508 } 2509 2510 static u32 __bpf_skb_max_len(const struct sk_buff *skb) 2511 { 2512 return skb->dev->mtu + skb->dev->hard_header_len; 2513 } 2514 2515 static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) 2516 { 2517 bool trans_same = skb->transport_header == skb->network_header; 2518 u32 len_cur, len_diff_abs = abs(len_diff); 2519 u32 len_min = bpf_skb_net_base_len(skb); 2520 u32 len_max = __bpf_skb_max_len(skb); 2521 __be16 proto = skb->protocol; 2522 bool shrink = len_diff < 0; 2523 int ret; 2524 2525 if (unlikely(len_diff_abs > 0xfffU)) 2526 return -EFAULT; 2527 if (unlikely(proto != htons(ETH_P_IP) && 2528 proto != htons(ETH_P_IPV6))) 2529 return -ENOTSUPP; 2530 2531 len_cur = skb->len - skb_network_offset(skb); 2532 if (skb_transport_header_was_set(skb) && !trans_same) 2533 len_cur = skb_network_header_len(skb); 2534 if ((shrink && (len_diff_abs >= len_cur || 2535 len_cur - len_diff_abs < len_min)) || 2536 (!shrink && (skb->len + len_diff_abs > len_max && 2537 !skb_is_gso(skb)))) 2538 return -ENOTSUPP; 2539 2540 ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) : 2541 bpf_skb_net_grow(skb, len_diff_abs); 2542 2543 bpf_compute_data_pointers(skb); 2544 return ret; 2545 } 2546 2547 BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, 2548 u32, mode, u64, flags) 2549 { 2550 if (unlikely(flags)) 2551 return -EINVAL; 2552 if (likely(mode == BPF_ADJ_ROOM_NET)) 2553 return bpf_skb_adjust_net(skb, len_diff); 2554 2555 return -ENOTSUPP; 2556 } 2557 2558 static const struct bpf_func_proto bpf_skb_adjust_room_proto = { 2559 .func = bpf_skb_adjust_room, 2560 .gpl_only = false, 2561 .ret_type = RET_INTEGER, 2562 .arg1_type = ARG_PTR_TO_CTX, 2563 .arg2_type = ARG_ANYTHING, 2564 .arg3_type = ARG_ANYTHING, 2565 .arg4_type = ARG_ANYTHING, 2566 }; 2567 2568 static u32 __bpf_skb_min_len(const struct sk_buff *skb) 2569 { 2570 u32 min_len = skb_network_offset(skb); 2571 2572 if (skb_transport_header_was_set(skb)) 2573 min_len = skb_transport_offset(skb); 2574 if (skb->ip_summed == CHECKSUM_PARTIAL) 2575 min_len = skb_checksum_start_offset(skb) + 2576 skb->csum_offset + sizeof(__sum16); 2577 return min_len; 2578 } 2579 2580 static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) 2581 { 2582 unsigned int old_len = skb->len; 2583 int ret; 2584 2585 ret = __skb_grow_rcsum(skb, new_len); 2586 if (!ret) 2587 memset(skb->data + old_len, 0, new_len - old_len); 2588 return ret; 2589 } 2590 2591 static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) 2592 { 2593 return __skb_trim_rcsum(skb, new_len); 2594 } 2595 2596 BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, 2597 u64, flags) 2598 { 2599 u32 max_len = __bpf_skb_max_len(skb); 2600 u32 min_len = __bpf_skb_min_len(skb); 2601 int ret; 2602 2603 if (unlikely(flags || new_len > max_len || new_len < min_len)) 2604 return -EINVAL; 2605 if (skb->encapsulation) 2606 return -ENOTSUPP; 2607 2608 /* The basic idea of this helper is that it's performing the 2609 * needed work to either grow or trim an skb, and eBPF program 2610 * rewrites the rest via helpers like bpf_skb_store_bytes(), 2611 * bpf_lX_csum_replace() and others rather than passing a raw 2612 * buffer here. This one is a slow path helper and intended 2613 * for replies with control messages. 2614 * 2615 * Like in bpf_skb_change_proto(), we want to keep this rather 2616 * minimal and without protocol specifics so that we are able 2617 * to separate concerns as in bpf_skb_store_bytes() should only 2618 * be the one responsible for writing buffers. 2619 * 2620 * It's really expected to be a slow path operation here for 2621 * control message replies, so we're implicitly linearizing, 2622 * uncloning and drop offloads from the skb by this. 
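	 *
	 * A sketch of the intended usage, e.g. for generating a small
	 * control reply in place (struct and sizes purely illustrative):
	 *
	 *	struct reply r = { ... };
	 *
	 *	bpf_skb_change_tail(skb, sizeof(r), 0);
	 *	bpf_skb_store_bytes(skb, 0, &r, sizeof(r), 0);
	 *
	 * Any bytes grown at the tail arrive already zeroed, see
	 * bpf_skb_grow_rcsum().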
 */
	ret = __bpf_try_make_writable(skb, skb->len);
	if (!ret) {
		if (new_len > skb->len)
			ret = bpf_skb_grow_rcsum(skb, new_len);
		else if (new_len < skb->len)
			ret = bpf_skb_trim_rcsum(skb, new_len);
		if (!ret && skb_is_gso(skb))
			skb_gso_reset(skb);
	}

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_tail_proto = {
	.func		= bpf_skb_change_tail,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
	   u64, flags)
{
	u32 max_len = __bpf_skb_max_len(skb);
	u32 new_len = skb->len + head_room;
	int ret;

	if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
		     new_len < skb->len))
		return -EINVAL;

	ret = skb_cow(skb, head_room);
	if (likely(!ret)) {
		/* The idea for this helper is that we currently only
		 * allow expanding on the mac header. This means that
		 * skb->protocol, network header, etc, stay as is.
		 * Compared to bpf_skb_change_tail(), we're more
		 * flexible due to not needing to linearize or
		 * reset GSO. Intention for this helper is to be
		 * used by an L3 skb that needs to push mac header
		 * for redirection into L2 device.
		 */
		__skb_push(skb, head_room);
		memset(skb->data, 0, head_room);
		skb_reset_mac_header(skb);
	}

	bpf_compute_data_pointers(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_head_proto = {
	.func		= bpf_skb_change_head,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
	return xdp_data_meta_unsupported(xdp) ?
0 : 2690 xdp->data - xdp->data_meta; 2691 } 2692 2693 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) 2694 { 2695 unsigned long metalen = xdp_get_metalen(xdp); 2696 void *data_start = xdp->data_hard_start + metalen; 2697 void *data = xdp->data + offset; 2698 2699 if (unlikely(data < data_start || 2700 data > xdp->data_end - ETH_HLEN)) 2701 return -EINVAL; 2702 2703 if (metalen) 2704 memmove(xdp->data_meta + offset, 2705 xdp->data_meta, metalen); 2706 xdp->data_meta += offset; 2707 xdp->data = data; 2708 2709 return 0; 2710 } 2711 2712 static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { 2713 .func = bpf_xdp_adjust_head, 2714 .gpl_only = false, 2715 .ret_type = RET_INTEGER, 2716 .arg1_type = ARG_PTR_TO_CTX, 2717 .arg2_type = ARG_ANYTHING, 2718 }; 2719 2720 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) 2721 { 2722 void *meta = xdp->data_meta + offset; 2723 unsigned long metalen = xdp->data - meta; 2724 2725 if (xdp_data_meta_unsupported(xdp)) 2726 return -ENOTSUPP; 2727 if (unlikely(meta < xdp->data_hard_start || 2728 meta > xdp->data)) 2729 return -EINVAL; 2730 if (unlikely((metalen & (sizeof(__u32) - 1)) || 2731 (metalen > 32))) 2732 return -EACCES; 2733 2734 xdp->data_meta = meta; 2735 2736 return 0; 2737 } 2738 2739 static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { 2740 .func = bpf_xdp_adjust_meta, 2741 .gpl_only = false, 2742 .ret_type = RET_INTEGER, 2743 .arg1_type = ARG_PTR_TO_CTX, 2744 .arg2_type = ARG_ANYTHING, 2745 }; 2746 2747 static int __bpf_tx_xdp(struct net_device *dev, 2748 struct bpf_map *map, 2749 struct xdp_buff *xdp, 2750 u32 index) 2751 { 2752 int err; 2753 2754 if (!dev->netdev_ops->ndo_xdp_xmit) { 2755 return -EOPNOTSUPP; 2756 } 2757 2758 err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); 2759 if (err) 2760 return err; 2761 dev->netdev_ops->ndo_xdp_flush(dev); 2762 return 0; 2763 } 2764 2765 static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, 2766 struct bpf_map *map, 2767 struct xdp_buff *xdp, 2768 u32 index) 2769 { 2770 int err; 2771 2772 if (map->map_type == BPF_MAP_TYPE_DEVMAP) { 2773 struct net_device *dev = fwd; 2774 2775 if (!dev->netdev_ops->ndo_xdp_xmit) 2776 return -EOPNOTSUPP; 2777 2778 err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp); 2779 if (err) 2780 return err; 2781 __dev_map_insert_ctx(map, index); 2782 2783 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { 2784 struct bpf_cpu_map_entry *rcpu = fwd; 2785 2786 err = cpu_map_enqueue(rcpu, xdp, dev_rx); 2787 if (err) 2788 return err; 2789 __cpu_map_insert_ctx(map, index); 2790 } 2791 return 0; 2792 } 2793 2794 void xdp_do_flush_map(void) 2795 { 2796 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2797 struct bpf_map *map = ri->map_to_flush; 2798 2799 ri->map_to_flush = NULL; 2800 if (map) { 2801 switch (map->map_type) { 2802 case BPF_MAP_TYPE_DEVMAP: 2803 __dev_map_flush(map); 2804 break; 2805 case BPF_MAP_TYPE_CPUMAP: 2806 __cpu_map_flush(map); 2807 break; 2808 default: 2809 break; 2810 } 2811 } 2812 } 2813 EXPORT_SYMBOL_GPL(xdp_do_flush_map); 2814 2815 static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) 2816 { 2817 switch (map->map_type) { 2818 case BPF_MAP_TYPE_DEVMAP: 2819 return __dev_map_lookup_elem(map, index); 2820 case BPF_MAP_TYPE_CPUMAP: 2821 return __cpu_map_lookup_elem(map, index); 2822 default: 2823 return NULL; 2824 } 2825 } 2826 2827 static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog, 2828 unsigned long aux) 2829 { 2830 return (unsigned long)xdp_prog->aux != aux; 2831 } 
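/* The XDP redirect fast path is split in two halves: the
 * bpf_redirect{,_map}() helpers only stash the target in the per-CPU
 * redirect_info and return XDP_REDIRECT, and the driver then calls
 * xdp_do_redirect() from its receive loop once the program has
 * returned. A minimal program using a devmap could look like this
 * (a sketch in classic bpf_map_def style; map and section names are
 * made up):
 *
 *	struct bpf_map_def SEC("maps") tx_port = {
 *		.type		= BPF_MAP_TYPE_DEVMAP,
 *		.key_size	= sizeof(u32),
 *		.value_size	= sizeof(int),
 *		.max_entries	= 1,
 *	};
 *
 *	SEC("xdp")
 *	int xdp_redirect_prog(struct xdp_md *ctx)
 *	{
 *		return bpf_redirect_map(&tx_port, 0, 0);
 *	}
 *
 * Successive redirects to the same map are batched via ri->map_to_flush
 * and flushed once per NAPI cycle through xdp_do_flush_map().
 */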
2832 2833 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, 2834 struct bpf_prog *xdp_prog) 2835 { 2836 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2837 unsigned long map_owner = ri->map_owner; 2838 struct bpf_map *map = ri->map; 2839 u32 index = ri->ifindex; 2840 void *fwd = NULL; 2841 int err; 2842 2843 ri->ifindex = 0; 2844 ri->map = NULL; 2845 ri->map_owner = 0; 2846 2847 if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { 2848 err = -EFAULT; 2849 map = NULL; 2850 goto err; 2851 } 2852 2853 fwd = __xdp_map_lookup_elem(map, index); 2854 if (!fwd) { 2855 err = -EINVAL; 2856 goto err; 2857 } 2858 if (ri->map_to_flush && ri->map_to_flush != map) 2859 xdp_do_flush_map(); 2860 2861 err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index); 2862 if (unlikely(err)) 2863 goto err; 2864 2865 ri->map_to_flush = map; 2866 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); 2867 return 0; 2868 err: 2869 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); 2870 return err; 2871 } 2872 2873 int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, 2874 struct bpf_prog *xdp_prog) 2875 { 2876 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2877 struct net_device *fwd; 2878 u32 index = ri->ifindex; 2879 int err; 2880 2881 if (ri->map) 2882 return xdp_do_redirect_map(dev, xdp, xdp_prog); 2883 2884 fwd = dev_get_by_index_rcu(dev_net(dev), index); 2885 ri->ifindex = 0; 2886 if (unlikely(!fwd)) { 2887 err = -EINVAL; 2888 goto err; 2889 } 2890 2891 err = __bpf_tx_xdp(fwd, NULL, xdp, 0); 2892 if (unlikely(err)) 2893 goto err; 2894 2895 _trace_xdp_redirect(dev, xdp_prog, index); 2896 return 0; 2897 err: 2898 _trace_xdp_redirect_err(dev, xdp_prog, index, err); 2899 return err; 2900 } 2901 EXPORT_SYMBOL_GPL(xdp_do_redirect); 2902 2903 static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) 2904 { 2905 unsigned int len; 2906 2907 if (unlikely(!(fwd->flags & IFF_UP))) 2908 return -ENETDOWN; 2909 2910 len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; 2911 if (skb->len > len) 2912 return -EMSGSIZE; 2913 2914 return 0; 2915 } 2916 2917 static int xdp_do_generic_redirect_map(struct net_device *dev, 2918 struct sk_buff *skb, 2919 struct bpf_prog *xdp_prog) 2920 { 2921 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2922 unsigned long map_owner = ri->map_owner; 2923 struct bpf_map *map = ri->map; 2924 struct net_device *fwd = NULL; 2925 u32 index = ri->ifindex; 2926 int err = 0; 2927 2928 ri->ifindex = 0; 2929 ri->map = NULL; 2930 ri->map_owner = 0; 2931 2932 if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) { 2933 err = -EFAULT; 2934 map = NULL; 2935 goto err; 2936 } 2937 fwd = __xdp_map_lookup_elem(map, index); 2938 if (unlikely(!fwd)) { 2939 err = -EINVAL; 2940 goto err; 2941 } 2942 2943 if (map->map_type == BPF_MAP_TYPE_DEVMAP) { 2944 if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) 2945 goto err; 2946 skb->dev = fwd; 2947 } else { 2948 /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ 2949 err = -EBADRQC; 2950 goto err; 2951 } 2952 2953 _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); 2954 return 0; 2955 err: 2956 _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); 2957 return err; 2958 } 2959 2960 int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, 2961 struct bpf_prog *xdp_prog) 2962 { 2963 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2964 u32 index = ri->ifindex; 2965 struct net_device *fwd; 2966 int err = 0; 2967 2968 if (ri->map) 2969 return 
xdp_do_generic_redirect_map(dev, skb, xdp_prog); 2970 2971 ri->ifindex = 0; 2972 fwd = dev_get_by_index_rcu(dev_net(dev), index); 2973 if (unlikely(!fwd)) { 2974 err = -EINVAL; 2975 goto err; 2976 } 2977 2978 if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd)))) 2979 goto err; 2980 2981 skb->dev = fwd; 2982 _trace_xdp_redirect(dev, xdp_prog, index); 2983 return 0; 2984 err: 2985 _trace_xdp_redirect_err(dev, xdp_prog, index, err); 2986 return err; 2987 } 2988 EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); 2989 2990 BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) 2991 { 2992 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 2993 2994 if (unlikely(flags)) 2995 return XDP_ABORTED; 2996 2997 ri->ifindex = ifindex; 2998 ri->flags = flags; 2999 ri->map = NULL; 3000 ri->map_owner = 0; 3001 3002 return XDP_REDIRECT; 3003 } 3004 3005 static const struct bpf_func_proto bpf_xdp_redirect_proto = { 3006 .func = bpf_xdp_redirect, 3007 .gpl_only = false, 3008 .ret_type = RET_INTEGER, 3009 .arg1_type = ARG_ANYTHING, 3010 .arg2_type = ARG_ANYTHING, 3011 }; 3012 3013 BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags, 3014 unsigned long, map_owner) 3015 { 3016 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 3017 3018 if (unlikely(flags)) 3019 return XDP_ABORTED; 3020 3021 ri->ifindex = ifindex; 3022 ri->flags = flags; 3023 ri->map = map; 3024 ri->map_owner = map_owner; 3025 3026 return XDP_REDIRECT; 3027 } 3028 3029 /* Note, arg4 is hidden from users and populated by the verifier 3030 * with the right pointer. 3031 */ 3032 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { 3033 .func = bpf_xdp_redirect_map, 3034 .gpl_only = false, 3035 .ret_type = RET_INTEGER, 3036 .arg1_type = ARG_CONST_MAP_PTR, 3037 .arg2_type = ARG_ANYTHING, 3038 .arg3_type = ARG_ANYTHING, 3039 }; 3040 3041 bool bpf_helper_changes_pkt_data(void *func) 3042 { 3043 if (func == bpf_skb_vlan_push || 3044 func == bpf_skb_vlan_pop || 3045 func == bpf_skb_store_bytes || 3046 func == bpf_skb_change_proto || 3047 func == bpf_skb_change_head || 3048 func == bpf_skb_change_tail || 3049 func == bpf_skb_adjust_room || 3050 func == bpf_skb_pull_data || 3051 func == bpf_clone_redirect || 3052 func == bpf_l3_csum_replace || 3053 func == bpf_l4_csum_replace || 3054 func == bpf_xdp_adjust_head || 3055 func == bpf_xdp_adjust_meta || 3056 func == bpf_msg_pull_data) 3057 return true; 3058 3059 return false; 3060 } 3061 3062 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, 3063 unsigned long off, unsigned long len) 3064 { 3065 void *ptr = skb_header_pointer(skb, off, len, dst_buff); 3066 3067 if (unlikely(!ptr)) 3068 return len; 3069 if (ptr != dst_buff) 3070 memcpy(dst_buff, ptr, len); 3071 3072 return 0; 3073 } 3074 3075 BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, 3076 u64, flags, void *, meta, u64, meta_size) 3077 { 3078 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 3079 3080 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 3081 return -EINVAL; 3082 if (unlikely(skb_size > skb->len)) 3083 return -EFAULT; 3084 3085 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, 3086 bpf_skb_copy); 3087 } 3088 3089 static const struct bpf_func_proto bpf_skb_event_output_proto = { 3090 .func = bpf_skb_event_output, 3091 .gpl_only = true, 3092 .ret_type = RET_INTEGER, 3093 .arg1_type = ARG_PTR_TO_CTX, 3094 .arg2_type = ARG_CONST_MAP_PTR, 3095 .arg3_type = ARG_ANYTHING, 3096 .arg4_type = ARG_PTR_TO_MEM, 3097 .arg5_type = 
ARG_CONST_SIZE_OR_ZERO, 3098 }; 3099 3100 static unsigned short bpf_tunnel_key_af(u64 flags) 3101 { 3102 return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; 3103 } 3104 3105 BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, 3106 u32, size, u64, flags) 3107 { 3108 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 3109 u8 compat[sizeof(struct bpf_tunnel_key)]; 3110 void *to_orig = to; 3111 int err; 3112 3113 if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { 3114 err = -EINVAL; 3115 goto err_clear; 3116 } 3117 if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { 3118 err = -EPROTO; 3119 goto err_clear; 3120 } 3121 if (unlikely(size != sizeof(struct bpf_tunnel_key))) { 3122 err = -EINVAL; 3123 switch (size) { 3124 case offsetof(struct bpf_tunnel_key, tunnel_label): 3125 case offsetof(struct bpf_tunnel_key, tunnel_ext): 3126 goto set_compat; 3127 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): 3128 /* Fixup deprecated structure layouts here, so we have 3129 * a common path later on. 3130 */ 3131 if (ip_tunnel_info_af(info) != AF_INET) 3132 goto err_clear; 3133 set_compat: 3134 to = (struct bpf_tunnel_key *)compat; 3135 break; 3136 default: 3137 goto err_clear; 3138 } 3139 } 3140 3141 to->tunnel_id = be64_to_cpu(info->key.tun_id); 3142 to->tunnel_tos = info->key.tos; 3143 to->tunnel_ttl = info->key.ttl; 3144 3145 if (flags & BPF_F_TUNINFO_IPV6) { 3146 memcpy(to->remote_ipv6, &info->key.u.ipv6.src, 3147 sizeof(to->remote_ipv6)); 3148 to->tunnel_label = be32_to_cpu(info->key.label); 3149 } else { 3150 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); 3151 } 3152 3153 if (unlikely(size != sizeof(struct bpf_tunnel_key))) 3154 memcpy(to_orig, to, size); 3155 3156 return 0; 3157 err_clear: 3158 memset(to_orig, 0, size); 3159 return err; 3160 } 3161 3162 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { 3163 .func = bpf_skb_get_tunnel_key, 3164 .gpl_only = false, 3165 .ret_type = RET_INTEGER, 3166 .arg1_type = ARG_PTR_TO_CTX, 3167 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 3168 .arg3_type = ARG_CONST_SIZE, 3169 .arg4_type = ARG_ANYTHING, 3170 }; 3171 3172 BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) 3173 { 3174 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 3175 int err; 3176 3177 if (unlikely(!info || 3178 !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { 3179 err = -ENOENT; 3180 goto err_clear; 3181 } 3182 if (unlikely(size < info->options_len)) { 3183 err = -ENOMEM; 3184 goto err_clear; 3185 } 3186 3187 ip_tunnel_info_opts_get(to, info); 3188 if (size > info->options_len) 3189 memset(to + info->options_len, 0, size - info->options_len); 3190 3191 return info->options_len; 3192 err_clear: 3193 memset(to, 0, size); 3194 return err; 3195 } 3196 3197 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { 3198 .func = bpf_skb_get_tunnel_opt, 3199 .gpl_only = false, 3200 .ret_type = RET_INTEGER, 3201 .arg1_type = ARG_PTR_TO_CTX, 3202 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 3203 .arg3_type = ARG_CONST_SIZE, 3204 }; 3205 3206 static struct metadata_dst __percpu *md_dst; 3207 3208 BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, 3209 const struct bpf_tunnel_key *, from, u32, size, u64, flags) 3210 { 3211 struct metadata_dst *md = this_cpu_ptr(md_dst); 3212 u8 compat[sizeof(struct bpf_tunnel_key)]; 3213 struct ip_tunnel_info *info; 3214 3215 if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | 3216 BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER))) 3217 return 
-EINVAL; 3218 if (unlikely(size != sizeof(struct bpf_tunnel_key))) { 3219 switch (size) { 3220 case offsetof(struct bpf_tunnel_key, tunnel_label): 3221 case offsetof(struct bpf_tunnel_key, tunnel_ext): 3222 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): 3223 /* Fixup deprecated structure layouts here, so we have 3224 * a common path later on. 3225 */ 3226 memcpy(compat, from, size); 3227 memset(compat + size, 0, sizeof(compat) - size); 3228 from = (const struct bpf_tunnel_key *) compat; 3229 break; 3230 default: 3231 return -EINVAL; 3232 } 3233 } 3234 if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || 3235 from->tunnel_ext)) 3236 return -EINVAL; 3237 3238 skb_dst_drop(skb); 3239 dst_hold((struct dst_entry *) md); 3240 skb_dst_set(skb, (struct dst_entry *) md); 3241 3242 info = &md->u.tun_info; 3243 info->mode = IP_TUNNEL_INFO_TX; 3244 3245 info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; 3246 if (flags & BPF_F_DONT_FRAGMENT) 3247 info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; 3248 if (flags & BPF_F_ZERO_CSUM_TX) 3249 info->key.tun_flags &= ~TUNNEL_CSUM; 3250 if (flags & BPF_F_SEQ_NUMBER) 3251 info->key.tun_flags |= TUNNEL_SEQ; 3252 3253 info->key.tun_id = cpu_to_be64(from->tunnel_id); 3254 info->key.tos = from->tunnel_tos; 3255 info->key.ttl = from->tunnel_ttl; 3256 3257 if (flags & BPF_F_TUNINFO_IPV6) { 3258 info->mode |= IP_TUNNEL_INFO_IPV6; 3259 memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, 3260 sizeof(from->remote_ipv6)); 3261 info->key.label = cpu_to_be32(from->tunnel_label) & 3262 IPV6_FLOWLABEL_MASK; 3263 } else { 3264 info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); 3265 } 3266 3267 return 0; 3268 } 3269 3270 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { 3271 .func = bpf_skb_set_tunnel_key, 3272 .gpl_only = false, 3273 .ret_type = RET_INTEGER, 3274 .arg1_type = ARG_PTR_TO_CTX, 3275 .arg2_type = ARG_PTR_TO_MEM, 3276 .arg3_type = ARG_CONST_SIZE, 3277 .arg4_type = ARG_ANYTHING, 3278 }; 3279 3280 BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, 3281 const u8 *, from, u32, size) 3282 { 3283 struct ip_tunnel_info *info = skb_tunnel_info(skb); 3284 const struct metadata_dst *md = this_cpu_ptr(md_dst); 3285 3286 if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) 3287 return -EINVAL; 3288 if (unlikely(size > IP_TUNNEL_OPTS_MAX)) 3289 return -ENOMEM; 3290 3291 ip_tunnel_info_opts_set(info, from, size); 3292 3293 return 0; 3294 } 3295 3296 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { 3297 .func = bpf_skb_set_tunnel_opt, 3298 .gpl_only = false, 3299 .ret_type = RET_INTEGER, 3300 .arg1_type = ARG_PTR_TO_CTX, 3301 .arg2_type = ARG_PTR_TO_MEM, 3302 .arg3_type = ARG_CONST_SIZE, 3303 }; 3304 3305 static const struct bpf_func_proto * 3306 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) 3307 { 3308 if (!md_dst) { 3309 struct metadata_dst __percpu *tmp; 3310 3311 tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, 3312 METADATA_IP_TUNNEL, 3313 GFP_KERNEL); 3314 if (!tmp) 3315 return NULL; 3316 if (cmpxchg(&md_dst, NULL, tmp)) 3317 metadata_dst_free_percpu(tmp); 3318 } 3319 3320 switch (which) { 3321 case BPF_FUNC_skb_set_tunnel_key: 3322 return &bpf_skb_set_tunnel_key_proto; 3323 case BPF_FUNC_skb_set_tunnel_opt: 3324 return &bpf_skb_set_tunnel_opt_proto; 3325 default: 3326 return NULL; 3327 } 3328 } 3329 3330 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, 3331 u32, idx) 3332 { 3333 struct bpf_array *array = container_of(map, struct bpf_array, 
map); 3334 struct cgroup *cgrp; 3335 struct sock *sk; 3336 3337 sk = skb_to_full_sk(skb); 3338 if (!sk || !sk_fullsock(sk)) 3339 return -ENOENT; 3340 if (unlikely(idx >= array->map.max_entries)) 3341 return -E2BIG; 3342 3343 cgrp = READ_ONCE(array->ptrs[idx]); 3344 if (unlikely(!cgrp)) 3345 return -EAGAIN; 3346 3347 return sk_under_cgroup_hierarchy(sk, cgrp); 3348 } 3349 3350 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { 3351 .func = bpf_skb_under_cgroup, 3352 .gpl_only = false, 3353 .ret_type = RET_INTEGER, 3354 .arg1_type = ARG_PTR_TO_CTX, 3355 .arg2_type = ARG_CONST_MAP_PTR, 3356 .arg3_type = ARG_ANYTHING, 3357 }; 3358 3359 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, 3360 unsigned long off, unsigned long len) 3361 { 3362 memcpy(dst_buff, src_buff + off, len); 3363 return 0; 3364 } 3365 3366 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, 3367 u64, flags, void *, meta, u64, meta_size) 3368 { 3369 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 3370 3371 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 3372 return -EINVAL; 3373 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) 3374 return -EFAULT; 3375 3376 return bpf_event_output(map, flags, meta, meta_size, xdp->data, 3377 xdp_size, bpf_xdp_copy); 3378 } 3379 3380 static const struct bpf_func_proto bpf_xdp_event_output_proto = { 3381 .func = bpf_xdp_event_output, 3382 .gpl_only = true, 3383 .ret_type = RET_INTEGER, 3384 .arg1_type = ARG_PTR_TO_CTX, 3385 .arg2_type = ARG_CONST_MAP_PTR, 3386 .arg3_type = ARG_ANYTHING, 3387 .arg4_type = ARG_PTR_TO_MEM, 3388 .arg5_type = ARG_CONST_SIZE_OR_ZERO, 3389 }; 3390 3391 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) 3392 { 3393 return skb->sk ? 
sock_gen_cookie(skb->sk) : 0; 3394 } 3395 3396 static const struct bpf_func_proto bpf_get_socket_cookie_proto = { 3397 .func = bpf_get_socket_cookie, 3398 .gpl_only = false, 3399 .ret_type = RET_INTEGER, 3400 .arg1_type = ARG_PTR_TO_CTX, 3401 }; 3402 3403 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) 3404 { 3405 struct sock *sk = sk_to_full_sk(skb->sk); 3406 kuid_t kuid; 3407 3408 if (!sk || !sk_fullsock(sk)) 3409 return overflowuid; 3410 kuid = sock_net_uid(sock_net(sk), sk); 3411 return from_kuid_munged(sock_net(sk)->user_ns, kuid); 3412 } 3413 3414 static const struct bpf_func_proto bpf_get_socket_uid_proto = { 3415 .func = bpf_get_socket_uid, 3416 .gpl_only = false, 3417 .ret_type = RET_INTEGER, 3418 .arg1_type = ARG_PTR_TO_CTX, 3419 }; 3420 3421 BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, 3422 int, level, int, optname, char *, optval, int, optlen) 3423 { 3424 struct sock *sk = bpf_sock->sk; 3425 int ret = 0; 3426 int val; 3427 3428 if (!sk_fullsock(sk)) 3429 return -EINVAL; 3430 3431 if (level == SOL_SOCKET) { 3432 if (optlen != sizeof(int)) 3433 return -EINVAL; 3434 val = *((int *)optval); 3435 3436 /* Only some socketops are supported */ 3437 switch (optname) { 3438 case SO_RCVBUF: 3439 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 3440 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); 3441 break; 3442 case SO_SNDBUF: 3443 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 3444 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); 3445 break; 3446 case SO_MAX_PACING_RATE: 3447 sk->sk_max_pacing_rate = val; 3448 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 3449 sk->sk_max_pacing_rate); 3450 break; 3451 case SO_PRIORITY: 3452 sk->sk_priority = val; 3453 break; 3454 case SO_RCVLOWAT: 3455 if (val < 0) 3456 val = INT_MAX; 3457 sk->sk_rcvlowat = val ? 
: 1; 3458 break; 3459 case SO_MARK: 3460 sk->sk_mark = val; 3461 break; 3462 default: 3463 ret = -EINVAL; 3464 } 3465 #ifdef CONFIG_INET 3466 } else if (level == SOL_IP) { 3467 if (optlen != sizeof(int) || sk->sk_family != AF_INET) 3468 return -EINVAL; 3469 3470 val = *((int *)optval); 3471 /* Only some options are supported */ 3472 switch (optname) { 3473 case IP_TOS: 3474 if (val < -1 || val > 0xff) { 3475 ret = -EINVAL; 3476 } else { 3477 struct inet_sock *inet = inet_sk(sk); 3478 3479 if (val == -1) 3480 val = 0; 3481 inet->tos = val; 3482 } 3483 break; 3484 default: 3485 ret = -EINVAL; 3486 } 3487 #if IS_ENABLED(CONFIG_IPV6) 3488 } else if (level == SOL_IPV6) { 3489 if (optlen != sizeof(int) || sk->sk_family != AF_INET6) 3490 return -EINVAL; 3491 3492 val = *((int *)optval); 3493 /* Only some options are supported */ 3494 switch (optname) { 3495 case IPV6_TCLASS: 3496 if (val < -1 || val > 0xff) { 3497 ret = -EINVAL; 3498 } else { 3499 struct ipv6_pinfo *np = inet6_sk(sk); 3500 3501 if (val == -1) 3502 val = 0; 3503 np->tclass = val; 3504 } 3505 break; 3506 default: 3507 ret = -EINVAL; 3508 } 3509 #endif 3510 } else if (level == SOL_TCP && 3511 sk->sk_prot->setsockopt == tcp_setsockopt) { 3512 if (optname == TCP_CONGESTION) { 3513 char name[TCP_CA_NAME_MAX]; 3514 bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; 3515 3516 strncpy(name, optval, min_t(long, optlen, 3517 TCP_CA_NAME_MAX-1)); 3518 name[TCP_CA_NAME_MAX-1] = 0; 3519 ret = tcp_set_congestion_control(sk, name, false, 3520 reinit); 3521 } else { 3522 struct tcp_sock *tp = tcp_sk(sk); 3523 3524 if (optlen != sizeof(int)) 3525 return -EINVAL; 3526 3527 val = *((int *)optval); 3528 /* Only some options are supported */ 3529 switch (optname) { 3530 case TCP_BPF_IW: 3531 if (val <= 0 || tp->data_segs_out > 0) 3532 ret = -EINVAL; 3533 else 3534 tp->snd_cwnd = val; 3535 break; 3536 case TCP_BPF_SNDCWND_CLAMP: 3537 if (val <= 0) { 3538 ret = -EINVAL; 3539 } else { 3540 tp->snd_cwnd_clamp = val; 3541 tp->snd_ssthresh = val; 3542 } 3543 break; 3544 default: 3545 ret = -EINVAL; 3546 } 3547 } 3548 #endif 3549 } else { 3550 ret = -EINVAL; 3551 } 3552 return ret; 3553 } 3554 3555 static const struct bpf_func_proto bpf_setsockopt_proto = { 3556 .func = bpf_setsockopt, 3557 .gpl_only = false, 3558 .ret_type = RET_INTEGER, 3559 .arg1_type = ARG_PTR_TO_CTX, 3560 .arg2_type = ARG_ANYTHING, 3561 .arg3_type = ARG_ANYTHING, 3562 .arg4_type = ARG_PTR_TO_MEM, 3563 .arg5_type = ARG_CONST_SIZE, 3564 }; 3565 3566 BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, 3567 int, level, int, optname, char *, optval, int, optlen) 3568 { 3569 struct sock *sk = bpf_sock->sk; 3570 3571 if (!sk_fullsock(sk)) 3572 goto err_clear; 3573 3574 #ifdef CONFIG_INET 3575 if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { 3576 if (optname == TCP_CONGESTION) { 3577 struct inet_connection_sock *icsk = inet_csk(sk); 3578 3579 if (!icsk->icsk_ca_ops || optlen <= 1) 3580 goto err_clear; 3581 strncpy(optval, icsk->icsk_ca_ops->name, optlen); 3582 optval[optlen - 1] = 0; 3583 } else { 3584 goto err_clear; 3585 } 3586 } else if (level == SOL_IP) { 3587 struct inet_sock *inet = inet_sk(sk); 3588 3589 if (optlen != sizeof(int) || sk->sk_family != AF_INET) 3590 goto err_clear; 3591 3592 /* Only some options are supported */ 3593 switch (optname) { 3594 case IP_TOS: 3595 *((int *)optval) = (int)inet->tos; 3596 break; 3597 default: 3598 goto err_clear; 3599 } 3600 #if IS_ENABLED(CONFIG_IPV6) 3601 } else if (level == SOL_IPV6) { 3602 struct ipv6_pinfo 
*np = inet6_sk(sk); 3603 3604 if (optlen != sizeof(int) || sk->sk_family != AF_INET6) 3605 goto err_clear; 3606 3607 /* Only some options are supported */ 3608 switch (optname) { 3609 case IPV6_TCLASS: 3610 *((int *)optval) = (int)np->tclass; 3611 break; 3612 default: 3613 goto err_clear; 3614 } 3615 #endif 3616 } else { 3617 goto err_clear; 3618 } 3619 return 0; 3620 #endif 3621 err_clear: 3622 memset(optval, 0, optlen); 3623 return -EINVAL; 3624 } 3625 3626 static const struct bpf_func_proto bpf_getsockopt_proto = { 3627 .func = bpf_getsockopt, 3628 .gpl_only = false, 3629 .ret_type = RET_INTEGER, 3630 .arg1_type = ARG_PTR_TO_CTX, 3631 .arg2_type = ARG_ANYTHING, 3632 .arg3_type = ARG_ANYTHING, 3633 .arg4_type = ARG_PTR_TO_UNINIT_MEM, 3634 .arg5_type = ARG_CONST_SIZE, 3635 }; 3636 3637 BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, 3638 int, argval) 3639 { 3640 struct sock *sk = bpf_sock->sk; 3641 int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; 3642 3643 if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) 3644 return -EINVAL; 3645 3646 if (val) 3647 tcp_sk(sk)->bpf_sock_ops_cb_flags = val; 3648 3649 return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); 3650 } 3651 3652 static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { 3653 .func = bpf_sock_ops_cb_flags_set, 3654 .gpl_only = false, 3655 .ret_type = RET_INTEGER, 3656 .arg1_type = ARG_PTR_TO_CTX, 3657 .arg2_type = ARG_ANYTHING, 3658 }; 3659 3660 const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; 3661 EXPORT_SYMBOL_GPL(ipv6_bpf_stub); 3662 3663 BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, 3664 int, addr_len) 3665 { 3666 #ifdef CONFIG_INET 3667 struct sock *sk = ctx->sk; 3668 int err; 3669 3670 /* Binding to port can be expensive so it's prohibited in the helper. 3671 * Only binding to IP is supported. 
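	 * (Finding a free port can mean scanning the bind hash table
	 * under its lock, which a program could otherwise trigger on
	 * every connect; callers must therefore pass a zero port, as
	 * checked below.)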
3672 */ 3673 err = -EINVAL; 3674 if (addr->sa_family == AF_INET) { 3675 if (addr_len < sizeof(struct sockaddr_in)) 3676 return err; 3677 if (((struct sockaddr_in *)addr)->sin_port != htons(0)) 3678 return err; 3679 return __inet_bind(sk, addr, addr_len, true, false); 3680 #if IS_ENABLED(CONFIG_IPV6) 3681 } else if (addr->sa_family == AF_INET6) { 3682 if (addr_len < SIN6_LEN_RFC2133) 3683 return err; 3684 if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) 3685 return err; 3686 /* ipv6_bpf_stub cannot be NULL, since it's called from 3687 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded 3688 */ 3689 return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); 3690 #endif /* CONFIG_IPV6 */ 3691 } 3692 #endif /* CONFIG_INET */ 3693 3694 return -EAFNOSUPPORT; 3695 } 3696 3697 static const struct bpf_func_proto bpf_bind_proto = { 3698 .func = bpf_bind, 3699 .gpl_only = false, 3700 .ret_type = RET_INTEGER, 3701 .arg1_type = ARG_PTR_TO_CTX, 3702 .arg2_type = ARG_PTR_TO_MEM, 3703 .arg3_type = ARG_CONST_SIZE, 3704 }; 3705 3706 static const struct bpf_func_proto * 3707 bpf_base_func_proto(enum bpf_func_id func_id) 3708 { 3709 switch (func_id) { 3710 case BPF_FUNC_map_lookup_elem: 3711 return &bpf_map_lookup_elem_proto; 3712 case BPF_FUNC_map_update_elem: 3713 return &bpf_map_update_elem_proto; 3714 case BPF_FUNC_map_delete_elem: 3715 return &bpf_map_delete_elem_proto; 3716 case BPF_FUNC_get_prandom_u32: 3717 return &bpf_get_prandom_u32_proto; 3718 case BPF_FUNC_get_smp_processor_id: 3719 return &bpf_get_raw_smp_processor_id_proto; 3720 case BPF_FUNC_get_numa_node_id: 3721 return &bpf_get_numa_node_id_proto; 3722 case BPF_FUNC_tail_call: 3723 return &bpf_tail_call_proto; 3724 case BPF_FUNC_ktime_get_ns: 3725 return &bpf_ktime_get_ns_proto; 3726 case BPF_FUNC_trace_printk: 3727 if (capable(CAP_SYS_ADMIN)) 3728 return bpf_get_trace_printk_proto(); 3729 default: 3730 return NULL; 3731 } 3732 } 3733 3734 static const struct bpf_func_proto * 3735 sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3736 { 3737 switch (func_id) { 3738 /* inet and inet6 sockets are created in a process 3739 * context so there is always a valid uid/gid 3740 */ 3741 case BPF_FUNC_get_current_uid_gid: 3742 return &bpf_get_current_uid_gid_proto; 3743 default: 3744 return bpf_base_func_proto(func_id); 3745 } 3746 } 3747 3748 static const struct bpf_func_proto * 3749 sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3750 { 3751 switch (func_id) { 3752 /* inet and inet6 sockets are created in a process 3753 * context so there is always a valid uid/gid 3754 */ 3755 case BPF_FUNC_get_current_uid_gid: 3756 return &bpf_get_current_uid_gid_proto; 3757 case BPF_FUNC_bind: 3758 switch (prog->expected_attach_type) { 3759 case BPF_CGROUP_INET4_CONNECT: 3760 case BPF_CGROUP_INET6_CONNECT: 3761 return &bpf_bind_proto; 3762 default: 3763 return NULL; 3764 } 3765 default: 3766 return bpf_base_func_proto(func_id); 3767 } 3768 } 3769 3770 static const struct bpf_func_proto * 3771 sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3772 { 3773 switch (func_id) { 3774 case BPF_FUNC_skb_load_bytes: 3775 return &bpf_skb_load_bytes_proto; 3776 case BPF_FUNC_get_socket_cookie: 3777 return &bpf_get_socket_cookie_proto; 3778 case BPF_FUNC_get_socket_uid: 3779 return &bpf_get_socket_uid_proto; 3780 default: 3781 return bpf_base_func_proto(func_id); 3782 } 3783 } 3784 3785 static const struct bpf_func_proto * 3786 tc_cls_act_func_proto(enum bpf_func_id func_id, 
const struct bpf_prog *prog) 3787 { 3788 switch (func_id) { 3789 case BPF_FUNC_skb_store_bytes: 3790 return &bpf_skb_store_bytes_proto; 3791 case BPF_FUNC_skb_load_bytes: 3792 return &bpf_skb_load_bytes_proto; 3793 case BPF_FUNC_skb_pull_data: 3794 return &bpf_skb_pull_data_proto; 3795 case BPF_FUNC_csum_diff: 3796 return &bpf_csum_diff_proto; 3797 case BPF_FUNC_csum_update: 3798 return &bpf_csum_update_proto; 3799 case BPF_FUNC_l3_csum_replace: 3800 return &bpf_l3_csum_replace_proto; 3801 case BPF_FUNC_l4_csum_replace: 3802 return &bpf_l4_csum_replace_proto; 3803 case BPF_FUNC_clone_redirect: 3804 return &bpf_clone_redirect_proto; 3805 case BPF_FUNC_get_cgroup_classid: 3806 return &bpf_get_cgroup_classid_proto; 3807 case BPF_FUNC_skb_vlan_push: 3808 return &bpf_skb_vlan_push_proto; 3809 case BPF_FUNC_skb_vlan_pop: 3810 return &bpf_skb_vlan_pop_proto; 3811 case BPF_FUNC_skb_change_proto: 3812 return &bpf_skb_change_proto_proto; 3813 case BPF_FUNC_skb_change_type: 3814 return &bpf_skb_change_type_proto; 3815 case BPF_FUNC_skb_adjust_room: 3816 return &bpf_skb_adjust_room_proto; 3817 case BPF_FUNC_skb_change_tail: 3818 return &bpf_skb_change_tail_proto; 3819 case BPF_FUNC_skb_get_tunnel_key: 3820 return &bpf_skb_get_tunnel_key_proto; 3821 case BPF_FUNC_skb_set_tunnel_key: 3822 return bpf_get_skb_set_tunnel_proto(func_id); 3823 case BPF_FUNC_skb_get_tunnel_opt: 3824 return &bpf_skb_get_tunnel_opt_proto; 3825 case BPF_FUNC_skb_set_tunnel_opt: 3826 return bpf_get_skb_set_tunnel_proto(func_id); 3827 case BPF_FUNC_redirect: 3828 return &bpf_redirect_proto; 3829 case BPF_FUNC_get_route_realm: 3830 return &bpf_get_route_realm_proto; 3831 case BPF_FUNC_get_hash_recalc: 3832 return &bpf_get_hash_recalc_proto; 3833 case BPF_FUNC_set_hash_invalid: 3834 return &bpf_set_hash_invalid_proto; 3835 case BPF_FUNC_set_hash: 3836 return &bpf_set_hash_proto; 3837 case BPF_FUNC_perf_event_output: 3838 return &bpf_skb_event_output_proto; 3839 case BPF_FUNC_get_smp_processor_id: 3840 return &bpf_get_smp_processor_id_proto; 3841 case BPF_FUNC_skb_under_cgroup: 3842 return &bpf_skb_under_cgroup_proto; 3843 case BPF_FUNC_get_socket_cookie: 3844 return &bpf_get_socket_cookie_proto; 3845 case BPF_FUNC_get_socket_uid: 3846 return &bpf_get_socket_uid_proto; 3847 default: 3848 return bpf_base_func_proto(func_id); 3849 } 3850 } 3851 3852 static const struct bpf_func_proto * 3853 xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3854 { 3855 switch (func_id) { 3856 case BPF_FUNC_perf_event_output: 3857 return &bpf_xdp_event_output_proto; 3858 case BPF_FUNC_get_smp_processor_id: 3859 return &bpf_get_smp_processor_id_proto; 3860 case BPF_FUNC_csum_diff: 3861 return &bpf_csum_diff_proto; 3862 case BPF_FUNC_xdp_adjust_head: 3863 return &bpf_xdp_adjust_head_proto; 3864 case BPF_FUNC_xdp_adjust_meta: 3865 return &bpf_xdp_adjust_meta_proto; 3866 case BPF_FUNC_redirect: 3867 return &bpf_xdp_redirect_proto; 3868 case BPF_FUNC_redirect_map: 3869 return &bpf_xdp_redirect_map_proto; 3870 default: 3871 return bpf_base_func_proto(func_id); 3872 } 3873 } 3874 3875 static const struct bpf_func_proto * 3876 lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3877 { 3878 switch (func_id) { 3879 case BPF_FUNC_skb_load_bytes: 3880 return &bpf_skb_load_bytes_proto; 3881 case BPF_FUNC_skb_pull_data: 3882 return &bpf_skb_pull_data_proto; 3883 case BPF_FUNC_csum_diff: 3884 return &bpf_csum_diff_proto; 3885 case BPF_FUNC_get_cgroup_classid: 3886 return &bpf_get_cgroup_classid_proto; 3887 case 
BPF_FUNC_get_route_realm: 3888 return &bpf_get_route_realm_proto; 3889 case BPF_FUNC_get_hash_recalc: 3890 return &bpf_get_hash_recalc_proto; 3891 case BPF_FUNC_perf_event_output: 3892 return &bpf_skb_event_output_proto; 3893 case BPF_FUNC_get_smp_processor_id: 3894 return &bpf_get_smp_processor_id_proto; 3895 case BPF_FUNC_skb_under_cgroup: 3896 return &bpf_skb_under_cgroup_proto; 3897 default: 3898 return bpf_base_func_proto(func_id); 3899 } 3900 } 3901 3902 static const struct bpf_func_proto * 3903 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3904 { 3905 switch (func_id) { 3906 case BPF_FUNC_setsockopt: 3907 return &bpf_setsockopt_proto; 3908 case BPF_FUNC_getsockopt: 3909 return &bpf_getsockopt_proto; 3910 case BPF_FUNC_sock_ops_cb_flags_set: 3911 return &bpf_sock_ops_cb_flags_set_proto; 3912 case BPF_FUNC_sock_map_update: 3913 return &bpf_sock_map_update_proto; 3914 default: 3915 return bpf_base_func_proto(func_id); 3916 } 3917 } 3918 3919 static const struct bpf_func_proto * 3920 sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3921 { 3922 switch (func_id) { 3923 case BPF_FUNC_msg_redirect_map: 3924 return &bpf_msg_redirect_map_proto; 3925 case BPF_FUNC_msg_apply_bytes: 3926 return &bpf_msg_apply_bytes_proto; 3927 case BPF_FUNC_msg_cork_bytes: 3928 return &bpf_msg_cork_bytes_proto; 3929 case BPF_FUNC_msg_pull_data: 3930 return &bpf_msg_pull_data_proto; 3931 default: 3932 return bpf_base_func_proto(func_id); 3933 } 3934 } 3935 3936 static const struct bpf_func_proto * 3937 sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3938 { 3939 switch (func_id) { 3940 case BPF_FUNC_skb_store_bytes: 3941 return &bpf_skb_store_bytes_proto; 3942 case BPF_FUNC_skb_load_bytes: 3943 return &bpf_skb_load_bytes_proto; 3944 case BPF_FUNC_skb_pull_data: 3945 return &bpf_skb_pull_data_proto; 3946 case BPF_FUNC_skb_change_tail: 3947 return &bpf_skb_change_tail_proto; 3948 case BPF_FUNC_skb_change_head: 3949 return &bpf_skb_change_head_proto; 3950 case BPF_FUNC_get_socket_cookie: 3951 return &bpf_get_socket_cookie_proto; 3952 case BPF_FUNC_get_socket_uid: 3953 return &bpf_get_socket_uid_proto; 3954 case BPF_FUNC_sk_redirect_map: 3955 return &bpf_sk_redirect_map_proto; 3956 default: 3957 return bpf_base_func_proto(func_id); 3958 } 3959 } 3960 3961 static const struct bpf_func_proto * 3962 lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 3963 { 3964 switch (func_id) { 3965 case BPF_FUNC_skb_get_tunnel_key: 3966 return &bpf_skb_get_tunnel_key_proto; 3967 case BPF_FUNC_skb_set_tunnel_key: 3968 return bpf_get_skb_set_tunnel_proto(func_id); 3969 case BPF_FUNC_skb_get_tunnel_opt: 3970 return &bpf_skb_get_tunnel_opt_proto; 3971 case BPF_FUNC_skb_set_tunnel_opt: 3972 return bpf_get_skb_set_tunnel_proto(func_id); 3973 case BPF_FUNC_redirect: 3974 return &bpf_redirect_proto; 3975 case BPF_FUNC_clone_redirect: 3976 return &bpf_clone_redirect_proto; 3977 case BPF_FUNC_skb_change_tail: 3978 return &bpf_skb_change_tail_proto; 3979 case BPF_FUNC_skb_change_head: 3980 return &bpf_skb_change_head_proto; 3981 case BPF_FUNC_skb_store_bytes: 3982 return &bpf_skb_store_bytes_proto; 3983 case BPF_FUNC_csum_update: 3984 return &bpf_csum_update_proto; 3985 case BPF_FUNC_l3_csum_replace: 3986 return &bpf_l3_csum_replace_proto; 3987 case BPF_FUNC_l4_csum_replace: 3988 return &bpf_l4_csum_replace_proto; 3989 case BPF_FUNC_set_hash_invalid: 3990 return &bpf_set_hash_invalid_proto; 3991 default: 3992 return lwt_inout_func_proto(func_id, 
prog); 3993 } 3994 } 3995 3996 static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, 3997 const struct bpf_prog *prog, 3998 struct bpf_insn_access_aux *info) 3999 { 4000 const int size_default = sizeof(__u32); 4001 4002 if (off < 0 || off >= sizeof(struct __sk_buff)) 4003 return false; 4004 4005 /* The verifier guarantees that size > 0. */ 4006 if (off % size != 0) 4007 return false; 4008 4009 switch (off) { 4010 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 4011 if (off + size > offsetofend(struct __sk_buff, cb[4])) 4012 return false; 4013 break; 4014 case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): 4015 case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): 4016 case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): 4017 case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): 4018 case bpf_ctx_range(struct __sk_buff, data): 4019 case bpf_ctx_range(struct __sk_buff, data_meta): 4020 case bpf_ctx_range(struct __sk_buff, data_end): 4021 if (size != size_default) 4022 return false; 4023 break; 4024 default: 4025 /* Only narrow read access allowed for now. */ 4026 if (type == BPF_WRITE) { 4027 if (size != size_default) 4028 return false; 4029 } else { 4030 bpf_ctx_record_field_size(info, size_default); 4031 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 4032 return false; 4033 } 4034 } 4035 4036 return true; 4037 } 4038 4039 static bool sk_filter_is_valid_access(int off, int size, 4040 enum bpf_access_type type, 4041 const struct bpf_prog *prog, 4042 struct bpf_insn_access_aux *info) 4043 { 4044 switch (off) { 4045 case bpf_ctx_range(struct __sk_buff, tc_classid): 4046 case bpf_ctx_range(struct __sk_buff, data): 4047 case bpf_ctx_range(struct __sk_buff, data_meta): 4048 case bpf_ctx_range(struct __sk_buff, data_end): 4049 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 4050 return false; 4051 } 4052 4053 if (type == BPF_WRITE) { 4054 switch (off) { 4055 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 4056 break; 4057 default: 4058 return false; 4059 } 4060 } 4061 4062 return bpf_skb_is_valid_access(off, size, type, prog, info); 4063 } 4064 4065 static bool lwt_is_valid_access(int off, int size, 4066 enum bpf_access_type type, 4067 const struct bpf_prog *prog, 4068 struct bpf_insn_access_aux *info) 4069 { 4070 switch (off) { 4071 case bpf_ctx_range(struct __sk_buff, tc_classid): 4072 case bpf_ctx_range_till(struct __sk_buff, family, local_port): 4073 case bpf_ctx_range(struct __sk_buff, data_meta): 4074 return false; 4075 } 4076 4077 if (type == BPF_WRITE) { 4078 switch (off) { 4079 case bpf_ctx_range(struct __sk_buff, mark): 4080 case bpf_ctx_range(struct __sk_buff, priority): 4081 case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): 4082 break; 4083 default: 4084 return false; 4085 } 4086 } 4087 4088 switch (off) { 4089 case bpf_ctx_range(struct __sk_buff, data): 4090 info->reg_type = PTR_TO_PACKET; 4091 break; 4092 case bpf_ctx_range(struct __sk_buff, data_end): 4093 info->reg_type = PTR_TO_PACKET_END; 4094 break; 4095 } 4096 4097 return bpf_skb_is_valid_access(off, size, type, prog, info); 4098 } 4099 4100 4101 /* Attach type specific accesses */ 4102 static bool __sock_filter_check_attach_type(int off, 4103 enum bpf_access_type access_type, 4104 enum bpf_attach_type attach_type) 4105 { 4106 switch (off) { 4107 case offsetof(struct bpf_sock, bound_dev_if): 4108 case offsetof(struct bpf_sock, mark): 4109 case offsetof(struct bpf_sock, priority): 4110 
/* Attach type specific accesses */
static bool __sock_filter_check_attach_type(int off,
					    enum bpf_access_type access_type,
					    enum bpf_attach_type attach_type)
{
	switch (off) {
	case offsetof(struct bpf_sock, bound_dev_if):
	case offsetof(struct bpf_sock, mark):
	case offsetof(struct bpf_sock, priority):
		switch (attach_type) {
		case BPF_CGROUP_INET_SOCK_CREATE:
			goto full_access;
		default:
			return false;
		}
	case bpf_ctx_range(struct bpf_sock, src_ip4):
		switch (attach_type) {
		case BPF_CGROUP_INET4_POST_BIND:
			goto read_only;
		default:
			return false;
		}
	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
		switch (attach_type) {
		case BPF_CGROUP_INET6_POST_BIND:
			goto read_only;
		default:
			return false;
		}
	case bpf_ctx_range(struct bpf_sock, src_port):
		switch (attach_type) {
		case BPF_CGROUP_INET4_POST_BIND:
		case BPF_CGROUP_INET6_POST_BIND:
			goto read_only;
		default:
			return false;
		}
	}
read_only:
	return access_type == BPF_READ;
full_access:
	return true;
}

static bool __sock_filter_check_size(int off, int size,
				     struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	switch (off) {
	case bpf_ctx_range(struct bpf_sock, src_ip4):
	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
		bpf_ctx_record_field_size(info, size_default);
		return bpf_ctx_narrow_access_ok(off, size, size_default);
	}

	return size == size_default;
}

static bool sock_filter_is_valid_access(int off, int size,
					enum bpf_access_type type,
					const struct bpf_prog *prog,
					struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= sizeof(struct bpf_sock))
		return false;
	if (off % size != 0)
		return false;
	if (!__sock_filter_check_attach_type(off, type,
					     prog->expected_attach_type))
		return false;
	if (!__sock_filter_check_size(off, size, info))
		return false;
	return true;
}

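/* The effect of the gating above, by example: a program loaded with
 * expected_attach_type == BPF_CGROUP_INET4_POST_BIND may read
 * bpf_sock->src_ip4, but any store to it is refused, while only
 * BPF_CGROUP_INET_SOCK_CREATE programs get read/write access to
 * bound_dev_if, mark and priority. Sketch of what such a post-bind
 * program may and may not do:
 *
 *	int prog(struct bpf_sock *ctx)
 *	{
 *		__u32 ip = ctx->src_ip4;	accepted, read only
 *		ctx->src_ip4 = 0;		rejected by the verifier
 *		return 1;
 *	}
 */
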
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
				const struct bpf_prog *prog, int drop_verdict)
{
	struct bpf_insn *insn = insn_buf;

	if (!direct_write)
		return 0;

	/* if (!skb->cloned)
	 *       goto start;
	 *
	 * (Fast-path, otherwise approximation that we might be
	 *  a clone, do the rest in helper.)
	 */
	*insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
	*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);

	/* ret = bpf_skb_pull_data(skb, 0); */
	*insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
	*insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
	*insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
			       BPF_FUNC_skb_pull_data);
	/* if (!ret)
	 *      goto restore;
	 * return TC_ACT_SHOT;
	 */
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
	*insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
	*insn++ = BPF_EXIT_INSN();

	/* restore: */
	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
	/* start: */
	*insn++ = prog->insnsi[0];

	return insn - insn_buf;
}

static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
			       const struct bpf_prog *prog)
{
	return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT);
}

static bool tc_cls_act_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	if (type == BPF_WRITE) {
		switch (off) {
		case bpf_ctx_range(struct __sk_buff, mark):
		case bpf_ctx_range(struct __sk_buff, tc_index):
		case bpf_ctx_range(struct __sk_buff, priority):
		case bpf_ctx_range(struct __sk_buff, tc_classid):
		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
			break;
		default:
			return false;
		}
	}

	switch (off) {
	case bpf_ctx_range(struct __sk_buff, data):
		info->reg_type = PTR_TO_PACKET;
		break;
	case bpf_ctx_range(struct __sk_buff, data_meta):
		info->reg_type = PTR_TO_PACKET_META;
		break;
	case bpf_ctx_range(struct __sk_buff, data_end):
		info->reg_type = PTR_TO_PACKET_END;
		break;
	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
		return false;
	}

	return bpf_skb_is_valid_access(off, size, type, prog, info);
}

static bool __is_valid_xdp_access(int off, int size)
{
	if (off < 0 || off >= sizeof(struct xdp_md))
		return false;
	if (off % size != 0)
		return false;
	if (size != sizeof(__u32))
		return false;

	return true;
}

static bool xdp_is_valid_access(int off, int size,
				enum bpf_access_type type,
				const struct bpf_prog *prog,
				struct bpf_insn_access_aux *info)
{
	if (type == BPF_WRITE)
		return false;

	switch (off) {
	case offsetof(struct xdp_md, data):
		info->reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct xdp_md, data_meta):
		info->reg_type = PTR_TO_PACKET_META;
		break;
	case offsetof(struct xdp_md, data_end):
		info->reg_type = PTR_TO_PACKET_END;
		break;
	}

	return __is_valid_xdp_access(off, size);
}

void bpf_warn_invalid_xdp_action(u32 act)
{
	const u32 act_max = XDP_REDIRECT;

	WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
		  act > act_max ? "Illegal" : "Driver unsupported",
		  act);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);

static bool sock_addr_is_valid_access(int off, int size,
				      enum bpf_access_type type,
				      const struct bpf_prog *prog,
				      struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off >= sizeof(struct bpf_sock_addr))
		return false;
	if (off % size != 0)
		return false;

	/* Disallow access to IPv6 fields from IPv4 context and vice
	 * versa.
	 */
	switch (off) {
	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
		switch (prog->expected_attach_type) {
		case BPF_CGROUP_INET4_BIND:
		case BPF_CGROUP_INET4_CONNECT:
			break;
		default:
			return false;
		}
		break;
	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
		switch (prog->expected_attach_type) {
		case BPF_CGROUP_INET6_BIND:
		case BPF_CGROUP_INET6_CONNECT:
			break;
		default:
			return false;
		}
		break;
	}

	switch (off) {
	case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
		/* Only narrow read access allowed for now. */
		if (type == BPF_READ) {
			bpf_ctx_record_field_size(info, size_default);
			if (!bpf_ctx_narrow_access_ok(off, size, size_default))
				return false;
		} else {
			if (size != size_default)
				return false;
		}
		break;
	case bpf_ctx_range(struct bpf_sock_addr, user_port):
		if (size != size_default)
			return false;
		break;
	default:
		if (type == BPF_READ) {
			if (size != size_default)
				return false;
		} else {
			return false;
		}
	}

	return true;
}

static bool sock_ops_is_valid_access(int off, int size,
				     enum bpf_access_type type,
				     const struct bpf_prog *prog,
				     struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off >= sizeof(struct bpf_sock_ops))
		return false;

	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct bpf_sock_ops, reply):
		case offsetof(struct bpf_sock_ops, sk_txhash):
			if (size != size_default)
				return false;
			break;
		default:
			return false;
		}
	} else {
		switch (off) {
		case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
					bytes_acked):
			if (size != sizeof(__u64))
				return false;
			break;
		default:
			if (size != size_default)
				return false;
			break;
		}
	}

	return true;
}

static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
			   const struct bpf_prog *prog)
{
	return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);
}

static bool sk_skb_is_valid_access(int off, int size,
				   enum bpf_access_type type,
				   const struct bpf_prog *prog,
				   struct bpf_insn_access_aux *info)
{
	switch (off) {
	case bpf_ctx_range(struct __sk_buff, tc_classid):
	case bpf_ctx_range(struct __sk_buff, data_meta):
		return false;
	}

	if (type == BPF_WRITE) {
		switch (off) {
		case bpf_ctx_range(struct __sk_buff, tc_index):
		case bpf_ctx_range(struct __sk_buff, priority):
			break;
		default:
			return false;
		}
	}

	switch (off) {
	case bpf_ctx_range(struct __sk_buff, mark):
		return false;
	case bpf_ctx_range(struct __sk_buff, data):
		info->reg_type = PTR_TO_PACKET;
		break;
	case bpf_ctx_range(struct __sk_buff, data_end):
		info->reg_type = PTR_TO_PACKET_END;
		break;
	}

	return bpf_skb_is_valid_access(off, size, type, prog, info);
}

static bool sk_msg_is_valid_access(int off, int size,
				   enum bpf_access_type type,
				   const struct bpf_prog *prog,
				   struct bpf_insn_access_aux *info)
{
	if (type == BPF_WRITE)
		return false;

	switch (off) {
	case offsetof(struct sk_msg_md, data):
		info->reg_type = PTR_TO_PACKET;
		break;
	case offsetof(struct sk_msg_md, data_end):
		info->reg_type = PTR_TO_PACKET_END;
		break;
	}

	if (off < 0 || off >= sizeof(struct sk_msg_md))
		return false;
	if (off % size != 0)
		return false;
	if (size != sizeof(__u64))
		return false;

	return true;
}

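/* Everything below translates accesses to the UAPI context structures into
 * accesses to the real kernel objects, so that e.g. struct sk_buff itself
 * never becomes UAPI. A load of __sk_buff.len such as
 *
 *	r0 = *(u32 *)(r1 + offsetof(struct __sk_buff, len))
 *
 * is rewritten by bpf_convert_ctx_access() below into
 *
 *	r0 = *(u32 *)(r1 + offsetof(struct sk_buff, len))
 *
 * with r1 now treated as the real skb pointer.
 */
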
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
				  const struct bpf_insn *si,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct __sk_buff, len):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, len, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, protocol):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, protocol, 2,
						     target_size));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, vlan_proto, 2,
						     target_size));
		break;

	case offsetof(struct __sk_buff, priority):
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, priority, 4,
							     target_size));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, priority, 4,
							     target_size));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, skb_iif, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct net_device, ifindex, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, hash):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, hash, 4,
						     target_size));
		break;

	case offsetof(struct __sk_buff, mark):
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, mark, 4,
							     target_size));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, mark, 4,
							     target_size));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		*target_size = 1;
		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
				      PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
		break;

	case offsetof(struct __sk_buff, queue_mapping):
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, queue_mapping, 2,
						     target_size));
		break;

	case offsetof(struct __sk_buff, vlan_present):
	case offsetof(struct __sk_buff, vlan_tci):
		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);

		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, vlan_tci, 2,
						     target_size));
		if (si->off == offsetof(struct __sk_buff, vlan_tci)) {
			*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg,
						~VLAN_TAG_PRESENT);
		} else {
			*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12);
			*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
		}
		break;

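	/* The cb[0]..cb[4] range handled below is backed by the 20 scratch
	 * bytes of qdisc_skb_cb::data; prog->cb_access is flagged so that
	 * callers sharing skb->cb with other layers can save and restore
	 * the area around the program run. Roughly, for i in 0..4:
	 *
	 *	ctx->cb[i] <-> *(u32 *)(((struct qdisc_skb_cb *)skb->cb)->data + 4 * i)
	 */
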
	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetofend(struct __sk_buff, cb[4]) - 1:
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
		BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
			      offsetof(struct qdisc_skb_cb, data)) %
			     sizeof(__u64));

		prog->cb_access = 1;
		off = si->off;
		off -= offsetof(struct __sk_buff, cb[0]);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
					      si->src_reg, off);
		else
			*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
					      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, tc_classid):
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);

		off = si->off;
		off -= offsetof(struct __sk_buff, tc_classid);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct qdisc_skb_cb, tc_classid);
		*target_size = 2;
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
					      si->src_reg, off);
		else
			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
					      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, data));
		break;

	case offsetof(struct __sk_buff, data_meta):
		off = si->off;
		off -= offsetof(struct __sk_buff, data_meta);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct bpf_skb_data_end, data_meta);
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
				      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, data_end):
		off = si->off;
		off -= offsetof(struct __sk_buff, data_end);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct bpf_skb_data_end, data_end);
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
				      si->src_reg, off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, tc_index, 2,
							     target_size));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
					      bpf_target_off(struct sk_buff, tc_index, 2,
							     target_size));
#else
		*target_size = 2;
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      bpf_target_off(struct sk_buff, napi_id, 4,
						     target_size));
		*insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
		*target_size = 4;
		*insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
		break;
	case offsetof(struct __sk_buff, family):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_family,
						     2, target_size));
		break;

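	/* The family and remote/local address and port cases below are all
	 * two-step loads: first fetch skb->sk into dst_reg, then load the
	 * sock_common field through it, i.e. roughly:
	 *
	 *	ctx->remote_ip4 == skb->sk->__sk_common.skc_daddr
	 *
	 * The per-type is_valid_access hooks only expose these offsets to
	 * program types where skb->sk can be expected to be valid.
	 */
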
	case offsetof(struct __sk_buff, remote_ip4):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_daddr,
						     4, target_size));
		break;
	case offsetof(struct __sk_buff, local_ip4):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
					  skc_rcv_saddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_rcv_saddr,
						     4, target_size));
		break;
	case offsetof(struct __sk_buff, remote_ip6[0]) ...
	     offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
					  skc_v6_daddr.s6_addr32[0]) != 4);

		off = si->off;
		off -= offsetof(struct __sk_buff, remote_ip6[0]);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_daddr.s6_addr32[0]) +
				      off);
#else
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;
	case offsetof(struct __sk_buff, local_ip6[0]) ...
	     offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);

		off = si->off;
		off -= offsetof(struct __sk_buff, local_ip6[0]);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_rcv_saddr.s6_addr32[0]) +
				      off);
#else
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct __sk_buff, remote_port):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_dport,
						     2, target_size));
#ifndef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
		break;

	case offsetof(struct __sk_buff, local_port):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct sock_common,
						     skc_num, 2, target_size));
		break;
	}

	return insn - insn_buf;
}

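/* One subtlety in the port loads above: skc_dport is kept in network byte
 * order, and the UAPI contract is that __sk_buff.remote_port is network
 * byte order within a 32-bit field. The extra BPF_LSH by 16 on
 * little-endian hosts places the halfword so that the 32-bit value a
 * program reads comes out the same on both endiannesses; skc_num
 * (local_port) is host byte order and needs no such fixup.
 */
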
static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
					  const struct bpf_insn *si,
					  struct bpf_insn *insn_buf,
					  struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct bpf_sock, bound_dev_if):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      offsetof(struct sock, sk_bound_dev_if));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      offsetof(struct sock, sk_bound_dev_if));
		break;

	case offsetof(struct bpf_sock, mark):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      offsetof(struct sock, sk_mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      offsetof(struct sock, sk_mark));
		break;

	case offsetof(struct bpf_sock, priority):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      offsetof(struct sock, sk_priority));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      offsetof(struct sock, sk_priority));
		break;

	case offsetof(struct bpf_sock, family):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
				      offsetof(struct sock, sk_family));
		break;

	case offsetof(struct bpf_sock, type):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      offsetof(struct sock, __sk_flags_offset));
		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
		break;

	case offsetof(struct bpf_sock, protocol):
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
				      offsetof(struct sock, __sk_flags_offset));
		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
		break;

	case offsetof(struct bpf_sock, src_ip4):
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_rcv_saddr,
				       FIELD_SIZEOF(struct sock_common,
						    skc_rcv_saddr),
				       target_size));
		break;

	case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		off = si->off;
		off -= offsetof(struct bpf_sock, src_ip6[0]);
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(
				struct sock_common,
				skc_v6_rcv_saddr.s6_addr32[0],
				FIELD_SIZEOF(struct sock_common,
					     skc_v6_rcv_saddr.s6_addr32[0]),
				target_size) + off);
#else
		(void)off;
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct bpf_sock, src_port):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock_common, skc_num),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_num,
				       FIELD_SIZEOF(struct sock_common,
						    skc_num),
				       target_size));
		break;
	}

	return insn - insn_buf;
}

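/* bpf_sock->type and ->protocol above cannot be read as plain members
 * because sk_type and sk_protocol are C bitfields with no offsetof().
 * Instead the word at __sk_flags_offset is loaded and the bits are masked
 * and shifted out, equivalent to:
 *
 *	u32 word = *(u32 *)((void *)sk + offsetof(struct sock, __sk_flags_offset));
 *	type     = (word & SK_FL_TYPE_MASK)  >> SK_FL_TYPE_SHIFT;
 *	protocol = (word & SK_FL_PROTO_MASK) >> SK_FL_PROTO_SHIFT;
 */
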
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
					 const struct bpf_insn *si,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct __sk_buff, ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      bpf_target_off(struct net_device, ifindex, 4,
						     target_size));
		break;
	default:
		return bpf_convert_ctx_access(type, si, insn_buf, prog,
					      target_size);
	}

	return insn - insn_buf;
}

static u32 xdp_convert_ctx_access(enum bpf_access_type type,
				  const struct bpf_insn *si,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct xdp_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, data));
		break;
	case offsetof(struct xdp_md, data_meta):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, data_meta));
		break;
	case offsetof(struct xdp_md, data_end):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, data_end));
		break;
	case offsetof(struct xdp_md, ingress_ifindex):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, rxq));
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_rxq_info, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct net_device, ifindex));
		break;
	case offsetof(struct xdp_md, rx_queue_index):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
				      si->dst_reg, si->src_reg,
				      offsetof(struct xdp_buff, rxq));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct xdp_rxq_info,
					       queue_index));
		break;
	}

	return insn - insn_buf;
}

/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of
 * context Structure, F is Field in context structure that contains a pointer
 * to Nested Structure of type NS that has the field NF.
 *
 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to the caller to
 * make sure that SIZE is not greater than the actual size of S.F.NF.
 *
 * If offset OFF is provided, the load happens from that offset relative to
 * the offset of NF.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)	       \
	do {								       \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,    \
				      si->src_reg, offsetof(S, F));	       \
		*insn++ = BPF_LDX_MEM(				               \
			SIZE, si->dst_reg, si->dst_reg,			       \
			bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF),	       \
				       target_size)			       \
				+ OFF);					       \
	} while (0)

#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)			       \
	SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,		       \
					     BPF_FIELD_SIZEOF(NS, NF), 0)

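/* For illustration, SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
 * struct sockaddr, uaddr, sa_family) emits, modulo the bpf_target_off()
 * bookkeeping, the two-instruction sequence:
 *
 *	dst_reg = *(u64 *)(src_reg + offsetof(struct bpf_sock_addr_kern, uaddr))
 *	dst_reg = *(u16 *)(dst_reg + offsetof(struct sockaddr, sa_family))
 *
 * i.e. the context pointer is chased one level down into the nested
 * structure before the field itself is loaded.
 */
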
/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for the store operation.
 *
 * It doesn't support a SIZE argument though, since narrow stores are not
 * supported for now.
 *
 * In addition it uses Temporary Field TF (member of struct S) as the 3rd
 * "register", since the two registers available in convert_ctx_access are
 * not enough: we can't overwrite either SRC, since it contains the value to
 * store, or DST, since it contains the pointer to the context that may be
 * used by later instructions. But we need a temporary place to save the
 * pointer to the nested structure whose field we want to store to.
 */
#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF)		       \
	do {								       \
		int tmp_reg = BPF_REG_9;				       \
		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
			--tmp_reg;					       \
		if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)	       \
			--tmp_reg;					       \
		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,	       \
				      offsetof(S, TF));			       \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,	       \
				      si->dst_reg, offsetof(S, F));	       \
		*insn++ = BPF_STX_MEM(					       \
			BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg,	       \
			bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF),	       \
				       target_size)			       \
				+ OFF);					       \
		*insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,	       \
				      offsetof(S, TF));			       \
	} while (0)

#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
						      TF)		       \
	do {								       \
		if (type == BPF_WRITE) {				       \
			SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF,   \
							 TF);		       \
		} else {						       \
			SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(		       \
				S, NS, F, NF, SIZE, OFF);		       \
		}							       \
	} while (0)

#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF)		       \
	SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(			       \
		S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)

static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
					const struct bpf_insn *si,
					struct bpf_insn *insn_buf,
					struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct bpf_sock_addr, user_family):
		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
					    struct sockaddr, uaddr, sa_family);
		break;

	case offsetof(struct bpf_sock_addr, user_ip4):
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct sockaddr_in, uaddr,
			sin_addr, BPF_SIZE(si->code), 0, tmp_reg);
		break;

	case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
		off = si->off;
		off -= offsetof(struct bpf_sock_addr, user_ip6[0]);
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
			sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off,
			tmp_reg);
		break;

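	/* Register scavenging in SOCK_ADDR_STORE_NESTED_FIELD_OFF(), spelled
	 * out: BPF_REG_9 serves as the scratch register unless it is one of
	 * the two operands, in which case the next lower free register is
	 * picked. Its old value is parked in the tmp_reg slot of struct
	 * bpf_sock_addr_kern and restored afterwards, so from the program's
	 * point of view no register has been clobbered.
	 */
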
	case offsetof(struct bpf_sock_addr, user_port):
		/* To get the port we need to know sa_family first and then
		 * treat sockaddr as either sockaddr_in or sockaddr_in6.
		 * Though we can simplify, since the port field has the same
		 * offset and size in both structures.
		 * Here we check this invariant and use just one of the
		 * structures if it holds.
		 */
		BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
			     offsetof(struct sockaddr_in6, sin6_port));
		BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) !=
			     FIELD_SIZEOF(struct sockaddr_in6, sin6_port));
		SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
						     struct sockaddr_in6, uaddr,
						     sin6_port, tmp_reg);
		break;

	case offsetof(struct bpf_sock_addr, family):
		SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
					    struct sock, sk, sk_family);
		break;

	case offsetof(struct bpf_sock_addr, type):
		SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct sock, sk,
			__sk_flags_offset, BPF_W, 0);
		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
		break;

	case offsetof(struct bpf_sock_addr, protocol):
		SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
			struct bpf_sock_addr_kern, struct sock, sk,
			__sk_flags_offset, BPF_W, 0);
		*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
		*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
					SK_FL_PROTO_SHIFT);
		break;
	}

	return insn - insn_buf;
}

static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
				       const struct bpf_insn *si,
				       struct bpf_insn *insn_buf,
				       struct bpf_prog *prog,
				       u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct bpf_sock_ops, op) ...
	     offsetof(struct bpf_sock_ops, replylong[3]):
		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) !=
			     FIELD_SIZEOF(struct bpf_sock_ops_kern, op));
		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) !=
			     FIELD_SIZEOF(struct bpf_sock_ops_kern, reply));
		BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) !=
			     FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong));
		off = si->off;
		off -= offsetof(struct bpf_sock_ops, op);
		off += offsetof(struct bpf_sock_ops_kern, op);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      off);
		else
			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
					      off);
		break;

	case offsetof(struct bpf_sock_ops, family):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_family));
		break;

	case offsetof(struct bpf_sock_ops, remote_ip4):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_daddr));
		break;

	case offsetof(struct bpf_sock_ops, local_ip4):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_rcv_saddr));
		break;

	case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
	     offsetof(struct bpf_sock_ops, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
					  skc_v6_daddr.s6_addr32[0]) != 4);

		off = si->off;
		off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_daddr.s6_addr32[0]) +
				      off);
#else
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
	     offsetof(struct bpf_sock_ops, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);

		off = si->off;
		off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common,
					       skc_v6_rcv_saddr.s6_addr32[0]) +
				      off);
#else
		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
		break;

	case offsetof(struct bpf_sock_ops, remote_port):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
		break;

	case offsetof(struct bpf_sock_ops, local_port):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_num));
		break;

	case offsetof(struct bpf_sock_ops, is_fullsock):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern,
					      is_fullsock),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern,
					       is_fullsock));
		break;

	case offsetof(struct bpf_sock_ops, state):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
				      offsetof(struct sock_common, skc_state));
		break;

	case offsetof(struct bpf_sock_ops, rtt_min):
		BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
			     sizeof(struct minmax));
		BUILD_BUG_ON(sizeof(struct minmax) <
			     sizeof(struct minmax_sample));

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
					      struct bpf_sock_ops_kern, sk),
				      si->dst_reg, si->src_reg,
				      offsetof(struct bpf_sock_ops_kern, sk));
		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
				      offsetof(struct tcp_sock, rtt_min) +
				      FIELD_SIZEOF(struct minmax_sample, t));
		break;

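	/* The rtt_min load above banks on struct minmax being an array of
	 * struct minmax_sample { u32 t; u32 v; }: adding the size of the t
	 * member to the base offset lands on s[0].v, the current windowed
	 * minimum RTT in usecs, which is what the program observes as
	 * rtt_min.
	 */
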
/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
	do {								      \
		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
						struct bpf_sock_ops_kern,     \
						is_fullsock),		      \
				      si->dst_reg, si->src_reg,		      \
				      offsetof(struct bpf_sock_ops_kern,      \
					       is_fullsock));		      \
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2);	      \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
						struct bpf_sock_ops_kern, sk),\
				      si->dst_reg, si->src_reg,		      \
				      offsetof(struct bpf_sock_ops_kern, sk));\
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,		      \
						       OBJ_FIELD),	      \
				      si->dst_reg, si->dst_reg,		      \
				      offsetof(OBJ, OBJ_FIELD));	      \
	} while (0)

/* Helper macro for adding write access to tcp_sock or sock fields.
 * The macro is called with two registers, dst_reg which contains a pointer
 * to ctx (context) and src_reg which contains the value that should be
 * stored. However, we need an additional register since we cannot overwrite
 * dst_reg because it may be used later in the program.
 * Instead we "borrow" one of the other registers. We first save its value
 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
 * it at the end of the macro.
 */
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)			      \
	do {								      \
		int reg = BPF_REG_9;					      \
		BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) >		      \
			     FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD));   \
		if (si->dst_reg == reg || si->src_reg == reg)		      \
			reg--;						      \
		if (si->dst_reg == reg || si->src_reg == reg)		      \
			reg--;						      \
		*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,		      \
				      offsetof(struct bpf_sock_ops_kern,      \
					       temp));			      \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
						struct bpf_sock_ops_kern,     \
						is_fullsock),		      \
				      reg, si->dst_reg,			      \
				      offsetof(struct bpf_sock_ops_kern,      \
					       is_fullsock));		      \
		*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);		      \
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			      \
						struct bpf_sock_ops_kern, sk),\
				      reg, si->dst_reg,			      \
				      offsetof(struct bpf_sock_ops_kern, sk));\
		*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD),	      \
				      reg, si->src_reg,			      \
				      offsetof(OBJ, OBJ_FIELD));	      \
		*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,		      \
				      offsetof(struct bpf_sock_ops_kern,      \
					       temp));			      \
	} while (0)

#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)	      \
	do {								      \
		if (TYPE == BPF_WRITE)					      \
			SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
		else							      \
			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
	} while (0)

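/* Reads emitted by SOCK_OPS_GET_FIELD() are guarded by is_fullsock: when
 * the callback runs on a request or timewait socket, the BPF_JEQ above
 * skips the tcp_sock dereference and the destination register keeps the 0
 * it just loaded from is_fullsock, so e.g. a snd_cwnd read simply yields 0
 * instead of a wild pointer access.
 */
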
	case offsetof(struct bpf_sock_ops, snd_cwnd):
		SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, srtt_us):
		SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
		SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
				   struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, snd_ssthresh):
		SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, rcv_nxt):
		SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, snd_nxt):
		SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, snd_una):
		SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, mss_cache):
		SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, ecn_flags):
		SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, rate_delivered):
		SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
				   struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, rate_interval_us):
		SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
				   struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, packets_out):
		SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, retrans_out):
		SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, total_retrans):
		SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
				   struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, segs_in):
		SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, data_segs_in):
		SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, segs_out):
		SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, data_segs_out):
		SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
				   struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, lost_out):
		SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, sacked_out):
		SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, sk_txhash):
		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
					  struct sock, type);
		break;

	case offsetof(struct bpf_sock_ops, bytes_received):
		SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
				   struct tcp_sock);
		break;

	case offsetof(struct bpf_sock_ops, bytes_acked):
		SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
		break;
	}
	return insn - insn_buf;
}

static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;
	int off;

	switch (si->off) {
	case offsetof(struct __sk_buff, data_end):
		off = si->off;
		off -= offsetof(struct __sk_buff, data_end);
		off += offsetof(struct sk_buff, cb);
		off += offsetof(struct tcp_skb_cb, bpf.data_end);
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
				      si->src_reg, off);
		break;
	default:
		return bpf_convert_ctx_access(type, si, insn_buf, prog,
					      target_size);
	}

	return insn - insn_buf;
}

static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct sk_msg_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg_buff, data));
		break;
	case offsetof(struct sk_msg_md, data_end):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
				      si->dst_reg, si->src_reg,
				      offsetof(struct sk_msg_buff, data_end));
		break;
	}

	return insn - insn_buf;
}

const struct bpf_verifier_ops sk_filter_verifier_ops = {
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops sk_filter_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
	.get_func_proto		= tc_cls_act_func_proto,
	.is_valid_access	= tc_cls_act_is_valid_access,
	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
};

const struct bpf_prog_ops tc_cls_act_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops xdp_verifier_ops = {
	.get_func_proto		= xdp_func_proto,
	.is_valid_access	= xdp_is_valid_access,
	.convert_ctx_access	= xdp_convert_ctx_access,
};

const struct bpf_prog_ops xdp_prog_ops = {
	.test_run		= bpf_prog_test_run_xdp,
};

const struct bpf_verifier_ops cg_skb_verifier_ops = {
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops cg_skb_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_inout_verifier_ops = {
	.get_func_proto		= lwt_inout_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
};

const struct bpf_prog_ops lwt_inout_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
	.get_func_proto		= lwt_xmit_func_proto,
	.is_valid_access	= lwt_is_valid_access,
	.convert_ctx_access	= bpf_convert_ctx_access,
	.gen_prologue		= tc_cls_act_prologue,
};

const struct bpf_prog_ops lwt_xmit_prog_ops = {
	.test_run		= bpf_prog_test_run_skb,
};

const struct bpf_verifier_ops cg_sock_verifier_ops = {
	.get_func_proto		= sock_filter_func_proto,
	.is_valid_access	= sock_filter_is_valid_access,
	.convert_ctx_access	= sock_filter_convert_ctx_access,
};

const struct bpf_prog_ops cg_sock_prog_ops = {
};

const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
	.get_func_proto		= sock_addr_func_proto,
	.is_valid_access	= sock_addr_is_valid_access,
	.convert_ctx_access	= sock_addr_convert_ctx_access,
};

const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

const struct bpf_verifier_ops sock_ops_verifier_ops = {
	.get_func_proto		= sock_ops_func_proto,
	.is_valid_access	= sock_ops_is_valid_access,
	.convert_ctx_access	= sock_ops_convert_ctx_access,
};

const struct bpf_prog_ops sock_ops_prog_ops = {
};

const struct bpf_verifier_ops sk_skb_verifier_ops = {
	.get_func_proto		= sk_skb_func_proto,
	.is_valid_access	= sk_skb_is_valid_access,
	.convert_ctx_access	= sk_skb_convert_ctx_access,
	.gen_prologue		= sk_skb_prologue,
};

const struct bpf_prog_ops sk_skb_prog_ops = {
};

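/* These ops tables are what ties each program type together: the
 * BPF_PROG_TYPE() entries in include/linux/bpf_types.h associate e.g.
 * BPF_PROG_TYPE_SK_MSG with sk_msg_verifier_ops and sk_msg_prog_ops, so
 * the verifier picks up the matching helper list, access checks, context
 * conversion and (where present) prologue for every program it loads:
 *
 *	BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
 */
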
}; 5579 5580 const struct bpf_verifier_ops sk_msg_verifier_ops = { 5581 .get_func_proto = sk_msg_func_proto, 5582 .is_valid_access = sk_msg_is_valid_access, 5583 .convert_ctx_access = sk_msg_convert_ctx_access, 5584 }; 5585 5586 const struct bpf_prog_ops sk_msg_prog_ops = { 5587 }; 5588 5589 int sk_detach_filter(struct sock *sk) 5590 { 5591 int ret = -ENOENT; 5592 struct sk_filter *filter; 5593 5594 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 5595 return -EPERM; 5596 5597 filter = rcu_dereference_protected(sk->sk_filter, 5598 lockdep_sock_is_held(sk)); 5599 if (filter) { 5600 RCU_INIT_POINTER(sk->sk_filter, NULL); 5601 sk_filter_uncharge(sk, filter); 5602 ret = 0; 5603 } 5604 5605 return ret; 5606 } 5607 EXPORT_SYMBOL_GPL(sk_detach_filter); 5608 5609 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, 5610 unsigned int len) 5611 { 5612 struct sock_fprog_kern *fprog; 5613 struct sk_filter *filter; 5614 int ret = 0; 5615 5616 lock_sock(sk); 5617 filter = rcu_dereference_protected(sk->sk_filter, 5618 lockdep_sock_is_held(sk)); 5619 if (!filter) 5620 goto out; 5621 5622 /* We're copying the filter that has been originally attached, 5623 * so no conversion/decode needed anymore. eBPF programs that 5624 * have no original program cannot be dumped through this. 5625 */ 5626 ret = -EACCES; 5627 fprog = filter->prog->orig_prog; 5628 if (!fprog) 5629 goto out; 5630 5631 ret = fprog->len; 5632 if (!len) 5633 /* User space only enquires number of filter blocks. */ 5634 goto out; 5635 5636 ret = -EINVAL; 5637 if (len < fprog->len) 5638 goto out; 5639 5640 ret = -EFAULT; 5641 if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) 5642 goto out; 5643 5644 /* Instead of bytes, the API requests to return the number 5645 * of filter blocks. 5646 */ 5647 ret = fprog->len; 5648 out: 5649 release_sock(sk); 5650 return ret; 5651 } 5652