1 /* 2 * Linux Socket Filter - Kernel level socket filtering 3 * 4 * Based on the design of the Berkeley Packet Filter. The new 5 * internal format has been designed by PLUMgrid: 6 * 7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com 8 * 9 * Authors: 10 * 11 * Jay Schulist <jschlst@samba.org> 12 * Alexei Starovoitov <ast@plumgrid.com> 13 * Daniel Borkmann <dborkman@redhat.com> 14 * 15 * This program is free software; you can redistribute it and/or 16 * modify it under the terms of the GNU General Public License 17 * as published by the Free Software Foundation; either version 18 * 2 of the License, or (at your option) any later version. 19 * 20 * Andi Kleen - Fix a few bad bugs and races. 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 22 */ 23 24 #include <linux/module.h> 25 #include <linux/types.h> 26 #include <linux/mm.h> 27 #include <linux/fcntl.h> 28 #include <linux/socket.h> 29 #include <linux/in.h> 30 #include <linux/inet.h> 31 #include <linux/netdevice.h> 32 #include <linux/if_packet.h> 33 #include <linux/gfp.h> 34 #include <net/ip.h> 35 #include <net/protocol.h> 36 #include <net/netlink.h> 37 #include <linux/skbuff.h> 38 #include <net/sock.h> 39 #include <net/flow_dissector.h> 40 #include <linux/errno.h> 41 #include <linux/timer.h> 42 #include <asm/uaccess.h> 43 #include <asm/unaligned.h> 44 #include <linux/filter.h> 45 #include <linux/ratelimit.h> 46 #include <linux/seccomp.h> 47 #include <linux/if_vlan.h> 48 #include <linux/bpf.h> 49 #include <net/sch_generic.h> 50 #include <net/cls_cgroup.h> 51 #include <net/dst_metadata.h> 52 #include <net/dst.h> 53 #include <net/sock_reuseport.h> 54 55 /** 56 * sk_filter_trim_cap - run a packet through a socket filter 57 * @sk: sock associated with &sk_buff 58 * @skb: buffer to filter 59 * @cap: limit on how short the eBPF program may trim the packet 60 * 61 * Run the eBPF program and then cut skb->data to correct size returned by 62 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller 63 * than pkt_len we keep whole skb->data. This is the socket level 64 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should 65 * be accepted or -EPERM if the packet should be tossed. 66 * 67 */ 68 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) 69 { 70 int err; 71 struct sk_filter *filter; 72 73 /* 74 * If the skb was allocated from pfmemalloc reserves, only 75 * allow SOCK_MEMALLOC sockets to use it as this socket is 76 * helping free memory 77 */ 78 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) 79 return -ENOMEM; 80 81 err = security_sock_rcv_skb(sk, skb); 82 if (err) 83 return err; 84 85 rcu_read_lock(); 86 filter = rcu_dereference(sk->sk_filter); 87 if (filter) { 88 unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); 89 err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; 90 } 91 rcu_read_unlock(); 92 93 return err; 94 } 95 EXPORT_SYMBOL(sk_filter_trim_cap); 96 97 BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) 98 { 99 return skb_get_poff(skb); 100 } 101 102 BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) 103 { 104 struct nlattr *nla; 105 106 if (skb_is_nonlinear(skb)) 107 return 0; 108 109 if (skb->len < sizeof(struct nlattr)) 110 return 0; 111 112 if (a > skb->len - sizeof(struct nlattr)) 113 return 0; 114 115 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); 116 if (nla) 117 return (void *) nla - (void *) skb->data; 118 119 return 0; 120 } 121 122 BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) 123 { 124 struct nlattr *nla; 125 126 if (skb_is_nonlinear(skb)) 127 return 0; 128 129 if (skb->len < sizeof(struct nlattr)) 130 return 0; 131 132 if (a > skb->len - sizeof(struct nlattr)) 133 return 0; 134 135 nla = (struct nlattr *) &skb->data[a]; 136 if (nla->nla_len > skb->len - a) 137 return 0; 138 139 nla = nla_find_nested(nla, x); 140 if (nla) 141 return (void *) nla - (void *) skb->data; 142 143 return 0; 144 } 145 146 BPF_CALL_0(__get_raw_cpu_id) 147 { 148 return raw_smp_processor_id(); 149 } 150 151 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { 152 .func = __get_raw_cpu_id, 153 .gpl_only = false, 154 .ret_type = RET_INTEGER, 155 }; 156 157 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, 158 struct bpf_insn *insn_buf) 159 { 160 struct bpf_insn *insn = insn_buf; 161 162 switch (skb_field) { 163 case SKF_AD_MARK: 164 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 165 166 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 167 offsetof(struct sk_buff, mark)); 168 break; 169 170 case SKF_AD_PKTTYPE: 171 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); 172 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); 173 #ifdef __BIG_ENDIAN_BITFIELD 174 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); 175 #endif 176 break; 177 178 case SKF_AD_QUEUE: 179 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); 180 181 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 182 offsetof(struct sk_buff, queue_mapping)); 183 break; 184 185 case SKF_AD_VLAN_TAG: 186 case SKF_AD_VLAN_TAG_PRESENT: 187 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); 188 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); 189 190 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ 191 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 192 offsetof(struct sk_buff, vlan_tci)); 193 if (skb_field == SKF_AD_VLAN_TAG) { 194 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 195 ~VLAN_TAG_PRESENT); 196 } else { 197 /* dst_reg >>= 12 */ 198 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); 199 /* dst_reg &= 1 */ 200 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); 201 } 202 break; 203 } 204 205 return insn - insn_buf; 206 } 207 208 static bool convert_bpf_extensions(struct sock_filter *fp, 209 struct bpf_insn **insnp) 210 { 211 struct bpf_insn *insn = *insnp; 212 u32 cnt; 213 214 switch (fp->k) { 215 case SKF_AD_OFF + SKF_AD_PROTOCOL: 216 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 217 218 /* A = *(u16 *) (CTX + offsetof(protocol)) */ 219 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 220 offsetof(struct sk_buff, protocol)); 221 /* A = ntohs(A) [emitting a nop or swap16] */ 222 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 223 break; 224 225 case SKF_AD_OFF + SKF_AD_PKTTYPE: 226 cnt = 
convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); 227 insn += cnt - 1; 228 break; 229 230 case SKF_AD_OFF + SKF_AD_IFINDEX: 231 case SKF_AD_OFF + SKF_AD_HATYPE: 232 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 233 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); 234 235 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 236 BPF_REG_TMP, BPF_REG_CTX, 237 offsetof(struct sk_buff, dev)); 238 /* if (tmp != 0) goto pc + 1 */ 239 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); 240 *insn++ = BPF_EXIT_INSN(); 241 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) 242 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, 243 offsetof(struct net_device, ifindex)); 244 else 245 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, 246 offsetof(struct net_device, type)); 247 break; 248 249 case SKF_AD_OFF + SKF_AD_MARK: 250 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); 251 insn += cnt - 1; 252 break; 253 254 case SKF_AD_OFF + SKF_AD_RXHASH: 255 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 256 257 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, 258 offsetof(struct sk_buff, hash)); 259 break; 260 261 case SKF_AD_OFF + SKF_AD_QUEUE: 262 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); 263 insn += cnt - 1; 264 break; 265 266 case SKF_AD_OFF + SKF_AD_VLAN_TAG: 267 cnt = convert_skb_access(SKF_AD_VLAN_TAG, 268 BPF_REG_A, BPF_REG_CTX, insn); 269 insn += cnt - 1; 270 break; 271 272 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: 273 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 274 BPF_REG_A, BPF_REG_CTX, insn); 275 insn += cnt - 1; 276 break; 277 278 case SKF_AD_OFF + SKF_AD_VLAN_TPID: 279 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 280 281 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ 282 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 283 offsetof(struct sk_buff, vlan_proto)); 284 /* A = ntohs(A) [emitting a nop or swap16] */ 285 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 286 break; 287 288 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 289 case SKF_AD_OFF + SKF_AD_NLATTR: 290 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 291 case SKF_AD_OFF + SKF_AD_CPU: 292 case SKF_AD_OFF + SKF_AD_RANDOM: 293 /* arg1 = CTX */ 294 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); 295 /* arg2 = A */ 296 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); 297 /* arg3 = X */ 298 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); 299 /* Emit call(arg1=CTX, arg2=A, arg3=X) */ 300 switch (fp->k) { 301 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 302 *insn = BPF_EMIT_CALL(__skb_get_pay_offset); 303 break; 304 case SKF_AD_OFF + SKF_AD_NLATTR: 305 *insn = BPF_EMIT_CALL(__skb_get_nlattr); 306 break; 307 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 308 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); 309 break; 310 case SKF_AD_OFF + SKF_AD_CPU: 311 *insn = BPF_EMIT_CALL(__get_raw_cpu_id); 312 break; 313 case SKF_AD_OFF + SKF_AD_RANDOM: 314 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); 315 bpf_user_rnd_init_once(); 316 break; 317 } 318 break; 319 320 case SKF_AD_OFF + SKF_AD_ALU_XOR_X: 321 /* A ^= X */ 322 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); 323 break; 324 325 default: 326 /* This is just a dummy call to avoid letting the compiler 327 * evict __bpf_call_base() as an optimization. Placed here 328 * where no-one bothers. 
329 */ 330 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); 331 return false; 332 } 333 334 *insnp = insn; 335 return true; 336 } 337 338 /** 339 * bpf_convert_filter - convert filter program 340 * @prog: the user passed filter program 341 * @len: the length of the user passed filter program 342 * @new_prog: buffer where converted program will be stored 343 * @new_len: pointer to store length of converted program 344 * 345 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. 346 * Conversion workflow: 347 * 348 * 1) First pass for calculating the new program length: 349 * bpf_convert_filter(old_prog, old_len, NULL, &new_len) 350 * 351 * 2) 2nd pass to remap in two passes: 1st pass finds new 352 * jump offsets, 2nd pass remapping: 353 * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); 354 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); 355 */ 356 static int bpf_convert_filter(struct sock_filter *prog, int len, 357 struct bpf_insn *new_prog, int *new_len) 358 { 359 int new_flen = 0, pass = 0, target, i; 360 struct bpf_insn *new_insn; 361 struct sock_filter *fp; 362 int *addrs = NULL; 363 u8 bpf_src; 364 365 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); 366 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 367 368 if (len <= 0 || len > BPF_MAXINSNS) 369 return -EINVAL; 370 371 if (new_prog) { 372 addrs = kcalloc(len, sizeof(*addrs), 373 GFP_KERNEL | __GFP_NOWARN); 374 if (!addrs) 375 return -ENOMEM; 376 } 377 378 do_pass: 379 new_insn = new_prog; 380 fp = prog; 381 382 /* Classic BPF related prologue emission. */ 383 if (new_insn) { 384 /* Classic BPF expects A and X to be reset first. These need 385 * to be guaranteed to be the first two instructions. 386 */ 387 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 388 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); 389 390 /* All programs must keep CTX in callee saved BPF_REG_CTX. 391 * In eBPF case it's done by the compiler, here we need to 392 * do this ourself. Initial CTX is present in BPF_REG_ARG1. 393 */ 394 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); 395 } else { 396 new_insn += 3; 397 } 398 399 for (i = 0; i < len; fp++, i++) { 400 struct bpf_insn tmp_insns[6] = { }; 401 struct bpf_insn *insn = tmp_insns; 402 403 if (addrs) 404 addrs[i] = new_insn - new_prog; 405 406 switch (fp->code) { 407 /* All arithmetic insns and skb loads map as-is. */ 408 case BPF_ALU | BPF_ADD | BPF_X: 409 case BPF_ALU | BPF_ADD | BPF_K: 410 case BPF_ALU | BPF_SUB | BPF_X: 411 case BPF_ALU | BPF_SUB | BPF_K: 412 case BPF_ALU | BPF_AND | BPF_X: 413 case BPF_ALU | BPF_AND | BPF_K: 414 case BPF_ALU | BPF_OR | BPF_X: 415 case BPF_ALU | BPF_OR | BPF_K: 416 case BPF_ALU | BPF_LSH | BPF_X: 417 case BPF_ALU | BPF_LSH | BPF_K: 418 case BPF_ALU | BPF_RSH | BPF_X: 419 case BPF_ALU | BPF_RSH | BPF_K: 420 case BPF_ALU | BPF_XOR | BPF_X: 421 case BPF_ALU | BPF_XOR | BPF_K: 422 case BPF_ALU | BPF_MUL | BPF_X: 423 case BPF_ALU | BPF_MUL | BPF_K: 424 case BPF_ALU | BPF_DIV | BPF_X: 425 case BPF_ALU | BPF_DIV | BPF_K: 426 case BPF_ALU | BPF_MOD | BPF_X: 427 case BPF_ALU | BPF_MOD | BPF_K: 428 case BPF_ALU | BPF_NEG: 429 case BPF_LD | BPF_ABS | BPF_W: 430 case BPF_LD | BPF_ABS | BPF_H: 431 case BPF_LD | BPF_ABS | BPF_B: 432 case BPF_LD | BPF_IND | BPF_W: 433 case BPF_LD | BPF_IND | BPF_H: 434 case BPF_LD | BPF_IND | BPF_B: 435 /* Check for overloaded BPF extension and 436 * directly convert it if found, otherwise 437 * just move on with mapping. 
438 */ 439 if (BPF_CLASS(fp->code) == BPF_LD && 440 BPF_MODE(fp->code) == BPF_ABS && 441 convert_bpf_extensions(fp, &insn)) 442 break; 443 444 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); 445 break; 446 447 /* Jump transformation cannot use BPF block macros 448 * everywhere as offset calculation and target updates 449 * require a bit more work than the rest, i.e. jump 450 * opcodes map as-is, but offsets need adjustment. 451 */ 452 453 #define BPF_EMIT_JMP \ 454 do { \ 455 if (target >= len || target < 0) \ 456 goto err; \ 457 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ 458 /* Adjust pc relative offset for 2nd or 3rd insn. */ \ 459 insn->off -= insn - tmp_insns; \ 460 } while (0) 461 462 case BPF_JMP | BPF_JA: 463 target = i + fp->k + 1; 464 insn->code = fp->code; 465 BPF_EMIT_JMP; 466 break; 467 468 case BPF_JMP | BPF_JEQ | BPF_K: 469 case BPF_JMP | BPF_JEQ | BPF_X: 470 case BPF_JMP | BPF_JSET | BPF_K: 471 case BPF_JMP | BPF_JSET | BPF_X: 472 case BPF_JMP | BPF_JGT | BPF_K: 473 case BPF_JMP | BPF_JGT | BPF_X: 474 case BPF_JMP | BPF_JGE | BPF_K: 475 case BPF_JMP | BPF_JGE | BPF_X: 476 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { 477 /* BPF immediates are signed, zero extend 478 * immediate into tmp register and use it 479 * in compare insn. 480 */ 481 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); 482 483 insn->dst_reg = BPF_REG_A; 484 insn->src_reg = BPF_REG_TMP; 485 bpf_src = BPF_X; 486 } else { 487 insn->dst_reg = BPF_REG_A; 488 insn->imm = fp->k; 489 bpf_src = BPF_SRC(fp->code); 490 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; 491 } 492 493 /* Common case where 'jump_false' is next insn. */ 494 if (fp->jf == 0) { 495 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 496 target = i + fp->jt + 1; 497 BPF_EMIT_JMP; 498 break; 499 } 500 501 /* Convert JEQ into JNE when 'jump_true' is next insn. */ 502 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { 503 insn->code = BPF_JMP | BPF_JNE | bpf_src; 504 target = i + fp->jf + 1; 505 BPF_EMIT_JMP; 506 break; 507 } 508 509 /* Other jumps are mapped into two insns: Jxx and JA. */ 510 target = i + fp->jt + 1; 511 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 512 BPF_EMIT_JMP; 513 insn++; 514 515 insn->code = BPF_JMP | BPF_JA; 516 target = i + fp->jf + 1; 517 BPF_EMIT_JMP; 518 break; 519 520 /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ 521 case BPF_LDX | BPF_MSH | BPF_B: 522 /* tmp = A */ 523 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); 524 /* A = BPF_R0 = *(u8 *) (skb->data + K) */ 525 *insn++ = BPF_LD_ABS(BPF_B, fp->k); 526 /* A &= 0xf */ 527 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); 528 /* A <<= 2 */ 529 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); 530 /* X = A */ 531 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 532 /* A = tmp */ 533 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); 534 break; 535 536 /* RET_K is remaped into 2 insns. RET_A case doesn't need an 537 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. 538 */ 539 case BPF_RET | BPF_A: 540 case BPF_RET | BPF_K: 541 if (BPF_RVAL(fp->code) == BPF_K) 542 *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 543 0, fp->k); 544 *insn = BPF_EXIT_INSN(); 545 break; 546 547 /* Store to stack. */ 548 case BPF_ST: 549 case BPF_STX: 550 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == 551 BPF_ST ? BPF_REG_A : BPF_REG_X, 552 -(BPF_MEMWORDS - fp->k) * 4); 553 break; 554 555 /* Load from stack. */ 556 case BPF_LD | BPF_MEM: 557 case BPF_LDX | BPF_MEM: 558 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 
559 BPF_REG_A : BPF_REG_X, BPF_REG_FP, 560 -(BPF_MEMWORDS - fp->k) * 4); 561 break; 562 563 /* A = K or X = K */ 564 case BPF_LD | BPF_IMM: 565 case BPF_LDX | BPF_IMM: 566 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 567 BPF_REG_A : BPF_REG_X, fp->k); 568 break; 569 570 /* X = A */ 571 case BPF_MISC | BPF_TAX: 572 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 573 break; 574 575 /* A = X */ 576 case BPF_MISC | BPF_TXA: 577 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); 578 break; 579 580 /* A = skb->len or X = skb->len */ 581 case BPF_LD | BPF_W | BPF_LEN: 582 case BPF_LDX | BPF_W | BPF_LEN: 583 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 584 BPF_REG_A : BPF_REG_X, BPF_REG_CTX, 585 offsetof(struct sk_buff, len)); 586 break; 587 588 /* Access seccomp_data fields. */ 589 case BPF_LDX | BPF_ABS | BPF_W: 590 /* A = *(u32 *) (ctx + K) */ 591 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); 592 break; 593 594 /* Unknown instruction. */ 595 default: 596 goto err; 597 } 598 599 insn++; 600 if (new_prog) 601 memcpy(new_insn, tmp_insns, 602 sizeof(*insn) * (insn - tmp_insns)); 603 new_insn += insn - tmp_insns; 604 } 605 606 if (!new_prog) { 607 /* Only calculating new length. */ 608 *new_len = new_insn - new_prog; 609 return 0; 610 } 611 612 pass++; 613 if (new_flen != new_insn - new_prog) { 614 new_flen = new_insn - new_prog; 615 if (pass > 2) 616 goto err; 617 goto do_pass; 618 } 619 620 kfree(addrs); 621 BUG_ON(*new_len != new_flen); 622 return 0; 623 err: 624 kfree(addrs); 625 return -EINVAL; 626 } 627 628 /* Security: 629 * 630 * As we dont want to clear mem[] array for each packet going through 631 * __bpf_prog_run(), we check that filter loaded by user never try to read 632 * a cell if not previously written, and we check all branches to be sure 633 * a malicious user doesn't try to abuse us. 
634 */ 635 static int check_load_and_stores(const struct sock_filter *filter, int flen) 636 { 637 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ 638 int pc, ret = 0; 639 640 BUILD_BUG_ON(BPF_MEMWORDS > 16); 641 642 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); 643 if (!masks) 644 return -ENOMEM; 645 646 memset(masks, 0xff, flen * sizeof(*masks)); 647 648 for (pc = 0; pc < flen; pc++) { 649 memvalid &= masks[pc]; 650 651 switch (filter[pc].code) { 652 case BPF_ST: 653 case BPF_STX: 654 memvalid |= (1 << filter[pc].k); 655 break; 656 case BPF_LD | BPF_MEM: 657 case BPF_LDX | BPF_MEM: 658 if (!(memvalid & (1 << filter[pc].k))) { 659 ret = -EINVAL; 660 goto error; 661 } 662 break; 663 case BPF_JMP | BPF_JA: 664 /* A jump must set masks on target */ 665 masks[pc + 1 + filter[pc].k] &= memvalid; 666 memvalid = ~0; 667 break; 668 case BPF_JMP | BPF_JEQ | BPF_K: 669 case BPF_JMP | BPF_JEQ | BPF_X: 670 case BPF_JMP | BPF_JGE | BPF_K: 671 case BPF_JMP | BPF_JGE | BPF_X: 672 case BPF_JMP | BPF_JGT | BPF_K: 673 case BPF_JMP | BPF_JGT | BPF_X: 674 case BPF_JMP | BPF_JSET | BPF_K: 675 case BPF_JMP | BPF_JSET | BPF_X: 676 /* A jump must set masks on targets */ 677 masks[pc + 1 + filter[pc].jt] &= memvalid; 678 masks[pc + 1 + filter[pc].jf] &= memvalid; 679 memvalid = ~0; 680 break; 681 } 682 } 683 error: 684 kfree(masks); 685 return ret; 686 } 687 688 static bool chk_code_allowed(u16 code_to_probe) 689 { 690 static const bool codes[] = { 691 /* 32 bit ALU operations */ 692 [BPF_ALU | BPF_ADD | BPF_K] = true, 693 [BPF_ALU | BPF_ADD | BPF_X] = true, 694 [BPF_ALU | BPF_SUB | BPF_K] = true, 695 [BPF_ALU | BPF_SUB | BPF_X] = true, 696 [BPF_ALU | BPF_MUL | BPF_K] = true, 697 [BPF_ALU | BPF_MUL | BPF_X] = true, 698 [BPF_ALU | BPF_DIV | BPF_K] = true, 699 [BPF_ALU | BPF_DIV | BPF_X] = true, 700 [BPF_ALU | BPF_MOD | BPF_K] = true, 701 [BPF_ALU | BPF_MOD | BPF_X] = true, 702 [BPF_ALU | BPF_AND | BPF_K] = true, 703 [BPF_ALU | BPF_AND | BPF_X] = true, 704 [BPF_ALU | BPF_OR | BPF_K] = true, 705 [BPF_ALU | BPF_OR | BPF_X] = true, 706 [BPF_ALU | BPF_XOR | BPF_K] = true, 707 [BPF_ALU | BPF_XOR | BPF_X] = true, 708 [BPF_ALU | BPF_LSH | BPF_K] = true, 709 [BPF_ALU | BPF_LSH | BPF_X] = true, 710 [BPF_ALU | BPF_RSH | BPF_K] = true, 711 [BPF_ALU | BPF_RSH | BPF_X] = true, 712 [BPF_ALU | BPF_NEG] = true, 713 /* Load instructions */ 714 [BPF_LD | BPF_W | BPF_ABS] = true, 715 [BPF_LD | BPF_H | BPF_ABS] = true, 716 [BPF_LD | BPF_B | BPF_ABS] = true, 717 [BPF_LD | BPF_W | BPF_LEN] = true, 718 [BPF_LD | BPF_W | BPF_IND] = true, 719 [BPF_LD | BPF_H | BPF_IND] = true, 720 [BPF_LD | BPF_B | BPF_IND] = true, 721 [BPF_LD | BPF_IMM] = true, 722 [BPF_LD | BPF_MEM] = true, 723 [BPF_LDX | BPF_W | BPF_LEN] = true, 724 [BPF_LDX | BPF_B | BPF_MSH] = true, 725 [BPF_LDX | BPF_IMM] = true, 726 [BPF_LDX | BPF_MEM] = true, 727 /* Store instructions */ 728 [BPF_ST] = true, 729 [BPF_STX] = true, 730 /* Misc instructions */ 731 [BPF_MISC | BPF_TAX] = true, 732 [BPF_MISC | BPF_TXA] = true, 733 /* Return instructions */ 734 [BPF_RET | BPF_K] = true, 735 [BPF_RET | BPF_A] = true, 736 /* Jump instructions */ 737 [BPF_JMP | BPF_JA] = true, 738 [BPF_JMP | BPF_JEQ | BPF_K] = true, 739 [BPF_JMP | BPF_JEQ | BPF_X] = true, 740 [BPF_JMP | BPF_JGE | BPF_K] = true, 741 [BPF_JMP | BPF_JGE | BPF_X] = true, 742 [BPF_JMP | BPF_JGT | BPF_K] = true, 743 [BPF_JMP | BPF_JGT | BPF_X] = true, 744 [BPF_JMP | BPF_JSET | BPF_K] = true, 745 [BPF_JMP | BPF_JSET | BPF_X] = true, 746 }; 747 748 if (code_to_probe >= ARRAY_SIZE(codes)) 749 return false; 750 
751 return codes[code_to_probe]; 752 } 753 754 static bool bpf_check_basics_ok(const struct sock_filter *filter, 755 unsigned int flen) 756 { 757 if (filter == NULL) 758 return false; 759 if (flen == 0 || flen > BPF_MAXINSNS) 760 return false; 761 762 return true; 763 } 764 765 /** 766 * bpf_check_classic - verify socket filter code 767 * @filter: filter to verify 768 * @flen: length of filter 769 * 770 * Check the user's filter code. If we let some ugly 771 * filter code slip through kaboom! The filter must contain 772 * no references or jumps that are out of range, no illegal 773 * instructions, and must end with a RET instruction. 774 * 775 * All jumps are forward as they are not signed. 776 * 777 * Returns 0 if the rule set is legal or -EINVAL if not. 778 */ 779 static int bpf_check_classic(const struct sock_filter *filter, 780 unsigned int flen) 781 { 782 bool anc_found; 783 int pc; 784 785 /* Check the filter code now */ 786 for (pc = 0; pc < flen; pc++) { 787 const struct sock_filter *ftest = &filter[pc]; 788 789 /* May we actually operate on this code? */ 790 if (!chk_code_allowed(ftest->code)) 791 return -EINVAL; 792 793 /* Some instructions need special checks */ 794 switch (ftest->code) { 795 case BPF_ALU | BPF_DIV | BPF_K: 796 case BPF_ALU | BPF_MOD | BPF_K: 797 /* Check for division by zero */ 798 if (ftest->k == 0) 799 return -EINVAL; 800 break; 801 case BPF_ALU | BPF_LSH | BPF_K: 802 case BPF_ALU | BPF_RSH | BPF_K: 803 if (ftest->k >= 32) 804 return -EINVAL; 805 break; 806 case BPF_LD | BPF_MEM: 807 case BPF_LDX | BPF_MEM: 808 case BPF_ST: 809 case BPF_STX: 810 /* Check for invalid memory addresses */ 811 if (ftest->k >= BPF_MEMWORDS) 812 return -EINVAL; 813 break; 814 case BPF_JMP | BPF_JA: 815 /* Note, the large ftest->k might cause loops. 816 * Compare this with conditional jumps below, 817 * where offsets are limited. 
--ANK (981016) 818 */ 819 if (ftest->k >= (unsigned int)(flen - pc - 1)) 820 return -EINVAL; 821 break; 822 case BPF_JMP | BPF_JEQ | BPF_K: 823 case BPF_JMP | BPF_JEQ | BPF_X: 824 case BPF_JMP | BPF_JGE | BPF_K: 825 case BPF_JMP | BPF_JGE | BPF_X: 826 case BPF_JMP | BPF_JGT | BPF_K: 827 case BPF_JMP | BPF_JGT | BPF_X: 828 case BPF_JMP | BPF_JSET | BPF_K: 829 case BPF_JMP | BPF_JSET | BPF_X: 830 /* Both conditionals must be safe */ 831 if (pc + ftest->jt + 1 >= flen || 832 pc + ftest->jf + 1 >= flen) 833 return -EINVAL; 834 break; 835 case BPF_LD | BPF_W | BPF_ABS: 836 case BPF_LD | BPF_H | BPF_ABS: 837 case BPF_LD | BPF_B | BPF_ABS: 838 anc_found = false; 839 if (bpf_anc_helper(ftest) & BPF_ANC) 840 anc_found = true; 841 /* Ancillary operation unknown or unsupported */ 842 if (anc_found == false && ftest->k >= SKF_AD_OFF) 843 return -EINVAL; 844 } 845 } 846 847 /* Last instruction must be a RET code */ 848 switch (filter[flen - 1].code) { 849 case BPF_RET | BPF_K: 850 case BPF_RET | BPF_A: 851 return check_load_and_stores(filter, flen); 852 } 853 854 return -EINVAL; 855 } 856 857 static int bpf_prog_store_orig_filter(struct bpf_prog *fp, 858 const struct sock_fprog *fprog) 859 { 860 unsigned int fsize = bpf_classic_proglen(fprog); 861 struct sock_fprog_kern *fkprog; 862 863 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); 864 if (!fp->orig_prog) 865 return -ENOMEM; 866 867 fkprog = fp->orig_prog; 868 fkprog->len = fprog->len; 869 870 fkprog->filter = kmemdup(fp->insns, fsize, 871 GFP_KERNEL | __GFP_NOWARN); 872 if (!fkprog->filter) { 873 kfree(fp->orig_prog); 874 return -ENOMEM; 875 } 876 877 return 0; 878 } 879 880 static void bpf_release_orig_filter(struct bpf_prog *fp) 881 { 882 struct sock_fprog_kern *fprog = fp->orig_prog; 883 884 if (fprog) { 885 kfree(fprog->filter); 886 kfree(fprog); 887 } 888 } 889 890 static void __bpf_prog_release(struct bpf_prog *prog) 891 { 892 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { 893 bpf_prog_put(prog); 894 } else { 895 bpf_release_orig_filter(prog); 896 bpf_prog_free(prog); 897 } 898 } 899 900 static void __sk_filter_release(struct sk_filter *fp) 901 { 902 __bpf_prog_release(fp->prog); 903 kfree(fp); 904 } 905 906 /** 907 * sk_filter_release_rcu - Release a socket filter by rcu_head 908 * @rcu: rcu_head that contains the sk_filter to free 909 */ 910 static void sk_filter_release_rcu(struct rcu_head *rcu) 911 { 912 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 913 914 __sk_filter_release(fp); 915 } 916 917 /** 918 * sk_filter_release - release a socket filter 919 * @fp: filter to remove 920 * 921 * Remove a filter from a socket and release its resources. 
922 */ 923 static void sk_filter_release(struct sk_filter *fp) 924 { 925 if (atomic_dec_and_test(&fp->refcnt)) 926 call_rcu(&fp->rcu, sk_filter_release_rcu); 927 } 928 929 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 930 { 931 u32 filter_size = bpf_prog_size(fp->prog->len); 932 933 atomic_sub(filter_size, &sk->sk_omem_alloc); 934 sk_filter_release(fp); 935 } 936 937 /* try to charge the socket memory if there is space available 938 * return true on success 939 */ 940 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) 941 { 942 u32 filter_size = bpf_prog_size(fp->prog->len); 943 944 /* same check as in sock_kmalloc() */ 945 if (filter_size <= sysctl_optmem_max && 946 atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { 947 atomic_inc(&fp->refcnt); 948 atomic_add(filter_size, &sk->sk_omem_alloc); 949 return true; 950 } 951 return false; 952 } 953 954 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) 955 { 956 struct sock_filter *old_prog; 957 struct bpf_prog *old_fp; 958 int err, new_len, old_len = fp->len; 959 960 /* We are free to overwrite insns et al right here as it 961 * won't be used at this point in time anymore internally 962 * after the migration to the internal BPF instruction 963 * representation. 964 */ 965 BUILD_BUG_ON(sizeof(struct sock_filter) != 966 sizeof(struct bpf_insn)); 967 968 /* Conversion cannot happen on overlapping memory areas, 969 * so we need to keep the user BPF around until the 2nd 970 * pass. At this time, the user BPF is stored in fp->insns. 971 */ 972 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), 973 GFP_KERNEL | __GFP_NOWARN); 974 if (!old_prog) { 975 err = -ENOMEM; 976 goto out_err; 977 } 978 979 /* 1st pass: calculate the new program length. */ 980 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); 981 if (err) 982 goto out_err_free; 983 984 /* Expand fp for appending the new filter representation. */ 985 old_fp = fp; 986 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); 987 if (!fp) { 988 /* The old_fp is still around in case we couldn't 989 * allocate new memory, so uncharge on that one. 990 */ 991 fp = old_fp; 992 err = -ENOMEM; 993 goto out_err_free; 994 } 995 996 fp->len = new_len; 997 998 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ 999 err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); 1000 if (err) 1001 /* 2nd bpf_convert_filter() can fail only if it fails 1002 * to allocate memory, remapping must succeed. Note, 1003 * that at this time old_fp has already been released 1004 * by krealloc(). 1005 */ 1006 goto out_err_free; 1007 1008 /* We are guaranteed to never error here with cBPF to eBPF 1009 * transitions, since there's no issue with type compatibility 1010 * checks on program arrays. 1011 */ 1012 fp = bpf_prog_select_runtime(fp, &err); 1013 1014 kfree(old_prog); 1015 return fp; 1016 1017 out_err_free: 1018 kfree(old_prog); 1019 out_err: 1020 __bpf_prog_release(fp); 1021 return ERR_PTR(err); 1022 } 1023 1024 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, 1025 bpf_aux_classic_check_t trans) 1026 { 1027 int err; 1028 1029 fp->bpf_func = NULL; 1030 fp->jited = 0; 1031 1032 err = bpf_check_classic(fp->insns, fp->len); 1033 if (err) { 1034 __bpf_prog_release(fp); 1035 return ERR_PTR(err); 1036 } 1037 1038 /* There might be additional checks and transformations 1039 * needed on classic filters, f.e. in case of seccomp. 
1040 */ 1041 if (trans) { 1042 err = trans(fp->insns, fp->len); 1043 if (err) { 1044 __bpf_prog_release(fp); 1045 return ERR_PTR(err); 1046 } 1047 } 1048 1049 /* Probe if we can JIT compile the filter and if so, do 1050 * the compilation of the filter. 1051 */ 1052 bpf_jit_compile(fp); 1053 1054 /* JIT compiler couldn't process this filter, so do the 1055 * internal BPF translation for the optimized interpreter. 1056 */ 1057 if (!fp->jited) 1058 fp = bpf_migrate_filter(fp); 1059 1060 return fp; 1061 } 1062 1063 /** 1064 * bpf_prog_create - create an unattached filter 1065 * @pfp: the unattached filter that is created 1066 * @fprog: the filter program 1067 * 1068 * Create a filter independent of any socket. We first run some 1069 * sanity checks on it to make sure it does not explode on us later. 1070 * If an error occurs or there is insufficient memory for the filter 1071 * a negative errno code is returned. On success the return is zero. 1072 */ 1073 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) 1074 { 1075 unsigned int fsize = bpf_classic_proglen(fprog); 1076 struct bpf_prog *fp; 1077 1078 /* Make sure new filter is there and in the right amounts. */ 1079 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1080 return -EINVAL; 1081 1082 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1083 if (!fp) 1084 return -ENOMEM; 1085 1086 memcpy(fp->insns, fprog->filter, fsize); 1087 1088 fp->len = fprog->len; 1089 /* Since unattached filters are not copied back to user 1090 * space through sk_get_filter(), we do not need to hold 1091 * a copy here, and can spare us the work. 1092 */ 1093 fp->orig_prog = NULL; 1094 1095 /* bpf_prepare_filter() already takes care of freeing 1096 * memory in case something goes wrong. 1097 */ 1098 fp = bpf_prepare_filter(fp, NULL); 1099 if (IS_ERR(fp)) 1100 return PTR_ERR(fp); 1101 1102 *pfp = fp; 1103 return 0; 1104 } 1105 EXPORT_SYMBOL_GPL(bpf_prog_create); 1106 1107 /** 1108 * bpf_prog_create_from_user - create an unattached filter from user buffer 1109 * @pfp: the unattached filter that is created 1110 * @fprog: the filter program 1111 * @trans: post-classic verifier transformation handler 1112 * @save_orig: save classic BPF program 1113 * 1114 * This function effectively does the same as bpf_prog_create(), only 1115 * that it builds up its insns buffer from user space provided buffer. 1116 * It also allows for passing a bpf_aux_classic_check_t handler. 1117 */ 1118 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, 1119 bpf_aux_classic_check_t trans, bool save_orig) 1120 { 1121 unsigned int fsize = bpf_classic_proglen(fprog); 1122 struct bpf_prog *fp; 1123 int err; 1124 1125 /* Make sure new filter is there and in the right amounts. */ 1126 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1127 return -EINVAL; 1128 1129 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1130 if (!fp) 1131 return -ENOMEM; 1132 1133 if (copy_from_user(fp->insns, fprog->filter, fsize)) { 1134 __bpf_prog_free(fp); 1135 return -EFAULT; 1136 } 1137 1138 fp->len = fprog->len; 1139 fp->orig_prog = NULL; 1140 1141 if (save_orig) { 1142 err = bpf_prog_store_orig_filter(fp, fprog); 1143 if (err) { 1144 __bpf_prog_free(fp); 1145 return -ENOMEM; 1146 } 1147 } 1148 1149 /* bpf_prepare_filter() already takes care of freeing 1150 * memory in case something goes wrong. 
1151 */ 1152 fp = bpf_prepare_filter(fp, trans); 1153 if (IS_ERR(fp)) 1154 return PTR_ERR(fp); 1155 1156 *pfp = fp; 1157 return 0; 1158 } 1159 EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); 1160 1161 void bpf_prog_destroy(struct bpf_prog *fp) 1162 { 1163 __bpf_prog_release(fp); 1164 } 1165 EXPORT_SYMBOL_GPL(bpf_prog_destroy); 1166 1167 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) 1168 { 1169 struct sk_filter *fp, *old_fp; 1170 1171 fp = kmalloc(sizeof(*fp), GFP_KERNEL); 1172 if (!fp) 1173 return -ENOMEM; 1174 1175 fp->prog = prog; 1176 atomic_set(&fp->refcnt, 0); 1177 1178 if (!sk_filter_charge(sk, fp)) { 1179 kfree(fp); 1180 return -ENOMEM; 1181 } 1182 1183 old_fp = rcu_dereference_protected(sk->sk_filter, 1184 lockdep_sock_is_held(sk)); 1185 rcu_assign_pointer(sk->sk_filter, fp); 1186 1187 if (old_fp) 1188 sk_filter_uncharge(sk, old_fp); 1189 1190 return 0; 1191 } 1192 1193 static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) 1194 { 1195 struct bpf_prog *old_prog; 1196 int err; 1197 1198 if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1199 return -ENOMEM; 1200 1201 if (sk_unhashed(sk) && sk->sk_reuseport) { 1202 err = reuseport_alloc(sk); 1203 if (err) 1204 return err; 1205 } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 1206 /* The socket wasn't bound with SO_REUSEPORT */ 1207 return -EINVAL; 1208 } 1209 1210 old_prog = reuseport_attach_prog(sk, prog); 1211 if (old_prog) 1212 bpf_prog_destroy(old_prog); 1213 1214 return 0; 1215 } 1216 1217 static 1218 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) 1219 { 1220 unsigned int fsize = bpf_classic_proglen(fprog); 1221 struct bpf_prog *prog; 1222 int err; 1223 1224 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1225 return ERR_PTR(-EPERM); 1226 1227 /* Make sure new filter is there and in the right amounts. */ 1228 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1229 return ERR_PTR(-EINVAL); 1230 1231 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1232 if (!prog) 1233 return ERR_PTR(-ENOMEM); 1234 1235 if (copy_from_user(prog->insns, fprog->filter, fsize)) { 1236 __bpf_prog_free(prog); 1237 return ERR_PTR(-EFAULT); 1238 } 1239 1240 prog->len = fprog->len; 1241 1242 err = bpf_prog_store_orig_filter(prog, fprog); 1243 if (err) { 1244 __bpf_prog_free(prog); 1245 return ERR_PTR(-ENOMEM); 1246 } 1247 1248 /* bpf_prepare_filter() already takes care of freeing 1249 * memory in case something goes wrong. 1250 */ 1251 return bpf_prepare_filter(prog, NULL); 1252 } 1253 1254 /** 1255 * sk_attach_filter - attach a socket filter 1256 * @fprog: the filter program 1257 * @sk: the socket to use 1258 * 1259 * Attach the user's filter code. We first run some sanity checks on 1260 * it to make sure it does not explode on us later. If an error 1261 * occurs or there is insufficient memory for the filter a negative 1262 * errno code is returned. On success the return is zero. 
1263 */ 1264 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1265 { 1266 struct bpf_prog *prog = __get_filter(fprog, sk); 1267 int err; 1268 1269 if (IS_ERR(prog)) 1270 return PTR_ERR(prog); 1271 1272 err = __sk_attach_prog(prog, sk); 1273 if (err < 0) { 1274 __bpf_prog_release(prog); 1275 return err; 1276 } 1277 1278 return 0; 1279 } 1280 EXPORT_SYMBOL_GPL(sk_attach_filter); 1281 1282 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1283 { 1284 struct bpf_prog *prog = __get_filter(fprog, sk); 1285 int err; 1286 1287 if (IS_ERR(prog)) 1288 return PTR_ERR(prog); 1289 1290 err = __reuseport_attach_prog(prog, sk); 1291 if (err < 0) { 1292 __bpf_prog_release(prog); 1293 return err; 1294 } 1295 1296 return 0; 1297 } 1298 1299 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) 1300 { 1301 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1302 return ERR_PTR(-EPERM); 1303 1304 return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); 1305 } 1306 1307 int sk_attach_bpf(u32 ufd, struct sock *sk) 1308 { 1309 struct bpf_prog *prog = __get_bpf(ufd, sk); 1310 int err; 1311 1312 if (IS_ERR(prog)) 1313 return PTR_ERR(prog); 1314 1315 err = __sk_attach_prog(prog, sk); 1316 if (err < 0) { 1317 bpf_prog_put(prog); 1318 return err; 1319 } 1320 1321 return 0; 1322 } 1323 1324 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) 1325 { 1326 struct bpf_prog *prog = __get_bpf(ufd, sk); 1327 int err; 1328 1329 if (IS_ERR(prog)) 1330 return PTR_ERR(prog); 1331 1332 err = __reuseport_attach_prog(prog, sk); 1333 if (err < 0) { 1334 bpf_prog_put(prog); 1335 return err; 1336 } 1337 1338 return 0; 1339 } 1340 1341 struct bpf_scratchpad { 1342 union { 1343 __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; 1344 u8 buff[MAX_BPF_STACK]; 1345 }; 1346 }; 1347 1348 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); 1349 1350 static inline int __bpf_try_make_writable(struct sk_buff *skb, 1351 unsigned int write_len) 1352 { 1353 return skb_ensure_writable(skb, write_len); 1354 } 1355 1356 static inline int bpf_try_make_writable(struct sk_buff *skb, 1357 unsigned int write_len) 1358 { 1359 int err = __bpf_try_make_writable(skb, write_len); 1360 1361 bpf_compute_data_end(skb); 1362 return err; 1363 } 1364 1365 static int bpf_try_make_head_writable(struct sk_buff *skb) 1366 { 1367 return bpf_try_make_writable(skb, skb_headlen(skb)); 1368 } 1369 1370 static inline void bpf_push_mac_rcsum(struct sk_buff *skb) 1371 { 1372 if (skb_at_tc_ingress(skb)) 1373 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1374 } 1375 1376 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) 1377 { 1378 if (skb_at_tc_ingress(skb)) 1379 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1380 } 1381 1382 BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, 1383 const void *, from, u32, len, u64, flags) 1384 { 1385 void *ptr; 1386 1387 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) 1388 return -EINVAL; 1389 if (unlikely(offset > 0xffff)) 1390 return -EFAULT; 1391 if (unlikely(bpf_try_make_writable(skb, offset + len))) 1392 return -EFAULT; 1393 1394 ptr = skb->data + offset; 1395 if (flags & BPF_F_RECOMPUTE_CSUM) 1396 __skb_postpull_rcsum(skb, ptr, len, offset); 1397 1398 memcpy(ptr, from, len); 1399 1400 if (flags & BPF_F_RECOMPUTE_CSUM) 1401 __skb_postpush_rcsum(skb, ptr, len, offset); 1402 if (flags & BPF_F_INVALIDATE_HASH) 1403 skb_clear_hash(skb); 1404 1405 return 0; 1406 } 1407 1408 static const struct bpf_func_proto bpf_skb_store_bytes_proto = { 
1409 .func = bpf_skb_store_bytes, 1410 .gpl_only = false, 1411 .ret_type = RET_INTEGER, 1412 .arg1_type = ARG_PTR_TO_CTX, 1413 .arg2_type = ARG_ANYTHING, 1414 .arg3_type = ARG_PTR_TO_STACK, 1415 .arg4_type = ARG_CONST_STACK_SIZE, 1416 .arg5_type = ARG_ANYTHING, 1417 }; 1418 1419 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, 1420 void *, to, u32, len) 1421 { 1422 void *ptr; 1423 1424 if (unlikely(offset > 0xffff)) 1425 goto err_clear; 1426 1427 ptr = skb_header_pointer(skb, offset, len, to); 1428 if (unlikely(!ptr)) 1429 goto err_clear; 1430 if (ptr != to) 1431 memcpy(to, ptr, len); 1432 1433 return 0; 1434 err_clear: 1435 memset(to, 0, len); 1436 return -EFAULT; 1437 } 1438 1439 static const struct bpf_func_proto bpf_skb_load_bytes_proto = { 1440 .func = bpf_skb_load_bytes, 1441 .gpl_only = false, 1442 .ret_type = RET_INTEGER, 1443 .arg1_type = ARG_PTR_TO_CTX, 1444 .arg2_type = ARG_ANYTHING, 1445 .arg3_type = ARG_PTR_TO_RAW_STACK, 1446 .arg4_type = ARG_CONST_STACK_SIZE, 1447 }; 1448 1449 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1450 { 1451 /* Idea is the following: should the needed direct read/write 1452 * test fail during runtime, we can pull in more data and redo 1453 * again, since implicitly, we invalidate previous checks here. 1454 * 1455 * Or, since we know how much we need to make read/writeable, 1456 * this can be done once at the program beginning for direct 1457 * access case. By this we overcome limitations of only current 1458 * headroom being accessible. 1459 */ 1460 return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); 1461 } 1462 1463 static const struct bpf_func_proto bpf_skb_pull_data_proto = { 1464 .func = bpf_skb_pull_data, 1465 .gpl_only = false, 1466 .ret_type = RET_INTEGER, 1467 .arg1_type = ARG_PTR_TO_CTX, 1468 .arg2_type = ARG_ANYTHING, 1469 }; 1470 1471 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, 1472 u64, from, u64, to, u64, flags) 1473 { 1474 __sum16 *ptr; 1475 1476 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) 1477 return -EINVAL; 1478 if (unlikely(offset > 0xffff || offset & 1)) 1479 return -EFAULT; 1480 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1481 return -EFAULT; 1482 1483 ptr = (__sum16 *)(skb->data + offset); 1484 switch (flags & BPF_F_HDR_FIELD_MASK) { 1485 case 0: 1486 if (unlikely(from != 0)) 1487 return -EINVAL; 1488 1489 csum_replace_by_diff(ptr, to); 1490 break; 1491 case 2: 1492 csum_replace2(ptr, from, to); 1493 break; 1494 case 4: 1495 csum_replace4(ptr, from, to); 1496 break; 1497 default: 1498 return -EINVAL; 1499 } 1500 1501 return 0; 1502 } 1503 1504 static const struct bpf_func_proto bpf_l3_csum_replace_proto = { 1505 .func = bpf_l3_csum_replace, 1506 .gpl_only = false, 1507 .ret_type = RET_INTEGER, 1508 .arg1_type = ARG_PTR_TO_CTX, 1509 .arg2_type = ARG_ANYTHING, 1510 .arg3_type = ARG_ANYTHING, 1511 .arg4_type = ARG_ANYTHING, 1512 .arg5_type = ARG_ANYTHING, 1513 }; 1514 1515 BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, 1516 u64, from, u64, to, u64, flags) 1517 { 1518 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1519 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1520 __sum16 *ptr; 1521 1522 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR | 1523 BPF_F_HDR_FIELD_MASK))) 1524 return -EINVAL; 1525 if (unlikely(offset > 0xffff || offset & 1)) 1526 return -EFAULT; 1527 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1528 return -EFAULT; 1529 1530 ptr = (__sum16 *)(skb->data + offset); 1531 if 
(is_mmzero && !*ptr) 1532 return 0; 1533 1534 switch (flags & BPF_F_HDR_FIELD_MASK) { 1535 case 0: 1536 if (unlikely(from != 0)) 1537 return -EINVAL; 1538 1539 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); 1540 break; 1541 case 2: 1542 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); 1543 break; 1544 case 4: 1545 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); 1546 break; 1547 default: 1548 return -EINVAL; 1549 } 1550 1551 if (is_mmzero && !*ptr) 1552 *ptr = CSUM_MANGLED_0; 1553 return 0; 1554 } 1555 1556 static const struct bpf_func_proto bpf_l4_csum_replace_proto = { 1557 .func = bpf_l4_csum_replace, 1558 .gpl_only = false, 1559 .ret_type = RET_INTEGER, 1560 .arg1_type = ARG_PTR_TO_CTX, 1561 .arg2_type = ARG_ANYTHING, 1562 .arg3_type = ARG_ANYTHING, 1563 .arg4_type = ARG_ANYTHING, 1564 .arg5_type = ARG_ANYTHING, 1565 }; 1566 1567 BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, 1568 __be32 *, to, u32, to_size, __wsum, seed) 1569 { 1570 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); 1571 u32 diff_size = from_size + to_size; 1572 int i, j = 0; 1573 1574 /* This is quite flexible, some examples: 1575 * 1576 * from_size == 0, to_size > 0, seed := csum --> pushing data 1577 * from_size > 0, to_size == 0, seed := csum --> pulling data 1578 * from_size > 0, to_size > 0, seed := 0 --> diffing data 1579 * 1580 * Even for diffing, from_size and to_size don't need to be equal. 1581 */ 1582 if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || 1583 diff_size > sizeof(sp->diff))) 1584 return -EINVAL; 1585 1586 for (i = 0; i < from_size / sizeof(__be32); i++, j++) 1587 sp->diff[j] = ~from[i]; 1588 for (i = 0; i < to_size / sizeof(__be32); i++, j++) 1589 sp->diff[j] = to[i]; 1590 1591 return csum_partial(sp->diff, diff_size, seed); 1592 } 1593 1594 static const struct bpf_func_proto bpf_csum_diff_proto = { 1595 .func = bpf_csum_diff, 1596 .gpl_only = false, 1597 .pkt_access = true, 1598 .ret_type = RET_INTEGER, 1599 .arg1_type = ARG_PTR_TO_STACK, 1600 .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1601 .arg3_type = ARG_PTR_TO_STACK, 1602 .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO, 1603 .arg5_type = ARG_ANYTHING, 1604 }; 1605 1606 BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) 1607 { 1608 /* The interface is to be used in combination with bpf_csum_diff() 1609 * for direct packet writes. csum rotation for alignment as well 1610 * as emulating csum_sub() can be done from the eBPF program. 
1611 */ 1612 if (skb->ip_summed == CHECKSUM_COMPLETE) 1613 return (skb->csum = csum_add(skb->csum, csum)); 1614 1615 return -ENOTSUPP; 1616 } 1617 1618 static const struct bpf_func_proto bpf_csum_update_proto = { 1619 .func = bpf_csum_update, 1620 .gpl_only = false, 1621 .ret_type = RET_INTEGER, 1622 .arg1_type = ARG_PTR_TO_CTX, 1623 .arg2_type = ARG_ANYTHING, 1624 }; 1625 1626 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) 1627 { 1628 return dev_forward_skb(dev, skb); 1629 } 1630 1631 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) 1632 { 1633 int ret; 1634 1635 if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { 1636 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); 1637 kfree_skb(skb); 1638 return -ENETDOWN; 1639 } 1640 1641 skb->dev = dev; 1642 1643 __this_cpu_inc(xmit_recursion); 1644 ret = dev_queue_xmit(skb); 1645 __this_cpu_dec(xmit_recursion); 1646 1647 return ret; 1648 } 1649 1650 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) 1651 { 1652 struct net_device *dev; 1653 struct sk_buff *clone; 1654 int ret; 1655 1656 if (unlikely(flags & ~(BPF_F_INGRESS))) 1657 return -EINVAL; 1658 1659 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); 1660 if (unlikely(!dev)) 1661 return -EINVAL; 1662 1663 clone = skb_clone(skb, GFP_ATOMIC); 1664 if (unlikely(!clone)) 1665 return -ENOMEM; 1666 1667 /* For direct write, we need to keep the invariant that the skbs 1668 * we're dealing with need to be uncloned. Should uncloning fail 1669 * here, we need to free the just generated clone to unclone once 1670 * again. 1671 */ 1672 ret = bpf_try_make_head_writable(skb); 1673 if (unlikely(ret)) { 1674 kfree_skb(clone); 1675 return -ENOMEM; 1676 } 1677 1678 bpf_push_mac_rcsum(clone); 1679 1680 return flags & BPF_F_INGRESS ? 1681 __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); 1682 } 1683 1684 static const struct bpf_func_proto bpf_clone_redirect_proto = { 1685 .func = bpf_clone_redirect, 1686 .gpl_only = false, 1687 .ret_type = RET_INTEGER, 1688 .arg1_type = ARG_PTR_TO_CTX, 1689 .arg2_type = ARG_ANYTHING, 1690 .arg3_type = ARG_ANYTHING, 1691 }; 1692 1693 struct redirect_info { 1694 u32 ifindex; 1695 u32 flags; 1696 }; 1697 1698 static DEFINE_PER_CPU(struct redirect_info, redirect_info); 1699 1700 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) 1701 { 1702 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1703 1704 if (unlikely(flags & ~(BPF_F_INGRESS))) 1705 return TC_ACT_SHOT; 1706 1707 ri->ifindex = ifindex; 1708 ri->flags = flags; 1709 1710 return TC_ACT_REDIRECT; 1711 } 1712 1713 int skb_do_redirect(struct sk_buff *skb) 1714 { 1715 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1716 struct net_device *dev; 1717 1718 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); 1719 ri->ifindex = 0; 1720 if (unlikely(!dev)) { 1721 kfree_skb(skb); 1722 return -EINVAL; 1723 } 1724 1725 bpf_push_mac_rcsum(skb); 1726 1727 return ri->flags & BPF_F_INGRESS ? 
1728 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 1729 } 1730 1731 static const struct bpf_func_proto bpf_redirect_proto = { 1732 .func = bpf_redirect, 1733 .gpl_only = false, 1734 .ret_type = RET_INTEGER, 1735 .arg1_type = ARG_ANYTHING, 1736 .arg2_type = ARG_ANYTHING, 1737 }; 1738 1739 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) 1740 { 1741 return task_get_classid(skb); 1742 } 1743 1744 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { 1745 .func = bpf_get_cgroup_classid, 1746 .gpl_only = false, 1747 .ret_type = RET_INTEGER, 1748 .arg1_type = ARG_PTR_TO_CTX, 1749 }; 1750 1751 BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) 1752 { 1753 return dst_tclassid(skb); 1754 } 1755 1756 static const struct bpf_func_proto bpf_get_route_realm_proto = { 1757 .func = bpf_get_route_realm, 1758 .gpl_only = false, 1759 .ret_type = RET_INTEGER, 1760 .arg1_type = ARG_PTR_TO_CTX, 1761 }; 1762 1763 BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) 1764 { 1765 /* If skb_clear_hash() was called due to mangling, we can 1766 * trigger SW recalculation here. Later access to hash 1767 * can then use the inline skb->hash via context directly 1768 * instead of calling this helper again. 1769 */ 1770 return skb_get_hash(skb); 1771 } 1772 1773 static const struct bpf_func_proto bpf_get_hash_recalc_proto = { 1774 .func = bpf_get_hash_recalc, 1775 .gpl_only = false, 1776 .ret_type = RET_INTEGER, 1777 .arg1_type = ARG_PTR_TO_CTX, 1778 }; 1779 1780 BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) 1781 { 1782 /* After all direct packet write, this can be used once for 1783 * triggering a lazy recalc on next skb_get_hash() invocation. 1784 */ 1785 skb_clear_hash(skb); 1786 return 0; 1787 } 1788 1789 static const struct bpf_func_proto bpf_set_hash_invalid_proto = { 1790 .func = bpf_set_hash_invalid, 1791 .gpl_only = false, 1792 .ret_type = RET_INTEGER, 1793 .arg1_type = ARG_PTR_TO_CTX, 1794 }; 1795 1796 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, 1797 u16, vlan_tci) 1798 { 1799 int ret; 1800 1801 if (unlikely(vlan_proto != htons(ETH_P_8021Q) && 1802 vlan_proto != htons(ETH_P_8021AD))) 1803 vlan_proto = htons(ETH_P_8021Q); 1804 1805 bpf_push_mac_rcsum(skb); 1806 ret = skb_vlan_push(skb, vlan_proto, vlan_tci); 1807 bpf_pull_mac_rcsum(skb); 1808 1809 bpf_compute_data_end(skb); 1810 return ret; 1811 } 1812 1813 const struct bpf_func_proto bpf_skb_vlan_push_proto = { 1814 .func = bpf_skb_vlan_push, 1815 .gpl_only = false, 1816 .ret_type = RET_INTEGER, 1817 .arg1_type = ARG_PTR_TO_CTX, 1818 .arg2_type = ARG_ANYTHING, 1819 .arg3_type = ARG_ANYTHING, 1820 }; 1821 EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); 1822 1823 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) 1824 { 1825 int ret; 1826 1827 bpf_push_mac_rcsum(skb); 1828 ret = skb_vlan_pop(skb); 1829 bpf_pull_mac_rcsum(skb); 1830 1831 bpf_compute_data_end(skb); 1832 return ret; 1833 } 1834 1835 const struct bpf_func_proto bpf_skb_vlan_pop_proto = { 1836 .func = bpf_skb_vlan_pop, 1837 .gpl_only = false, 1838 .ret_type = RET_INTEGER, 1839 .arg1_type = ARG_PTR_TO_CTX, 1840 }; 1841 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); 1842 1843 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 1844 { 1845 /* Caller already did skb_cow() with len as headroom, 1846 * so no need to do it here. 
1847 */ 1848 skb_push(skb, len); 1849 memmove(skb->data, skb->data + len, off); 1850 memset(skb->data + off, 0, len); 1851 1852 /* No skb_postpush_rcsum(skb, skb->data + off, len) 1853 * needed here as it does not change the skb->csum 1854 * result for checksum complete when summing over 1855 * zeroed blocks. 1856 */ 1857 return 0; 1858 } 1859 1860 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) 1861 { 1862 /* skb_ensure_writable() is not needed here, as we're 1863 * already working on an uncloned skb. 1864 */ 1865 if (unlikely(!pskb_may_pull(skb, off + len))) 1866 return -ENOMEM; 1867 1868 skb_postpull_rcsum(skb, skb->data + off, len); 1869 memmove(skb->data + len, skb->data, off); 1870 __skb_pull(skb, len); 1871 1872 return 0; 1873 } 1874 1875 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) 1876 { 1877 bool trans_same = skb->transport_header == skb->network_header; 1878 int ret; 1879 1880 /* There's no need for __skb_push()/__skb_pull() pair to 1881 * get to the start of the mac header as we're guaranteed 1882 * to always start from here under eBPF. 1883 */ 1884 ret = bpf_skb_generic_push(skb, off, len); 1885 if (likely(!ret)) { 1886 skb->mac_header -= len; 1887 skb->network_header -= len; 1888 if (trans_same) 1889 skb->transport_header = skb->network_header; 1890 } 1891 1892 return ret; 1893 } 1894 1895 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) 1896 { 1897 bool trans_same = skb->transport_header == skb->network_header; 1898 int ret; 1899 1900 /* Same here, __skb_push()/__skb_pull() pair not needed. */ 1901 ret = bpf_skb_generic_pop(skb, off, len); 1902 if (likely(!ret)) { 1903 skb->mac_header += len; 1904 skb->network_header += len; 1905 if (trans_same) 1906 skb->transport_header = skb->network_header; 1907 } 1908 1909 return ret; 1910 } 1911 1912 static int bpf_skb_proto_4_to_6(struct sk_buff *skb) 1913 { 1914 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 1915 u32 off = skb->network_header - skb->mac_header; 1916 int ret; 1917 1918 ret = skb_cow(skb, len_diff); 1919 if (unlikely(ret < 0)) 1920 return ret; 1921 1922 ret = bpf_skb_net_hdr_push(skb, off, len_diff); 1923 if (unlikely(ret < 0)) 1924 return ret; 1925 1926 if (skb_is_gso(skb)) { 1927 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to 1928 * be changed into SKB_GSO_TCPV6. 1929 */ 1930 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 1931 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; 1932 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; 1933 } 1934 1935 /* Due to IPv6 header, MSS needs to be downgraded. */ 1936 skb_shinfo(skb)->gso_size -= len_diff; 1937 /* Header must be checked, and gso_segs recomputed. */ 1938 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 1939 skb_shinfo(skb)->gso_segs = 0; 1940 } 1941 1942 skb->protocol = htons(ETH_P_IPV6); 1943 skb_clear_hash(skb); 1944 1945 return 0; 1946 } 1947 1948 static int bpf_skb_proto_6_to_4(struct sk_buff *skb) 1949 { 1950 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 1951 u32 off = skb->network_header - skb->mac_header; 1952 int ret; 1953 1954 ret = skb_unclone(skb, GFP_ATOMIC); 1955 if (unlikely(ret < 0)) 1956 return ret; 1957 1958 ret = bpf_skb_net_hdr_pop(skb, off, len_diff); 1959 if (unlikely(ret < 0)) 1960 return ret; 1961 1962 if (skb_is_gso(skb)) { 1963 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to 1964 * be changed into SKB_GSO_TCPV4. 
1965 */ 1966 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { 1967 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; 1968 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; 1969 } 1970 1971 /* Due to IPv4 header, MSS can be upgraded. */ 1972 skb_shinfo(skb)->gso_size += len_diff; 1973 /* Header must be checked, and gso_segs recomputed. */ 1974 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 1975 skb_shinfo(skb)->gso_segs = 0; 1976 } 1977 1978 skb->protocol = htons(ETH_P_IP); 1979 skb_clear_hash(skb); 1980 1981 return 0; 1982 } 1983 1984 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) 1985 { 1986 __be16 from_proto = skb->protocol; 1987 1988 if (from_proto == htons(ETH_P_IP) && 1989 to_proto == htons(ETH_P_IPV6)) 1990 return bpf_skb_proto_4_to_6(skb); 1991 1992 if (from_proto == htons(ETH_P_IPV6) && 1993 to_proto == htons(ETH_P_IP)) 1994 return bpf_skb_proto_6_to_4(skb); 1995 1996 return -ENOTSUPP; 1997 } 1998 1999 BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, 2000 u64, flags) 2001 { 2002 int ret; 2003 2004 if (unlikely(flags)) 2005 return -EINVAL; 2006 2007 /* General idea is that this helper does the basic groundwork 2008 * needed for changing the protocol, and eBPF program fills the 2009 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() 2010 * and other helpers, rather than passing a raw buffer here. 2011 * 2012 * The rationale is to keep this minimal and without a need to 2013 * deal with raw packet data. F.e. even if we would pass buffers 2014 * here, the program still needs to call the bpf_lX_csum_replace() 2015 * helpers anyway. Plus, this way we keep also separation of 2016 * concerns, since f.e. bpf_skb_store_bytes() should only take 2017 * care of stores. 2018 * 2019 * Currently, additional options and extension header space are 2020 * not supported, but flags register is reserved so we can adapt 2021 * that. For offloads, we mark packet as dodgy, so that headers 2022 * need to be verified first. 2023 */ 2024 ret = bpf_skb_proto_xlat(skb, proto); 2025 bpf_compute_data_end(skb); 2026 return ret; 2027 } 2028 2029 static const struct bpf_func_proto bpf_skb_change_proto_proto = { 2030 .func = bpf_skb_change_proto, 2031 .gpl_only = false, 2032 .ret_type = RET_INTEGER, 2033 .arg1_type = ARG_PTR_TO_CTX, 2034 .arg2_type = ARG_ANYTHING, 2035 .arg3_type = ARG_ANYTHING, 2036 }; 2037 2038 BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) 2039 { 2040 /* We only allow a restricted subset to be changed for now. 

BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
	/* We only allow a restricted subset to be changed for now. */
	if (unlikely(!skb_pkt_type_ok(skb->pkt_type) ||
		     !skb_pkt_type_ok(pkt_type)))
		return -EINVAL;

	skb->pkt_type = pkt_type;
	return 0;
}

static const struct bpf_func_proto bpf_skb_change_type_proto = {
	.func = bpf_skb_change_type,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_ANYTHING,
};

static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
	u32 min_len = skb_network_offset(skb);

	if (skb_transport_header_was_set(skb))
		min_len = skb_transport_offset(skb);
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		min_len = skb_checksum_start_offset(skb) +
			  skb->csum_offset + sizeof(__sum16);
	return min_len;
}

static u32 __bpf_skb_max_len(const struct sk_buff *skb)
{
	return skb->dev->mtu + skb->dev->hard_header_len;
}

static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
{
	unsigned int old_len = skb->len;
	int ret;

	ret = __skb_grow_rcsum(skb, new_len);
	if (!ret)
		memset(skb->data + old_len, 0, new_len - old_len);
	return ret;
}

static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
{
	return __skb_trim_rcsum(skb, new_len);
}

BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
	   u64, flags)
{
	u32 max_len = __bpf_skb_max_len(skb);
	u32 min_len = __bpf_skb_min_len(skb);
	int ret;

	if (unlikely(flags || new_len > max_len || new_len < min_len))
		return -EINVAL;
	if (skb->encapsulation)
		return -ENOTSUPP;

	/* The basic idea of this helper is that it's performing the
	 * needed work to either grow or trim an skb, and eBPF program
	 * rewrites the rest via helpers like bpf_skb_store_bytes(),
	 * bpf_lX_csum_replace() and others rather than passing a raw
	 * buffer here. This one is a slow path helper and intended
	 * for replies with control messages.
	 *
	 * Like in bpf_skb_change_proto(), we want to keep this rather
	 * minimal and without protocol specifics so that we are able
	 * to separate concerns: f.e. bpf_skb_store_bytes() should be
	 * the only helper responsible for writing buffers.
	 *
	 * It's really expected to be a slow path operation here for
	 * control message replies, so we're implicitly linearizing,
	 * uncloning and dropping offloads from the skb by this.
	 */
	ret = __bpf_try_make_writable(skb, skb->len);
	if (!ret) {
		if (new_len > skb->len)
			ret = bpf_skb_grow_rcsum(skb, new_len);
		else if (new_len < skb->len)
			ret = bpf_skb_trim_rcsum(skb, new_len);
		if (!ret && skb_is_gso(skb))
			skb_gso_reset(skb);
	}

	bpf_compute_data_end(skb);
	return ret;
}

static const struct bpf_func_proto bpf_skb_change_tail_proto = {
	.func = bpf_skb_change_tail,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
};
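
/* Example (sketch, not part of the kernel build): growing/trimming a reply
 * from a tc/cls_act program, with the program expected to rewrite the
 * payload afterwards. Same toolchain assumptions as the sketch above; the
 * target length of eth + IPv4 + ICMP headers is purely illustrative.
 *
 *	SEC("classifier")
 *	int trim_reply(struct __sk_buff *skb)
 *	{
 *		__u32 new_len = sizeof(struct ethhdr) +
 *				sizeof(struct iphdr) + sizeof(struct icmphdr);
 *
 *		// Grows (zero-padded) or trims to new_len; fails with
 *		// -EINVAL outside [__bpf_skb_min_len, __bpf_skb_max_len].
 *		if (bpf_skb_change_tail(skb, new_len, 0) < 0)
 *			return TC_ACT_SHOT;
 *		// ... rewrite headers/payload via bpf_skb_store_bytes(),
 *		// then fix checksums via bpf_lX_csum_replace() ...
 *		return TC_ACT_OK;
 *	}
 */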

bool bpf_helper_changes_skb_data(void *func)
{
	if (func == bpf_skb_vlan_push ||
	    func == bpf_skb_vlan_pop ||
	    func == bpf_skb_store_bytes ||
	    func == bpf_skb_change_proto ||
	    func == bpf_skb_change_tail ||
	    func == bpf_skb_pull_data ||
	    func == bpf_l3_csum_replace ||
	    func == bpf_l4_csum_replace)
		return true;

	return false;
}

static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
				  unsigned long off, unsigned long len)
{
	void *ptr = skb_header_pointer(skb, off, len, dst_buff);

	if (unlikely(!ptr))
		return len;
	if (ptr != dst_buff)
		memcpy(dst_buff, ptr, len);

	return 0;
}

BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
	   u64, flags, void *, meta, u64, meta_size)
{
	u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32;

	if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (unlikely(skb_size > skb->len))
		return -EFAULT;

	return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
				bpf_skb_copy);
}

static const struct bpf_func_proto bpf_skb_event_output_proto = {
	.func = bpf_skb_event_output,
	.gpl_only = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_STACK,
	.arg5_type = ARG_CONST_STACK_SIZE,
};

static unsigned short bpf_tunnel_key_af(u64 flags)
{
	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}

BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
	   u32, size, u64, flags)
{
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	void *to_orig = to;
	int err;

	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
		err = -EINVAL;
		goto err_clear;
	}
	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
		err = -EPROTO;
		goto err_clear;
	}
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		err = -EINVAL;
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
			goto set_compat;
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			if (ip_tunnel_info_af(info) != AF_INET)
				goto err_clear;
set_compat:
			to = (struct bpf_tunnel_key *)compat;
			break;
		default:
			goto err_clear;
		}
	}

	to->tunnel_id = be64_to_cpu(info->key.tun_id);
	to->tunnel_tos = info->key.tos;
	to->tunnel_ttl = info->key.ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
		       sizeof(to->remote_ipv6));
		to->tunnel_label = be32_to_cpu(info->key.label);
	} else {
		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
	}

	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
		memcpy(to_orig, to, size);

	return 0;
err_clear:
	memset(to_orig, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
	.func = bpf_skb_get_tunnel_key,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_PTR_TO_RAW_STACK,
	.arg3_type = ARG_CONST_STACK_SIZE,
	.arg4_type = ARG_ANYTHING,
};

BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	int err;

	if (unlikely(!info ||
		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
		err = -ENOENT;
		goto err_clear;
	}
	if (unlikely(size < info->options_len)) {
		err = -ENOMEM;
		goto err_clear;
	}

	ip_tunnel_info_opts_get(to, info);
	if (size > info->options_len)
		memset(to + info->options_len, 0, size - info->options_len);

	return info->options_len;
err_clear:
	memset(to, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
	.func = bpf_skb_get_tunnel_opt,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_PTR_TO_RAW_STACK,
	.arg3_type = ARG_CONST_STACK_SIZE,
};
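
/* Example (sketch, not part of the kernel build): reading the outer tunnel
 * metadata on ingress of a collect_md ("external") tunnel device, e.g. a
 * VXLAN netdevice created in metadata mode. Assumes the same BPF toolchain
 * as the sketches above; the VNI value checked is illustrative. Passing a
 * size smaller than sizeof(struct bpf_tunnel_key) hits the compat layouts
 * handled above.
 *
 *	SEC("classifier")
 *	int tun_in(struct __sk_buff *skb)
 *	{
 *		struct bpf_tunnel_key key = {};
 *
 *		if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0) < 0)
 *			return TC_ACT_SHOT;
 *		// key.tunnel_id / key.remote_ipv4 now describe the outer
 *		// encapsulation; e.g. only accept VNI 42:
 *		if (key.tunnel_id != 42)
 *			return TC_ACT_SHOT;
 *		return TC_ACT_OK;
 *	}
 */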

static struct metadata_dst __percpu *md_dst;

BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
	   const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
	struct metadata_dst *md = this_cpu_ptr(md_dst);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	struct ip_tunnel_info *info;

	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
			       BPF_F_DONT_FRAGMENT)))
		return -EINVAL;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			memcpy(compat, from, size);
			memset(compat + size, 0, sizeof(compat) - size);
			from = (const struct bpf_tunnel_key *) compat;
			break;
		default:
			return -EINVAL;
		}
	}
	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
		     from->tunnel_ext))
		return -EINVAL;

	skb_dst_drop(skb);
	dst_hold((struct dst_entry *) md);
	skb_dst_set(skb, (struct dst_entry *) md);

	info = &md->u.tun_info;
	info->mode = IP_TUNNEL_INFO_TX;

	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
	if (flags & BPF_F_DONT_FRAGMENT)
		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;

	info->key.tun_id = cpu_to_be64(from->tunnel_id);
	info->key.tos = from->tunnel_tos;
	info->key.ttl = from->tunnel_ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		info->mode |= IP_TUNNEL_INFO_IPV6;
		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
		       sizeof(from->remote_ipv6));
		info->key.label = cpu_to_be32(from->tunnel_label) &
				  IPV6_FLOWLABEL_MASK;
	} else {
		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
		if (flags & BPF_F_ZERO_CSUM_TX)
			info->key.tun_flags &= ~TUNNEL_CSUM;
	}

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
	.func = bpf_skb_set_tunnel_key,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_PTR_TO_STACK,
	.arg3_type = ARG_CONST_STACK_SIZE,
	.arg4_type = ARG_ANYTHING,
};

BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
	   const u8 *, from, u32, size)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct metadata_dst *md = this_cpu_ptr(md_dst);

	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
		return -EINVAL;
	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
		return -ENOMEM;

	ip_tunnel_info_opts_set(info, from, size);

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
	.func = bpf_skb_set_tunnel_opt,
	.gpl_only = false,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_PTR_TO_STACK,
	.arg3_type = ARG_CONST_STACK_SIZE,
};
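
/* Example (sketch, not part of the kernel build): the egress counterpart,
 * selecting the outer encapsulation for a collect_md tunnel device from a
 * tc program. Assumes the same BPF toolchain as the sketches above; the
 * VNI, remote address and TTL values are illustrative. Note remote_ipv4 is
 * passed in host byte order, matching the cpu_to_be32() conversion above.
 *
 *	SEC("classifier")
 *	int tun_out(struct __sk_buff *skb)
 *	{
 *		struct bpf_tunnel_key key = {};
 *
 *		key.tunnel_id = 42;
 *		key.remote_ipv4 = 0xac100164;	// 172.16.1.100
 *		key.tunnel_ttl = 64;
 *		if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
 *					   BPF_F_ZERO_CSUM_TX) < 0)
 *			return TC_ACT_SHOT;
 *		return TC_ACT_OK;
 *	}
 */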
2403 */ 2404 md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, 2405 GFP_KERNEL); 2406 if (!md_dst) 2407 return NULL; 2408 } 2409 2410 switch (which) { 2411 case BPF_FUNC_skb_set_tunnel_key: 2412 return &bpf_skb_set_tunnel_key_proto; 2413 case BPF_FUNC_skb_set_tunnel_opt: 2414 return &bpf_skb_set_tunnel_opt_proto; 2415 default: 2416 return NULL; 2417 } 2418 } 2419 2420 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, 2421 u32, idx) 2422 { 2423 struct bpf_array *array = container_of(map, struct bpf_array, map); 2424 struct cgroup *cgrp; 2425 struct sock *sk; 2426 2427 sk = skb_to_full_sk(skb); 2428 if (!sk || !sk_fullsock(sk)) 2429 return -ENOENT; 2430 if (unlikely(idx >= array->map.max_entries)) 2431 return -E2BIG; 2432 2433 cgrp = READ_ONCE(array->ptrs[idx]); 2434 if (unlikely(!cgrp)) 2435 return -EAGAIN; 2436 2437 return sk_under_cgroup_hierarchy(sk, cgrp); 2438 } 2439 2440 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { 2441 .func = bpf_skb_under_cgroup, 2442 .gpl_only = false, 2443 .ret_type = RET_INTEGER, 2444 .arg1_type = ARG_PTR_TO_CTX, 2445 .arg2_type = ARG_CONST_MAP_PTR, 2446 .arg3_type = ARG_ANYTHING, 2447 }; 2448 2449 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, 2450 unsigned long off, unsigned long len) 2451 { 2452 memcpy(dst_buff, src_buff + off, len); 2453 return 0; 2454 } 2455 2456 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, 2457 u64, flags, void *, meta, u64, meta_size) 2458 { 2459 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 2460 2461 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 2462 return -EINVAL; 2463 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) 2464 return -EFAULT; 2465 2466 return bpf_event_output(map, flags, meta, meta_size, xdp, xdp_size, 2467 bpf_xdp_copy); 2468 } 2469 2470 static const struct bpf_func_proto bpf_xdp_event_output_proto = { 2471 .func = bpf_xdp_event_output, 2472 .gpl_only = true, 2473 .ret_type = RET_INTEGER, 2474 .arg1_type = ARG_PTR_TO_CTX, 2475 .arg2_type = ARG_CONST_MAP_PTR, 2476 .arg3_type = ARG_ANYTHING, 2477 .arg4_type = ARG_PTR_TO_STACK, 2478 .arg5_type = ARG_CONST_STACK_SIZE, 2479 }; 2480 2481 static const struct bpf_func_proto * 2482 sk_filter_func_proto(enum bpf_func_id func_id) 2483 { 2484 switch (func_id) { 2485 case BPF_FUNC_map_lookup_elem: 2486 return &bpf_map_lookup_elem_proto; 2487 case BPF_FUNC_map_update_elem: 2488 return &bpf_map_update_elem_proto; 2489 case BPF_FUNC_map_delete_elem: 2490 return &bpf_map_delete_elem_proto; 2491 case BPF_FUNC_get_prandom_u32: 2492 return &bpf_get_prandom_u32_proto; 2493 case BPF_FUNC_get_smp_processor_id: 2494 return &bpf_get_raw_smp_processor_id_proto; 2495 case BPF_FUNC_tail_call: 2496 return &bpf_tail_call_proto; 2497 case BPF_FUNC_ktime_get_ns: 2498 return &bpf_ktime_get_ns_proto; 2499 case BPF_FUNC_trace_printk: 2500 if (capable(CAP_SYS_ADMIN)) 2501 return bpf_get_trace_printk_proto(); 2502 default: 2503 return NULL; 2504 } 2505 } 2506 2507 static const struct bpf_func_proto * 2508 tc_cls_act_func_proto(enum bpf_func_id func_id) 2509 { 2510 switch (func_id) { 2511 case BPF_FUNC_skb_store_bytes: 2512 return &bpf_skb_store_bytes_proto; 2513 case BPF_FUNC_skb_load_bytes: 2514 return &bpf_skb_load_bytes_proto; 2515 case BPF_FUNC_skb_pull_data: 2516 return &bpf_skb_pull_data_proto; 2517 case BPF_FUNC_csum_diff: 2518 return &bpf_csum_diff_proto; 2519 case BPF_FUNC_csum_update: 2520 return &bpf_csum_update_proto; 2521 case 

static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_raw_smp_processor_id_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_store_bytes:
		return &bpf_skb_store_bytes_proto;
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	case BPF_FUNC_skb_pull_data:
		return &bpf_skb_pull_data_proto;
	case BPF_FUNC_csum_diff:
		return &bpf_csum_diff_proto;
	case BPF_FUNC_csum_update:
		return &bpf_csum_update_proto;
	case BPF_FUNC_l3_csum_replace:
		return &bpf_l3_csum_replace_proto;
	case BPF_FUNC_l4_csum_replace:
		return &bpf_l4_csum_replace_proto;
	case BPF_FUNC_clone_redirect:
		return &bpf_clone_redirect_proto;
	case BPF_FUNC_get_cgroup_classid:
		return &bpf_get_cgroup_classid_proto;
	case BPF_FUNC_skb_vlan_push:
		return &bpf_skb_vlan_push_proto;
	case BPF_FUNC_skb_vlan_pop:
		return &bpf_skb_vlan_pop_proto;
	case BPF_FUNC_skb_change_proto:
		return &bpf_skb_change_proto_proto;
	case BPF_FUNC_skb_change_type:
		return &bpf_skb_change_type_proto;
	case BPF_FUNC_skb_change_tail:
		return &bpf_skb_change_tail_proto;
	case BPF_FUNC_skb_get_tunnel_key:
		return &bpf_skb_get_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_key:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_skb_get_tunnel_opt:
		return &bpf_skb_get_tunnel_opt_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return bpf_get_skb_set_tunnel_proto(func_id);
	case BPF_FUNC_redirect:
		return &bpf_redirect_proto;
	case BPF_FUNC_get_route_realm:
		return &bpf_get_route_realm_proto;
	case BPF_FUNC_get_hash_recalc:
		return &bpf_get_hash_recalc_proto;
	case BPF_FUNC_set_hash_invalid:
		return &bpf_set_hash_invalid_proto;
	case BPF_FUNC_perf_event_output:
		return &bpf_skb_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_skb_under_cgroup:
		return &bpf_skb_under_cgroup_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}

static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_xdp_event_output_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}

static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
	if (off < 0 || off >= sizeof(struct __sk_buff))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;
	if (size != sizeof(__u32))
		return false;

	return true;
}

static bool sk_filter_is_valid_access(int off, int size,
				      enum bpf_access_type type,
				      enum bpf_reg_type *reg_type)
{
	switch (off) {
	case offsetof(struct __sk_buff, tc_classid):
	case offsetof(struct __sk_buff, data):
	case offsetof(struct __sk_buff, data_end):
		return false;
	}

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}

	return __is_valid_access(off, size, type);
}

static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
			       const struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	if (!direct_write)
		return 0;

	/* if (!skb->cloned)
	 *       goto start;
	 *
	 * (Fast-path, otherwise approximation that we might be
	 *  a clone, do the rest in helper.)
2629 */ 2630 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); 2631 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); 2632 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); 2633 2634 /* ret = bpf_skb_pull_data(skb, 0); */ 2635 *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); 2636 *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); 2637 *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 2638 BPF_FUNC_skb_pull_data); 2639 /* if (!ret) 2640 * goto restore; 2641 * return TC_ACT_SHOT; 2642 */ 2643 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); 2644 *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); 2645 *insn++ = BPF_EXIT_INSN(); 2646 2647 /* restore: */ 2648 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); 2649 /* start: */ 2650 *insn++ = prog->insnsi[0]; 2651 2652 return insn - insn_buf; 2653 } 2654 2655 static bool tc_cls_act_is_valid_access(int off, int size, 2656 enum bpf_access_type type, 2657 enum bpf_reg_type *reg_type) 2658 { 2659 if (type == BPF_WRITE) { 2660 switch (off) { 2661 case offsetof(struct __sk_buff, mark): 2662 case offsetof(struct __sk_buff, tc_index): 2663 case offsetof(struct __sk_buff, priority): 2664 case offsetof(struct __sk_buff, cb[0]) ... 2665 offsetof(struct __sk_buff, cb[4]): 2666 case offsetof(struct __sk_buff, tc_classid): 2667 break; 2668 default: 2669 return false; 2670 } 2671 } 2672 2673 switch (off) { 2674 case offsetof(struct __sk_buff, data): 2675 *reg_type = PTR_TO_PACKET; 2676 break; 2677 case offsetof(struct __sk_buff, data_end): 2678 *reg_type = PTR_TO_PACKET_END; 2679 break; 2680 } 2681 2682 return __is_valid_access(off, size, type); 2683 } 2684 2685 static bool __is_valid_xdp_access(int off, int size, 2686 enum bpf_access_type type) 2687 { 2688 if (off < 0 || off >= sizeof(struct xdp_md)) 2689 return false; 2690 if (off % size != 0) 2691 return false; 2692 if (size != sizeof(__u32)) 2693 return false; 2694 2695 return true; 2696 } 2697 2698 static bool xdp_is_valid_access(int off, int size, 2699 enum bpf_access_type type, 2700 enum bpf_reg_type *reg_type) 2701 { 2702 if (type == BPF_WRITE) 2703 return false; 2704 2705 switch (off) { 2706 case offsetof(struct xdp_md, data): 2707 *reg_type = PTR_TO_PACKET; 2708 break; 2709 case offsetof(struct xdp_md, data_end): 2710 *reg_type = PTR_TO_PACKET_END; 2711 break; 2712 } 2713 2714 return __is_valid_xdp_access(off, size, type); 2715 } 2716 2717 void bpf_warn_invalid_xdp_action(u32 act) 2718 { 2719 WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); 2720 } 2721 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 2722 2723 static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg, 2724 int src_reg, int ctx_off, 2725 struct bpf_insn *insn_buf, 2726 struct bpf_prog *prog) 2727 { 2728 struct bpf_insn *insn = insn_buf; 2729 2730 switch (ctx_off) { 2731 case offsetof(struct __sk_buff, len): 2732 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); 2733 2734 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 2735 offsetof(struct sk_buff, len)); 2736 break; 2737 2738 case offsetof(struct __sk_buff, protocol): 2739 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 2740 2741 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 2742 offsetof(struct sk_buff, protocol)); 2743 break; 2744 2745 case offsetof(struct __sk_buff, vlan_proto): 2746 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 2747 2748 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 2749 offsetof(struct sk_buff, vlan_proto)); 2750 break; 2751 2752 case offsetof(struct 

static u32 sk_filter_convert_ctx_access(enum bpf_access_type type, int dst_reg,
					int src_reg, int ctx_off,
					struct bpf_insn *insn_buf,
					struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, len):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, len));
		break;

	case offsetof(struct __sk_buff, protocol):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, protocol));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_proto));
		break;

	case offsetof(struct __sk_buff, priority):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, skb_iif));
		break;

	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;

	case offsetof(struct __sk_buff, hash):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, hash));
		break;

	case offsetof(struct __sk_buff, mark):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, queue_mapping):
		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_present):
		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_tci):
		return convert_skb_access(SKF_AD_VLAN_TAG,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetof(struct __sk_buff, cb[4]):
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);

		prog->cb_access = 1;
		ctx_off -= offsetof(struct __sk_buff, cb[0]);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_classid):
		ctx_off -= offsetof(struct __sk_buff, tc_classid);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, data));
		break;

	case offsetof(struct __sk_buff, data_end):
		ctx_off -= offsetof(struct __sk_buff, data_end);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct bpf_skb_data_end, data_end);
		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), dst_reg, src_reg,
				      ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		break;
#else
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
		break;
#endif
	}

	return insn - insn_buf;
}

static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, int dst_reg,
					 int src_reg, int ctx_off,
					 struct bpf_insn *insn_buf,
					 struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;
	default:
		return sk_filter_convert_ctx_access(type, dst_reg, src_reg,
						    ctx_off, insn_buf, prog);
	}

	return insn - insn_buf;
}

static u32 xdp_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				  int src_reg, int ctx_off,
				  struct bpf_insn *insn_buf,
				  struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct xdp_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
				      dst_reg, src_reg,
				      offsetof(struct xdp_buff, data));
		break;
	case offsetof(struct xdp_md, data_end):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
				      dst_reg, src_reg,
				      offsetof(struct xdp_buff, data_end));
		break;
	}

	return insn - insn_buf;
}
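
/* To illustrate what the convert_ctx_access() callbacks above do (a sketch,
 * not literal verifier output): a program instruction that loads
 * __sk_buff->mark, roughly
 *
 *	r0 = *(u32 *)(r1 + offsetof(struct __sk_buff, mark))
 *
 * is rewritten at load time into a load from the underlying socket buffer,
 *
 *	r0 = *(u32 *)(r1 + offsetof(struct sk_buff, mark))
 *
 * whereas fields without a direct sk_buff counterpart, such as data_end
 * above, are served from scratch space in skb->cb[] (struct
 * bpf_skb_data_end) that the tc layer fills in via bpf_compute_data_end().
 */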

static const struct bpf_verifier_ops sk_filter_ops = {
	.get_func_proto = sk_filter_func_proto,
	.is_valid_access = sk_filter_is_valid_access,
	.convert_ctx_access = sk_filter_convert_ctx_access,
};

static const struct bpf_verifier_ops tc_cls_act_ops = {
	.get_func_proto = tc_cls_act_func_proto,
	.is_valid_access = tc_cls_act_is_valid_access,
	.convert_ctx_access = tc_cls_act_convert_ctx_access,
	.gen_prologue = tc_cls_act_prologue,
};

static const struct bpf_verifier_ops xdp_ops = {
	.get_func_proto = xdp_func_proto,
	.is_valid_access = xdp_is_valid_access,
	.convert_ctx_access = xdp_convert_ctx_access,
};

static struct bpf_prog_type_list sk_filter_type __read_mostly = {
	.ops = &sk_filter_ops,
	.type = BPF_PROG_TYPE_SOCKET_FILTER,
};

static struct bpf_prog_type_list sched_cls_type __read_mostly = {
	.ops = &tc_cls_act_ops,
	.type = BPF_PROG_TYPE_SCHED_CLS,
};

static struct bpf_prog_type_list sched_act_type __read_mostly = {
	.ops = &tc_cls_act_ops,
	.type = BPF_PROG_TYPE_SCHED_ACT,
};

static struct bpf_prog_type_list xdp_type __read_mostly = {
	.ops = &xdp_ops,
	.type = BPF_PROG_TYPE_XDP,
};

static int __init register_sk_filter_ops(void)
{
	bpf_register_prog_type(&sk_filter_type);
	bpf_register_prog_type(&sched_cls_type);
	bpf_register_prog_type(&sched_act_type);
	bpf_register_prog_type(&xdp_type);

	return 0;
}
late_initcall(register_sk_filter_ops);

int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
		  unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
					   lockdep_sock_is_held(sk));
	if (!filter)
		goto out;

	/* We're copying the filter that has been originally attached,
	 * so no conversion/decode needed anymore. eBPF programs that
	 * have no original program cannot be dumped through this.
	 */
	ret = -EACCES;
	fprog = filter->prog->orig_prog;
	if (!fprog)
		goto out;

	ret = fprog->len;
	if (!len)
		/* User space only enquires about the number of filter blocks. */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* Instead of bytes, the API requires the number of filter
	 * blocks to be returned.
	 */
	ret = fprog->len;
out:
	release_sock(sk);
	return ret;
}
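
/* Example (userspace sketch, not part of the kernel build): attaching a
 * classic filter, reading it back through sk_get_filter(), and detaching it
 * again. Assumes a socket fd, <sys/socket.h> and <linux/filter.h>, and that
 * the libc headers expose SO_GET_FILTER (an alias of SO_ATTACH_FILTER in the
 * uapi headers); error handling is trimmed for brevity.
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffff),	// accept packet
 *	};
 *	struct sock_fprog fprog = {
 *		.len = sizeof(code) / sizeof(code[0]),
 *		.filter = code,
 *	};
 *	struct sock_filter dump[64];
 *	socklen_t optlen = 0;
 *	int dummy = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *	// With optlen 0, only the number of filter blocks is reported back.
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &optlen);
 *	// Second call copies the blocks; optlen counts blocks, not bytes.
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, dump, &optlen);
 *	setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER, &dummy, sizeof(dummy));
 */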