/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>

/**
 *	sk_filter - run a packet through a socket filter
 *	@sk: sock associated with &sk_buff
 *	@skb: buffer to filter
 *
 * Run the eBPF program and then cut skb->data to the correct size returned
 * by the program. If pkt_len is 0 we toss the packet. If skb->len is smaller
 * than pkt_len we keep the whole skb->data. This is the socket level
 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 */
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb);

		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter);
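/* Illustrative only, not part of this file's logic: a minimal userspace
 * sketch of how a classic filter typically ends up exercising sk_filter(),
 * namely via setsockopt(SO_ATTACH_FILTER). The two instruction program
 * below loads skb->len and returns it, i.e. it accepts every packet whole;
 * sock_filter initializers are { code, jt, jf, k }.
 *
 *	struct sock_filter code[] = {
 *		{ BPF_LD  | BPF_W | BPF_LEN, 0, 0, 0 },	// A = skb->len
 *		{ BPF_RET | BPF_A,           0, 0, 0 },	// return A
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 */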
static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
}

static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (nla->nla_len > skb->len - a)
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	return raw_smp_processor_id();
}

static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
			      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (skb_field) {
	case SKF_AD_MARK:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, mark));
		break;

	case SKF_AD_PKTTYPE:
		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
		break;

	case SKF_AD_QUEUE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, queue_mapping));
		break;

	case SKF_AD_VLAN_TAG:
	case SKF_AD_VLAN_TAG_PRESENT:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);

		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_tci));
		if (skb_field == SKF_AD_VLAN_TAG) {
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
						~VLAN_TAG_PRESENT);
		} else {
			/* dst_reg >>= 12 */
			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
			/* dst_reg &= 1 */
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
		}
		break;
	}

	return insn - insn_buf;
}
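/* For illustration (assumed example, not emitted verbatim from here): the
 * SKF_AD_VLAN_TAG_PRESENT conversion above expands one classic ancillary
 * load into three eBPF instructions, e.g. with dst_reg = BPF_REG_A and
 * src_reg = BPF_REG_CTX:
 *
 *	A = *(u16 *)(CTX + offsetof(struct sk_buff, vlan_tci))
 *	A >>= 12			// VLAN_TAG_PRESENT is bit 12 (0x1000)
 *	A &= 1				// leave only the "present" flag
 */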
static bool convert_bpf_extensions(struct sock_filter *fp,
				   struct bpf_insn **insnp)
{
	struct bpf_insn *insn = *insnp;
	u32 cnt;

	switch (fp->k) {
	case SKF_AD_OFF + SKF_AD_PROTOCOL:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		/* A = *(u16 *) (CTX + offsetof(protocol)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, protocol));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PKTTYPE:
		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_IFINDEX:
	case SKF_AD_OFF + SKF_AD_HATYPE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
		BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);

		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
				      BPF_REG_TMP, BPF_REG_CTX,
				      offsetof(struct sk_buff, dev));
		/* if (tmp != 0) goto pc + 1 */
		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
		*insn++ = BPF_EXIT_INSN();
		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, ifindex));
		else
			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, type));
		break;

	case SKF_AD_OFF + SKF_AD_MARK:
		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_RXHASH:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
				    offsetof(struct sk_buff, hash));
		break;

	case SKF_AD_OFF + SKF_AD_QUEUE:
		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, vlan_proto));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
	case SKF_AD_OFF + SKF_AD_NLATTR:
	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
	case SKF_AD_OFF + SKF_AD_CPU:
	case SKF_AD_OFF + SKF_AD_RANDOM:
		/* arg1 = CTX */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
		/* arg2 = A */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
		/* arg3 = X */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
		switch (fp->k) {
		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
			break;
		case SKF_AD_OFF + SKF_AD_CPU:
			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
			break;
		case SKF_AD_OFF + SKF_AD_RANDOM:
			*insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
			bpf_user_rnd_init_once();
			break;
		}
		break;

	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;

	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as an optimization. Placed here
		 * where no-one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}
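/* Worked example (illustrative): the classic ancillary load
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_IFINDEX)
 *
 * is rewritten by convert_bpf_extensions() into the guarded sequence
 *
 *	tmp = *(skb->dev)			// pointer-sized load
 *	if (tmp != 0) goto pc + 1		// guard against NULL dev
 *	exit					// returns current A/R0
 *	A = *(u32 *)(tmp + offsetof(struct net_device, ifindex))
 */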
/**
 *	bpf_convert_filter - convert filter program
 *	@prog: the user passed filter program
 *	@len: the length of the user passed filter program
 *	@new_prog: buffer where converted program will be stored
 *	@new_len: pointer to store length of converted program
 *
 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style.
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *   new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_insn *new_prog, int *new_len)
{
	int new_flen = 0, pass = 0, target, i;
	struct bpf_insn *new_insn;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	if (new_prog) {
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = new_prog;
	fp = prog;

	/* Classic BPF related prologue emission. */
	if (new_insn) {
		/* Classic BPF expects A and X to be reset first. These need
		 * to be guaranteed to be the first two instructions.
		 */
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
		*new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

		/* All programs must keep CTX in callee saved BPF_REG_CTX.
		 * In eBPF case it's done by the compiler, here we need to
		 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
		 */
		*new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
	} else {
		new_insn += 3;
	}

	for (i = 0; i < len; fp++, i++) {
		struct bpf_insn tmp_insns[6] = { };
		struct bpf_insn *insn = tmp_insns;

		if (addrs)
			addrs[i] = new_insn - new_prog;

		switch (fp->code) {
		/* All arithmetic insns and skb loads map as-is. */
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_X:
		case BPF_ALU | BPF_MOD | BPF_K:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_ABS | BPF_W:
		case BPF_LD | BPF_ABS | BPF_H:
		case BPF_LD | BPF_ABS | BPF_B:
		case BPF_LD | BPF_IND | BPF_W:
		case BPF_LD | BPF_IND | BPF_H:
		case BPF_LD | BPF_IND | BPF_B:
			/* Check for overloaded BPF extension and
			 * directly convert it if found, otherwise
			 * just move on with mapping.
			 */
			if (BPF_CLASS(fp->code) == BPF_LD &&
			    BPF_MODE(fp->code) == BPF_ABS &&
			    convert_bpf_extensions(fp, &insn))
				break;

			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
			break;

		/* Jump transformation cannot use BPF block macros
		 * everywhere as offset calculation and target updates
		 * require a bit more work than the rest, i.e. jump
		 * opcodes map as-is, but offsets need adjustment.
		 */

#define BPF_EMIT_JMP							\
	do {								\
		if (target >= len || target < 0)			\
			goto err;					\
		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
		insn->off -= insn - tmp_insns;				\
	} while (0)
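		/* A worked example of the offset math (illustrative numbers):
		 * assume a classic conditional jump whose expansion starts at
		 * new index addrs[i] = 5 and whose false branch maps to
		 * addrs[target] = 9. The JA emitted as the 2nd insn of the
		 * expansion sits at new pc 6, so BPF_EMIT_JMP computes
		 * off = 9 - 5 - 1 - 1 = 2, and the interpreter resumes at
		 * 6 + 1 + 2 = 9, which is exactly addrs[target].
		 */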
		case BPF_JMP | BPF_JA:
			target = i + fp->k + 1;
			insn->code = fp->code;
			BPF_EMIT_JMP;
			break;

		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
				/* BPF immediates are signed, zero extend
				 * immediate into tmp register and use it
				 * in compare insn.
				 */
				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

				insn->dst_reg = BPF_REG_A;
				insn->src_reg = BPF_REG_TMP;
				bpf_src = BPF_X;
			} else {
				insn->dst_reg = BPF_REG_A;
				insn->imm = fp->k;
				bpf_src = BPF_SRC(fp->code);
				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
			}

			/* Common case where 'jump_false' is next insn. */
			if (fp->jf == 0) {
				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
				target = i + fp->jt + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Convert JEQ into JNE when 'jump_true' is next insn. */
			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
				insn->code = BPF_JMP | BPF_JNE | bpf_src;
				target = i + fp->jf + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Other jumps are mapped into two insns: Jxx and JA. */
			target = i + fp->jt + 1;
			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
			BPF_EMIT_JMP;
			insn++;

			insn->code = BPF_JMP | BPF_JA;
			target = i + fp->jf + 1;
			BPF_EMIT_JMP;
			break;

		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
		case BPF_LDX | BPF_MSH | BPF_B:
			/* tmp = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
			/* A &= 0xf */
			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
			/* A <<= 2 */
			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
			/* X = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			/* A = tmp */
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
			break;

		/* RET_K is remapped into 2 insns. RET_A case doesn't need an
		 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
		 */
		case BPF_RET | BPF_A:
		case BPF_RET | BPF_K:
			if (BPF_RVAL(fp->code) == BPF_K)
				*insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
							0, fp->k);
			*insn = BPF_EXIT_INSN();
			break;

		/* Store to stack. */
		case BPF_ST:
		case BPF_STX:
			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
					    BPF_ST ? BPF_REG_A : BPF_REG_X,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* Load from stack. */
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;
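		/* Illustrative mapping of the classic scratch memory above:
		 * with BPF_MEMWORDS == 16, classic M[k] lives at
		 * FP - (16 - k) * 4, so e.g. "st M[0]" becomes
		 * *(u32 *)(FP - 64) = A and "ldx M[15]" becomes
		 * X = *(u32 *)(FP - 4). All 16 slots fit within
		 * MAX_BPF_STACK, as checked by the BUILD_BUG_ON() at the
		 * top of this function.
		 */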
		/* A = K or X = K */
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
					      BPF_REG_A : BPF_REG_X, fp->k);
			break;

		/* X = A */
		case BPF_MISC | BPF_TAX:
			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			break;

		/* A = X */
		case BPF_MISC | BPF_TXA:
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
			break;

		/* A = skb->len or X = skb->len */
		case BPF_LD | BPF_W | BPF_LEN:
		case BPF_LDX | BPF_W | BPF_LEN:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
					    offsetof(struct sk_buff, len));
			break;

		/* Access seccomp_data fields. */
		case BPF_LDX | BPF_ABS | BPF_W:
			/* A = *(u32 *) (ctx + K) */
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
			break;

		/* Unknown instruction. */
		default:
			goto err;
		}

		insn++;
		if (new_prog)
			memcpy(new_insn, tmp_insns,
			       sizeof(*insn) * (insn - tmp_insns));
		new_insn += insn - tmp_insns;
	}

	if (!new_prog) {
		/* Only calculating new length. */
		*new_len = new_insn - new_prog;
		return 0;
	}

	pass++;
	if (new_flen != new_insn - new_prog) {
		new_flen = new_insn - new_prog;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	return -EINVAL;
}

/* Security:
 *
 * As we don't want to clear the mem[] array for each packet going through
 * __bpf_prog_run(), we check that a filter loaded by a user never tries to
 * read a cell that was not previously written, and we check all branches
 * to be sure a malicious user doesn't try to abuse us.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);

	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;

	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_ST:
		case BPF_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_JMP | BPF_JA:
			/* A jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* A jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}
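/* Example of what this check rejects (illustrative): the two insn classic
 * program
 *
 *	BPF_STMT(BPF_LD | BPF_MEM, 3),		// A = M[3], never written
 *	BPF_STMT(BPF_RET | BPF_A, 0),
 *
 * reads scratch cell 3 before any BPF_ST/BPF_STX initialized it, so
 * check_load_and_stores() returns -EINVAL and the filter is refused.
 */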
static bool chk_code_allowed(u16 code_to_probe)
{
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	if (code_to_probe >= ARRAY_SIZE(codes))
		return false;

	return codes[code_to_probe];
}
/**
 *	bpf_check_classic - verify socket filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through, kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
			     unsigned int flen)
{
	bool anc_found;
	int pc;

	if (flen == 0 || flen > BPF_MAXINSNS)
		return -EINVAL;

	/* Check the filter code now */
	for (pc = 0; pc < flen; pc++) {
		const struct sock_filter *ftest = &filter[pc];

		/* May we actually operate on this code? */
		if (!chk_code_allowed(ftest->code))
			return -EINVAL;

		/* Some instructions need special checks */
		switch (ftest->code) {
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_K:
			/* Check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_K:
			if (ftest->k >= 32)
				return -EINVAL;
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
			/* Check for invalid memory addresses */
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JA:
			/* Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned int)(flen - pc - 1))
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* Both conditionals must be safe */
			if (pc + ftest->jt + 1 >= flen ||
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
			break;
		case BPF_LD | BPF_W | BPF_ABS:
		case BPF_LD | BPF_H | BPF_ABS:
		case BPF_LD | BPF_B | BPF_ABS:
			anc_found = false;
			if (bpf_anc_helper(ftest) & BPF_ANC)
				anc_found = true;
			/* Ancillary operation unknown or unsupported */
			if (anc_found == false && ftest->k >= SKF_AD_OFF)
				return -EINVAL;
		}
	}

	/* Last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_RET | BPF_K:
	case BPF_RET | BPF_A:
		return check_load_and_stores(filter, flen);
	}

	return -EINVAL;
}
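/* A few concrete rejections from the checks above (illustrative):
 *
 *	BPF_STMT(BPF_ALU | BPF_DIV | BPF_K, 0)	  // division by zero
 *	BPF_STMT(BPF_ALU | BPF_LSH | BPF_K, 32)	  // shift out of range
 *	BPF_STMT(BPF_ST, BPF_MEMWORDS)		  // scratch index too big
 *
 * each makes bpf_check_classic() return -EINVAL, as does any program
 * whose last instruction is not BPF_RET | BPF_K or BPF_RET | BPF_A.
 */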
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
				      const struct sock_fprog *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct sock_fprog_kern *fkprog;

	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
	if (!fp->orig_prog)
		return -ENOMEM;

	fkprog = fp->orig_prog;
	fkprog->len = fprog->len;

	fkprog->filter = kmemdup(fp->insns, fsize,
				 GFP_KERNEL | __GFP_NOWARN);
	if (!fkprog->filter) {
		kfree(fp->orig_prog);
		return -ENOMEM;
	}

	return 0;
}

static void bpf_release_orig_filter(struct bpf_prog *fp)
{
	struct sock_fprog_kern *fprog = fp->orig_prog;

	if (fprog) {
		kfree(fprog->filter);
		kfree(fprog);
	}
}

static void __bpf_prog_release(struct bpf_prog *prog)
{
	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
	} else {
		bpf_release_orig_filter(prog);
		bpf_prog_free(prog);
	}
}

static void __sk_filter_release(struct sk_filter *fp)
{
	__bpf_prog_release(fp->prog);
	kfree(fp);
}

/**
 *	sk_filter_release_rcu - Release a socket filter by rcu_head
 *	@rcu: rcu_head that contains the sk_filter to free
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

	__sk_filter_release(fp);
}

/**
 *	sk_filter_release - release a socket filter
 *	@fp: filter to remove
 *
 *	Remove a filter from a socket and release its resources.
 */
static void sk_filter_release(struct sk_filter *fp)
{
	if (atomic_dec_and_test(&fp->refcnt))
		call_rcu(&fp->rcu, sk_filter_release_rcu);
}

void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	atomic_sub(filter_size, &sk->sk_omem_alloc);
	sk_filter_release(fp);
}

/* try to charge the socket memory if there is space available
 * return true on success
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	/* same check as in sock_kmalloc() */
	if (filter_size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
		atomic_inc(&fp->refcnt);
		atomic_add(filter_size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
	struct sock_filter *old_prog;
	struct bpf_prog *old_fp;
	int err, new_len, old_len = fp->len;

	/* We are free to overwrite insns et al right here as they won't be
	 * used internally anymore at this point in time after the migration
	 * to the internal BPF instruction representation.
	 */
	BUILD_BUG_ON(sizeof(struct sock_filter) !=
		     sizeof(struct bpf_insn));

	/* Conversion cannot happen on overlapping memory areas,
	 * so we need to keep the user BPF around until the 2nd
	 * pass. At this time, the user BPF is stored in fp->insns.
	 */
	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
			   GFP_KERNEL | __GFP_NOWARN);
	if (!old_prog) {
		err = -ENOMEM;
		goto out_err;
	}

	/* 1st pass: calculate the new program length. */
	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
	if (err)
		goto out_err_free;

	/* Expand fp for appending the new filter representation. */
	old_fp = fp;
	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
	if (!fp) {
		/* The old_fp is still around in case we couldn't
		 * allocate new memory, so uncharge on that one.
		 */
		fp = old_fp;
		err = -ENOMEM;
		goto out_err_free;
	}

	fp->len = new_len;

	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
	err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
	if (err)
		/* 2nd bpf_convert_filter() can fail only if it fails
		 * to allocate memory, remapping must succeed. Note that
		 * at this time old_fp has already been released by
		 * krealloc().
		 */
		goto out_err_free;

	bpf_prog_select_runtime(fp);

	kfree(old_prog);
	return fp;

out_err_free:
	kfree(old_prog);
out_err:
	__bpf_prog_release(fp);
	return ERR_PTR(err);
}

static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
					   bpf_aux_classic_check_t trans)
{
	int err;

	fp->bpf_func = NULL;
	fp->jited = 0;

	err = bpf_check_classic(fp->insns, fp->len);
	if (err) {
		__bpf_prog_release(fp);
		return ERR_PTR(err);
	}

	/* There might be additional checks and transformations
	 * needed on classic filters, f.e. in case of seccomp.
	 */
	if (trans) {
		err = trans(fp->insns, fp->len);
		if (err) {
			__bpf_prog_release(fp);
			return ERR_PTR(err);
		}
	}

	/* Probe if we can JIT compile the filter and if so, do
	 * the compilation of the filter.
	 */
	bpf_jit_compile(fp);

	/* JIT compiler couldn't process this filter, so do the
	 * internal BPF translation for the optimized interpreter.
	 */
	if (!fp->jited)
		fp = bpf_migrate_filter(fp);

	return fp;
}

/**
 *	bpf_prog_create - create an unattached filter
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure new filter is there and in the right amounts. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	memcpy(fp->insns, fprog->filter, fsize);

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, NULL);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);
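/* Sketch of an in-kernel user of bpf_prog_create() (illustrative; the
 * drop-all program and the variable names are made up for the example):
 *
 *	static struct sock_filter drop_all[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0 },	// return 0: toss packet
 *	};
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(drop_all),
 *		.filter	= drop_all,
 *	};
 *	struct bpf_prog *prog;
 *	int err = bpf_prog_create(&prog, &fprog);
 *
 * On success the caller owns prog and eventually releases it with
 * bpf_prog_destroy(prog).
 */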
/**
 *	bpf_prog_create_from_user - create an unattached filter from user buffer
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *	@trans: post-classic verifier transformation handler
 *	@save_orig: save classic BPF program
 *
 * This function effectively does the same as bpf_prog_create(), only
 * that it builds up its insns buffer from a user space provided buffer.
 * It also allows for passing a bpf_aux_classic_check_t handler.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
			      bpf_aux_classic_check_t trans, bool save_orig)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;
	int err;

	/* Make sure new filter is there and in the right amounts. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	fp->orig_prog = NULL;

	if (save_orig) {
		err = bpf_prog_store_orig_filter(fp, fprog);
		if (err) {
			__bpf_prog_free(fp);
			return -ENOMEM;
		}
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
			    bool locked)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;
	atomic_set(&fp->refcnt, 0);

	if (!sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}

	old_fp = rcu_dereference_protected(sk->sk_filter, locked);
	rcu_assign_pointer(sk->sk_filter, fp);
	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}

static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct bpf_prog *old_prog;
	int err;

	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
		return -ENOMEM;

	if (sk_unhashed(sk) && sk->sk_reuseport) {
		err = reuseport_alloc(sk);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	old_prog = reuseport_attach_prog(sk, prog);
	if (old_prog)
		bpf_prog_destroy(old_prog);

	return 0;
}

static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	unsigned int bpf_fsize = bpf_prog_size(fprog->len);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	/* Make sure new filter is there and in the right amounts. */
	if (fprog->filter == NULL)
		return ERR_PTR(-EINVAL);

	prog = bpf_prog_alloc(bpf_fsize, 0);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return ERR_PTR(-EFAULT);
	}

	prog->len = fprog->len;

	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return ERR_PTR(-ENOMEM);
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	return bpf_prepare_filter(prog, NULL);
}
/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
		       bool locked)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk, locked);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__sk_attach_filter);

int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk));
}

int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}

static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	prog = bpf_prog_get(ufd);
	if (IS_ERR(prog))
		return prog;

	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk));
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}
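/* Userspace counterpart of sk_attach_bpf() (illustrative): a program fd
 * obtained from bpf(BPF_PROG_LOAD, ...) of type BPF_PROG_TYPE_SOCKET_FILTER
 * is attached with
 *
 *	setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_BPF,
 *		   &prog_fd, sizeof(prog_fd));
 *
 * and SO_ATTACH_REUSEPORT_EBPF routes the fd to sk_reuseport_attach_bpf()
 * instead.
 */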
struct bpf_scratchpad {
	union {
		__be32 diff[MAX_BPF_STACK / sizeof(__be32)];
		u8     buff[MAX_BPF_STACK];
	};
};

static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);

static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	int offset = (int) r2;
	void *from = (void *) (long) r3;
	unsigned int len = (unsigned int) r4;
	void *ptr;

	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
		return -EINVAL;

	/* bpf verifier guarantees that:
	 * 'from' pointer points to bpf program stack
	 * 'len' bytes of it were initialized
	 * 'len' > 0
	 * 'skb' is a valid pointer to 'struct sk_buff'
	 *
	 * so check for invalid 'offset' and too large 'len'
	 */
	if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
		return -EFAULT;
	if (unlikely(skb_try_make_writable(skb, offset + len)))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, len, sp->buff);
	if (unlikely(!ptr))
		return -EFAULT;

	if (flags & BPF_F_RECOMPUTE_CSUM)
		skb_postpull_rcsum(skb, ptr, len);

	memcpy(ptr, from, len);

	if (ptr == sp->buff)
		/* skb_store_bits cannot return -EFAULT here */
		skb_store_bits(skb, offset, ptr, len);

	if (flags & BPF_F_RECOMPUTE_CSUM)
		skb_postpush_rcsum(skb, ptr, len);
	if (flags & BPF_F_INVALIDATE_HASH)
		skb_clear_hash(skb);

	return 0;
}

static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
	.func		= bpf_skb_store_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_STACK,
	.arg4_type	= ARG_CONST_STACK_SIZE,
	.arg5_type	= ARG_ANYTHING,
};
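/* Hedged usage sketch from a tc BPF program (helper names as exposed to
 * programs via uapi; NEW_DADDR is a made up example value): rewriting
 * 4 bytes of an IPv4 header while keeping skb->csum consistent:
 *
 *	__be32 new_daddr = NEW_DADDR;
 *
 *	bpf_skb_store_bytes(skb, ETH_HLEN + offsetof(struct iphdr, daddr),
 *			    &new_daddr, sizeof(new_daddr),
 *			    BPF_F_RECOMPUTE_CSUM);
 *
 * BPF_F_RECOMPUTE_CSUM makes the helper pull the old bytes out of and push
 * the new bytes into skb->csum, as implemented above.
 */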
static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
	int offset = (int) r2;
	void *to = (void *)(unsigned long) r3;
	unsigned int len = (unsigned int) r4;
	void *ptr;

	if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, len, to);
	if (unlikely(!ptr))
		return -EFAULT;
	if (ptr != to)
		memcpy(to, ptr, len);

	return 0;
}

static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
	.func		= bpf_skb_load_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_STACK,
	.arg4_type	= ARG_CONST_STACK_SIZE,
};

static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	int offset = (int) r2;
	__sum16 sum, *ptr;

	if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely((u32) offset > 0xffff))
		return -EFAULT;
	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
	if (unlikely(!ptr))
		return -EFAULT;

	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 0:
		if (unlikely(from != 0))
			return -EINVAL;

		csum_replace_by_diff(ptr, to);
		break;
	case 2:
		csum_replace2(ptr, from, to);
		break;
	case 4:
		csum_replace4(ptr, from, to);
		break;
	default:
		return -EINVAL;
	}

	if (ptr == &sum)
		/* skb_store_bits guaranteed to not return -EFAULT here */
		skb_store_bits(skb, offset, ptr, sizeof(sum));

	return 0;
}

static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
	.func		= bpf_l3_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
	int offset = (int) r2;
	__sum16 sum, *ptr;

	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
			       BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely((u32) offset > 0xffff))
		return -EFAULT;
	if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
	if (unlikely(!ptr))
		return -EFAULT;
	if (is_mmzero && !*ptr)
		return 0;

	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 0:
		if (unlikely(from != 0))
			return -EINVAL;

		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
		break;
	case 2:
		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
		break;
	case 4:
		inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
		break;
	default:
		return -EINVAL;
	}

	if (is_mmzero && !*ptr)
		*ptr = CSUM_MANGLED_0;
	if (ptr == &sum)
		/* skb_store_bits guaranteed to not return -EFAULT here */
		skb_store_bits(skb, offset, ptr, sizeof(sum));

	return 0;
}

static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
	.func		= bpf_l4_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
{
	struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
	u64 diff_size = from_size + to_size;
	__be32 *from = (__be32 *) (long) r1;
	__be32 *to = (__be32 *) (long) r3;
	int i, j = 0;

	/* This is quite flexible, some examples:
	 *
	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
	 *
	 * Even for diffing, from_size and to_size don't need to be equal.
	 */
	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
		     diff_size > sizeof(sp->diff)))
		return -EINVAL;

	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
		sp->diff[j] = ~from[i];
	for (i = 0; i < to_size / sizeof(__be32); i++, j++)
		sp->diff[j] = to[i];

	return csum_partial(sp->diff, diff_size, seed);
}

static const struct bpf_func_proto bpf_csum_diff_proto = {
	.func		= bpf_csum_diff,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_STACK,
	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
	.arg3_type	= ARG_PTR_TO_STACK,
	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
	.arg5_type	= ARG_ANYTHING,
};
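/* The helpers above compose (illustrative sketch, program-side names): to
 * rewrite an IPv6 address one can diff the old vs. new bytes and feed the
 * result into the L4 checksum update, e.g.
 *
 *	__s64 diff = bpf_csum_diff(old_ip, 16, new_ip, 16, 0);
 *
 *	bpf_skb_store_bytes(skb, ip6_off, new_ip, 16, 0);
 *	bpf_l4_csum_replace(skb, tcp_csum_off, 0, diff, BPF_F_PSEUDO_HDR);
 *
 * where the zero 'from' selects the csum_replace_by_diff() path in the
 * switch above (header field size 0).
 */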
static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
	struct net_device *dev;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return -EINVAL;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
	if (unlikely(!dev))
		return -EINVAL;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (unlikely(!skb2))
		return -ENOMEM;

	if (flags & BPF_F_INGRESS) {
		if (skb_at_tc_ingress(skb2))
			skb_postpush_rcsum(skb2, skb_mac_header(skb2),
					   skb2->mac_len);
		return dev_forward_skb(dev, skb2);
	}

	skb2->dev = dev;
	return dev_queue_xmit(skb2);
}

static const struct bpf_func_proto bpf_clone_redirect_proto = {
	.func		= bpf_clone_redirect,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

struct redirect_info {
	u32 ifindex;
	u32 flags;
};

static DEFINE_PER_CPU(struct redirect_info, redirect_info);

static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return TC_ACT_SHOT;

	ri->ifindex = ifindex;
	ri->flags = flags;

	return TC_ACT_REDIRECT;
}

int skb_do_redirect(struct sk_buff *skb)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	struct net_device *dev;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
	ri->ifindex = 0;
	if (unlikely(!dev)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	if (ri->flags & BPF_F_INGRESS) {
		if (skb_at_tc_ingress(skb))
			skb_postpush_rcsum(skb, skb_mac_header(skb),
					   skb->mac_len);
		return dev_forward_skb(dev, skb);
	}

	skb->dev = dev;
	return dev_queue_xmit(skb);
}

static const struct bpf_func_proto bpf_redirect_proto = {
	.func		= bpf_redirect,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
	.arg2_type	= ARG_ANYTHING,
};
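/* Contrast of the two redirect flavours (illustrative): from a tc program,
 * bpf_clone_redirect(skb, ifindex, 0) clones and transmits inside the
 * helper, while
 *
 *	return bpf_redirect(ifindex, 0);
 *
 * merely records ifindex/flags in redirect_info and returns
 * TC_ACT_REDIRECT, letting the caller invoke skb_do_redirect() on the
 * original skb without a clone.
 */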
static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	return task_get_classid((struct sk_buff *) (unsigned long) r1);
}

static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
	.func		= bpf_get_cgroup_classid,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	return dst_tclassid((struct sk_buff *) (unsigned long) r1);
}

static const struct bpf_func_proto bpf_get_route_realm_proto = {
	.func		= bpf_get_route_realm,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};

static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	__be16 vlan_proto = (__force __be16) r2;

	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
		     vlan_proto != htons(ETH_P_8021AD)))
		vlan_proto = htons(ETH_P_8021Q);

	return skb_vlan_push(skb, vlan_proto, vlan_tci);
}

const struct bpf_func_proto bpf_skb_vlan_push_proto = {
	.func		= bpf_skb_vlan_push,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);

static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;

	return skb_vlan_pop(skb);
}

const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
	.func		= bpf_skb_vlan_pop,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);

bool bpf_helper_changes_skb_data(void *func)
{
	if (func == bpf_skb_vlan_push)
		return true;
	if (func == bpf_skb_vlan_pop)
		return true;
	if (func == bpf_skb_store_bytes)
		return true;
	if (func == bpf_l3_csum_replace)
		return true;
	if (func == bpf_l4_csum_replace)
		return true;

	return false;
}

static unsigned short bpf_tunnel_key_af(u64 flags)
{
	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}

static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	u8 compat[sizeof(struct bpf_tunnel_key)];

	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6))))
		return -EINVAL;
	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags))
		return -EPROTO;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
			goto set_compat;
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			if (ip_tunnel_info_af(info) != AF_INET)
				return -EINVAL;
set_compat:
			to = (struct bpf_tunnel_key *)compat;
			break;
		default:
			return -EINVAL;
		}
	}

	to->tunnel_id = be64_to_cpu(info->key.tun_id);
	to->tunnel_tos = info->key.tos;
	to->tunnel_ttl = info->key.ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
		       sizeof(to->remote_ipv6));
		to->tunnel_label = be32_to_cpu(info->key.label);
	} else {
		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
	}

	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
		memcpy((void *)(long) r2, to, size);

	return 0;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
	.func		= bpf_skb_get_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};
static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	u8 *to = (u8 *) (long) r2;
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);

	if (unlikely(!info ||
		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
		return -ENOENT;
	if (unlikely(size < info->options_len))
		return -ENOMEM;

	ip_tunnel_info_opts_get(to, info);

	return info->options_len;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
	.func		= bpf_skb_get_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
};

static struct metadata_dst __percpu *md_dst;

static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
	struct metadata_dst *md = this_cpu_ptr(md_dst);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	struct ip_tunnel_info *info;

	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
			       BPF_F_DONT_FRAGMENT)))
		return -EINVAL;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, tunnel_label):
		case offsetof(struct bpf_tunnel_key, tunnel_ext):
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			memcpy(compat, from, size);
			memset(compat + size, 0, sizeof(compat) - size);
			from = (struct bpf_tunnel_key *)compat;
			break;
		default:
			return -EINVAL;
		}
	}
	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
		     from->tunnel_ext))
		return -EINVAL;

	skb_dst_drop(skb);
	dst_hold((struct dst_entry *) md);
	skb_dst_set(skb, (struct dst_entry *) md);

	info = &md->u.tun_info;
	info->mode = IP_TUNNEL_INFO_TX;

	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
	if (flags & BPF_F_DONT_FRAGMENT)
		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;

	info->key.tun_id = cpu_to_be64(from->tunnel_id);
	info->key.tos = from->tunnel_tos;
	info->key.ttl = from->tunnel_ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		info->mode |= IP_TUNNEL_INFO_IPV6;
		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
		       sizeof(from->remote_ipv6));
		info->key.label = cpu_to_be32(from->tunnel_label) &
				  IPV6_FLOWLABEL_MASK;
	} else {
		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
		if (flags & BPF_F_ZERO_CSUM_TX)
			info->key.tun_flags &= ~TUNNEL_CSUM;
	}

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
	.func		= bpf_skb_set_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	u8 *from = (u8 *) (long) r2;
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct metadata_dst *md = this_cpu_ptr(md_dst);

	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
		return -EINVAL;
	if (unlikely(size > IP_TUNNEL_OPTS_MAX))
		return -ENOMEM;

	ip_tunnel_info_opts_set(info, from, size);

	return 0;
}

static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
	.func		= bpf_skb_set_tunnel_opt,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
};

static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
	if (!md_dst) {
		/* Race is not possible, since it's called from verifier
		 * that is holding verifier mutex.
		 */
		md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
						   GFP_KERNEL);
		if (!md_dst)
			return NULL;
	}

	switch (which) {
	case BPF_FUNC_skb_set_tunnel_key:
		return &bpf_skb_set_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_opt:
		return &bpf_skb_set_tunnel_opt_proto;
	default:
		return NULL;
	}
}
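/* Typical collect_md pairing of the tunnel helpers above (hedged sketch,
 * program-side; the tunnel id and address are made up example values): on
 * egress of a metadata-based tunnel device,
 *
 *	struct bpf_tunnel_key key = {};
 *
 *	key.tunnel_id	= 42;			// example VNI
 *	key.remote_ipv4	= 0xac100164;		// example 172.16.1.100
 *	bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX);
 *
 * and on ingress bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0) reads
 * the same fields back from the received metadata dst.
 */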
static bool sk_filter_is_valid_access(int off, int size,
				      enum bpf_access_type type)
{
	if (off == offsetof(struct __sk_buff, tc_classid))
		return false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}

	return __is_valid_access(off, size, type);
}

static bool tc_cls_act_is_valid_access(int off, int size,
				       enum bpf_access_type type)
{
	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, mark):
		case offsetof(struct __sk_buff, tc_index):
		case offsetof(struct __sk_buff, priority):
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
		case offsetof(struct __sk_buff, tc_classid):
			break;
		default:
			return false;
		}
	}

	return __is_valid_access(off, size, type);
}
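
/* Illustrative effect of the two write whitelists above (C-like eBPF
 * pseudocode, not part of this file):
 *
 *	skb->cb[0] = 1;		// accepted for both program types
 *	skb->mark = 1;		// accepted for tc cls/act programs,
 *				// rejected for socket filters
 *	skb->tc_classid = 1;	// tc cls/act only; socket filters cannot
 *				// even read this field
 */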
static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				      int src_reg, int ctx_off,
				      struct bpf_insn *insn_buf,
				      struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, len):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, len));
		break;

	case offsetof(struct __sk_buff, protocol):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, protocol));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_proto));
		break;

	case offsetof(struct __sk_buff, priority):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, skb_iif));
		break;

	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		/* skip the ifindex load and leave 0 when skb->dev is NULL */
		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;

	case offsetof(struct __sk_buff, hash):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, hash));
		break;

	case offsetof(struct __sk_buff, mark):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, queue_mapping):
		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_present):
		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_tci):
		return convert_skb_access(SKF_AD_VLAN_TAG,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetof(struct __sk_buff, cb[4]):
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);

		prog->cb_access = 1;
		ctx_off -= offsetof(struct __sk_buff, cb[0]);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_classid):
		ctx_off -= offsetof(struct __sk_buff, tc_classid);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		break;
#else
		/* without CONFIG_NET_SCHED: writes become a no-op move,
		 * reads yield 0
		 */
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
		break;
#endif
	}

	return insn - insn_buf;
}

static const struct bpf_verifier_ops sk_filter_ops = {
	.get_func_proto		= sk_filter_func_proto,
	.is_valid_access	= sk_filter_is_valid_access,
	.convert_ctx_access	= bpf_net_convert_ctx_access,
};

static const struct bpf_verifier_ops tc_cls_act_ops = {
	.get_func_proto		= tc_cls_act_func_proto,
	.is_valid_access	= tc_cls_act_is_valid_access,
	.convert_ctx_access	= bpf_net_convert_ctx_access,
};

static struct bpf_prog_type_list sk_filter_type __read_mostly = {
	.ops	= &sk_filter_ops,
	.type	= BPF_PROG_TYPE_SOCKET_FILTER,
};

static struct bpf_prog_type_list sched_cls_type __read_mostly = {
	.ops	= &tc_cls_act_ops,
	.type	= BPF_PROG_TYPE_SCHED_CLS,
};

static struct bpf_prog_type_list sched_act_type __read_mostly = {
	.ops	= &tc_cls_act_ops,
	.type	= BPF_PROG_TYPE_SCHED_ACT,
};

static int __init register_sk_filter_ops(void)
{
	bpf_register_prog_type(&sk_filter_type);
	bpf_register_prog_type(&sched_cls_type);
	bpf_register_prog_type(&sched_act_type);

	return 0;
}
late_initcall(register_sk_filter_ops);

int __sk_detach_filter(struct sock *sk, bool locked)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter, locked);
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__sk_detach_filter);

int sk_detach_filter(struct sock *sk)
{
	return __sk_detach_filter(sk, sock_owned_by_user(sk));
}
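
/* Illustrative user-space counterpart (a sketch under the usual sockets
 * API; not part of this file): sk_detach_filter() is reached via
 * setsockopt(SO_DETACH_FILTER). Note that sock_setsockopt() insists on
 * optlen >= sizeof(int) even though the value itself is ignored here.
 *
 *	int dummy = 0;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER,
 *		       &dummy, sizeof(dummy)) < 0)
 *		perror("SO_DETACH_FILTER");
 *		// errno: ENOENT if no filter was attached,
 *		//        EPERM if the filter is locked
 */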
int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
		  unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
	if (!filter)
		goto out;

	/* We're copying the filter that was originally attached, so no
	 * conversion/decode is needed anymore. eBPF programs that carry
	 * no original classic program cannot be dumped through this
	 * interface.
	 */
	ret = -EACCES;
	fprog = filter->prog->orig_prog;
	if (!fprog)
		goto out;

	ret = fprog->len;
	if (!len)
		/* User space is only querying the number of filter blocks. */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* The API expects the return value to be the number of filter
	 * blocks, not bytes.
	 */
	ret = fprog->len;
out:
	release_sock(sk);
	return ret;
}
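
/* Illustrative user-space counterpart (a sketch under the usual sockets
 * API; not part of this file): sk_get_filter() backs
 * getsockopt(SO_GET_FILTER). Lengths are counted in filter blocks, not
 * bytes, so retrieval takes two steps: query the block count with a
 * zero optlen, then fetch into a suitably sized buffer.
 *
 *	socklen_t cnt = 0;
 *	struct sock_filter *insns;
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &cnt) < 0)
 *		return;		// e.g. EACCES: no classic original to dump
 *	insns = calloc(cnt, sizeof(*insns));
 *	if (insns &&
 *	    getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, insns, &cnt) == 0) {
 *		// insns now holds cnt blocks of the original program
 *	}
 */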