/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *	Jay Schulist <jschlst@samba.org>
 *	Alexei Starovoitov <ast@plumgrid.com>
 *	Daniel Borkmann <dborkman@redhat.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/gfp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <net/sch_generic.h>

/**
 * sk_filter - run a packet through a socket filter
 * @sk: sock associated with &sk_buff
 * @skb: buffer to filter
 *
 * Run the filter code and then cut skb->data to the correct size returned by
 * SK_RUN_FILTER. If pkt_len is 0 we toss the packet. If skb->len is smaller
 * than pkt_len we keep the whole skb->data. This is the socket level
 * wrapper to SK_RUN_FILTER. It returns 0 if the packet should
 * be accepted or -EPERM if the packet should be tossed.
 *
 */
int sk_filter(struct sock *sk, struct sk_buff *skb)
{
	int err;
	struct sk_filter *filter;

	/*
	 * If the skb was allocated from pfmemalloc reserves, only
	 * allow SOCK_MEMALLOC sockets to use it as this socket is
	 * helping free memory
	 */
	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
		return -ENOMEM;

	err = security_sock_rcv_skb(sk, skb);
	if (err)
		return err;

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter) {
		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);

		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
	}
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(sk_filter);

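/* For illustration only (user space, not part of the kernel build): a
 * program typically ends up in sk_filter() by attaching a classic BPF
 * filter with setsockopt(SO_ATTACH_FILTER). A minimal sketch, assuming a
 * socket 'fd' and the policy "accept only packets shorter than 128 bytes":
 *
 *	struct sock_filter code[] = {
 *		{ BPF_LD  | BPF_W   | BPF_LEN, 0, 0, 0 },		// A = skb->len
 *		{ BPF_JMP | BPF_JGE | BPF_K,   0, 1, 128 },		// A >= 128 ?
 *		{ BPF_RET | BPF_K,             0, 0, 0 },		// yes -> drop
 *		{ BPF_RET | BPF_K,             0, 0, 0xffffffff },	// no -> accept
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *
 * A zero return from the filter maps to the -EPERM case above.
 */
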
static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	return skb_get_poff((struct sk_buff *)(unsigned long) ctx);
}

static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx;
	struct nlattr *nla;

	if (skb_is_nonlinear(skb))
		return 0;

	if (skb->len < sizeof(struct nlattr))
		return 0;

	if (a > skb->len - sizeof(struct nlattr))
		return 0;

	nla = (struct nlattr *) &skb->data[a];
	if (nla->nla_len > skb->len - a)
		return 0;

	nla = nla_find_nested(nla, x);
	if (nla)
		return (void *) nla - (void *) skb->data;

	return 0;
}

static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	return raw_smp_processor_id();
}

/* note that this only generates 32-bit random numbers */
static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5)
{
	return prandom_u32();
}

static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
			      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (skb_field) {
	case SKF_AD_MARK:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, mark));
		break;

	case SKF_AD_PKTTYPE:
		*insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
		*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
		*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
		break;

	case SKF_AD_QUEUE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, queue_mapping));
		break;

	case SKF_AD_VLAN_TAG:
	case SKF_AD_VLAN_TAG_PRESENT:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
		BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);

		/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_tci));
		if (skb_field == SKF_AD_VLAN_TAG) {
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
						~VLAN_TAG_PRESENT);
		} else {
			/* dst_reg >>= 12 */
			*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
			/* dst_reg &= 1 */
			*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
		}
		break;
	}

	return insn - insn_buf;
}

static bool convert_bpf_extensions(struct sock_filter *fp,
				   struct bpf_insn **insnp)
{
	struct bpf_insn *insn = *insnp;
	u32 cnt;

	switch (fp->k) {
	case SKF_AD_OFF + SKF_AD_PROTOCOL:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		/* A = *(u16 *) (CTX + offsetof(protocol)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, protocol));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PKTTYPE:
		cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_IFINDEX:
	case SKF_AD_OFF + SKF_AD_HATYPE:
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
		BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0);

		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
				      BPF_REG_TMP, BPF_REG_CTX,
				      offsetof(struct sk_buff, dev));
		/* if (tmp != 0) goto pc + 1 */
		*insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
		*insn++ = BPF_EXIT_INSN();
		if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, ifindex));
		else
			*insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
					    offsetof(struct net_device, type));
		break;

	case SKF_AD_OFF + SKF_AD_MARK:
		cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_RXHASH:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
				    offsetof(struct sk_buff, hash));
		break;

	case SKF_AD_OFF + SKF_AD_QUEUE:
		cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
		cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					 BPF_REG_A, BPF_REG_CTX, insn);
		insn += cnt - 1;
		break;

	case SKF_AD_OFF + SKF_AD_VLAN_TPID:
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
		*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
				      offsetof(struct sk_buff, vlan_proto));
		/* A = ntohs(A) [emitting a nop or swap16] */
		*insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
		break;

	case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
	case SKF_AD_OFF + SKF_AD_NLATTR:
	case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
	case SKF_AD_OFF + SKF_AD_CPU:
	case SKF_AD_OFF + SKF_AD_RANDOM:
		/* arg1 = CTX */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
		/* arg2 = A */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
		/* arg3 = X */
		*insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
		/* Emit call(arg1=CTX, arg2=A, arg3=X) */
		switch (fp->k) {
		case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
			*insn = BPF_EMIT_CALL(__skb_get_pay_offset);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr);
			break;
		case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
			*insn = BPF_EMIT_CALL(__skb_get_nlattr_nest);
			break;
		case SKF_AD_OFF + SKF_AD_CPU:
			*insn = BPF_EMIT_CALL(__get_raw_cpu_id);
			break;
		case SKF_AD_OFF + SKF_AD_RANDOM:
			*insn = BPF_EMIT_CALL(__get_random_u32);
			break;
		}
		break;

	case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
		/* A ^= X */
		*insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
		break;

	default:
		/* This is just a dummy call to avoid letting the compiler
		 * evict __bpf_call_base() as an optimization. Placed here
		 * where no one bothers.
		 */
		BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
		return false;
	}

	*insnp = insn;
	return true;
}

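/* For illustration only: a classic program reads "ancillary" data via a
 * BPF_LD | BPF_ABS load from the reserved SKF_AD_OFF range, e.g.
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, SKF_AD_OFF + SKF_AD_MARK)
 *
 * which convert_bpf_extensions() above turns into an eBPF load of the
 * corresponding sk_buff field, roughly
 *
 *	BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 *		    offsetof(struct sk_buff, mark))
 *
 * Extensions without a direct field equivalent (SKF_AD_CPU, SKF_AD_RANDOM,
 * the netlink attribute lookups, ...) are emitted as calls to the small
 * helpers defined at the top of this file.
 */
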
/**
 * bpf_convert_filter - convert filter program
 * @prog: the user passed filter program
 * @len: the length of the user passed filter program
 * @new_prog: buffer where converted program will be stored
 * @new_len: pointer to store length of converted program
 *
 * Remap 'sock_filter' style classic BPF instructions to 'bpf_insn' style
 * eBPF instructions. Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remaps:
 *   new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
 *
 * User BPF's register A is mapped to our BPF register 6, user BPF
 * register X is mapped to BPF register 7; frame pointer is always
 * register 10; Context 'void *ctx' is stored in register 1, that is,
 * for socket filters: ctx == 'struct sk_buff *', for seccomp:
 * ctx == 'struct seccomp_data *'.
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
			      struct bpf_insn *new_prog, int *new_len)
{
	int new_flen = 0, pass = 0, target, i;
	struct bpf_insn *new_insn;
	struct sock_filter *fp;
	int *addrs = NULL;
	u8 bpf_src;

	BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
	BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

	if (len <= 0 || len > BPF_MAXINSNS)
		return -EINVAL;

	if (new_prog) {
		addrs = kcalloc(len, sizeof(*addrs),
				GFP_KERNEL | __GFP_NOWARN);
		if (!addrs)
			return -ENOMEM;
	}

do_pass:
	new_insn = new_prog;
	fp = prog;

	if (new_insn)
		*new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
	new_insn++;

	for (i = 0; i < len; fp++, i++) {
		struct bpf_insn tmp_insns[6] = { };
		struct bpf_insn *insn = tmp_insns;

		if (addrs)
			addrs[i] = new_insn - new_prog;

		switch (fp->code) {
		/* All arithmetic insns and skb loads map as-is. */
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_X:
		case BPF_ALU | BPF_MOD | BPF_K:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_ABS | BPF_W:
		case BPF_LD | BPF_ABS | BPF_H:
		case BPF_LD | BPF_ABS | BPF_B:
		case BPF_LD | BPF_IND | BPF_W:
		case BPF_LD | BPF_IND | BPF_H:
		case BPF_LD | BPF_IND | BPF_B:
			/* Check for overloaded BPF extension and
			 * directly convert it if found, otherwise
			 * just move on with mapping.
			 */
			if (BPF_CLASS(fp->code) == BPF_LD &&
			    BPF_MODE(fp->code) == BPF_ABS &&
			    convert_bpf_extensions(fp, &insn))
				break;

			*insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
			break;

		/* Jump transformation cannot use BPF block macros
		 * everywhere as offset calculation and target updates
		 * require a bit more work than the rest, i.e. jump
		 * opcodes map as-is, but offsets need adjustment.
		 */

#define BPF_EMIT_JMP							\
	do {								\
		if (target >= len || target < 0)			\
			goto err;					\
		insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0;	\
		/* Adjust pc relative offset for 2nd or 3rd insn. */	\
		insn->off -= insn - tmp_insns;				\
	} while (0)

		case BPF_JMP | BPF_JA:
			target = i + fp->k + 1;
			insn->code = fp->code;
			BPF_EMIT_JMP;
			break;

		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
			if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
				/* BPF immediates are signed, zero extend
				 * immediate into tmp register and use it
				 * in compare insn.
				 */
				*insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

				insn->dst_reg = BPF_REG_A;
				insn->src_reg = BPF_REG_TMP;
				bpf_src = BPF_X;
			} else {
				insn->dst_reg = BPF_REG_A;
				insn->src_reg = BPF_REG_X;
				insn->imm = fp->k;
				bpf_src = BPF_SRC(fp->code);
			}

			/* Common case where 'jump_false' is next insn. */
			if (fp->jf == 0) {
				insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
				target = i + fp->jt + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Convert JEQ into JNE when 'jump_true' is next insn. */
			if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) {
				insn->code = BPF_JMP | BPF_JNE | bpf_src;
				target = i + fp->jf + 1;
				BPF_EMIT_JMP;
				break;
			}

			/* Other jumps are mapped into two insns: Jxx and JA. */
			target = i + fp->jt + 1;
			insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
			BPF_EMIT_JMP;
			insn++;

			insn->code = BPF_JMP | BPF_JA;
			target = i + fp->jf + 1;
			BPF_EMIT_JMP;
			break;

		/* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
		case BPF_LDX | BPF_MSH | BPF_B:
			/* tmp = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A);
			/* A = BPF_R0 = *(u8 *) (skb->data + K) */
			*insn++ = BPF_LD_ABS(BPF_B, fp->k);
			/* A &= 0xf */
			*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
			/* A <<= 2 */
			*insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
			/* X = A */
			*insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			/* A = tmp */
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
			break;

		/* RET_K, RET_A are remapped into 2 insns. */
		case BPF_RET | BPF_A:
		case BPF_RET | BPF_K:
			*insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
						BPF_K : BPF_X, BPF_REG_0,
						BPF_REG_A, fp->k);
			*insn = BPF_EXIT_INSN();
			break;

		/* Store to stack. */
		case BPF_ST:
		case BPF_STX:
			*insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
					    BPF_ST ? BPF_REG_A : BPF_REG_X,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* Load from stack. */
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* A = K or X = K */
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
					      BPF_REG_A : BPF_REG_X, fp->k);
			break;

		/* X = A */
		case BPF_MISC | BPF_TAX:
			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			break;

		/* A = X */
		case BPF_MISC | BPF_TXA:
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
			break;

		/* A = skb->len or X = skb->len */
		case BPF_LD | BPF_W | BPF_LEN:
		case BPF_LDX | BPF_W | BPF_LEN:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
					    offsetof(struct sk_buff, len));
			break;

		/* Access seccomp_data fields. */
		case BPF_LDX | BPF_ABS | BPF_W:
			/* A = *(u32 *) (ctx + K) */
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
			break;

		/* Unknown instruction. */
		default:
			goto err;
		}

		insn++;
		if (new_prog)
			memcpy(new_insn, tmp_insns,
			       sizeof(*insn) * (insn - tmp_insns));
		new_insn += insn - tmp_insns;
	}

	if (!new_prog) {
		/* Only calculating new length. */
		*new_len = new_insn - new_prog;
		return 0;
	}

	pass++;
	if (new_flen != new_insn - new_prog) {
		new_flen = new_insn - new_prog;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	return -EINVAL;
}

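/* For illustration only, a two-instruction classic program such as
 *
 *	BPF_STMT(BPF_LD | BPF_W | BPF_LEN, 0)		// A = skb->len
 *	BPF_STMT(BPF_RET | BPF_A, 0)			// return A
 *
 * is remapped by bpf_convert_filter() above into roughly
 *
 *	BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1)	// prologue: CTX = arg1
 *	BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
 *		    offsetof(struct sk_buff, len))	// A = skb->len
 *	BPF_MOV32_RAW(BPF_X, BPF_REG_0, BPF_REG_A, 0)	// R0 = A
 *	BPF_EXIT_INSN()					// return R0
 */
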
/* Security:
 *
 * As we don't want to clear the mem[] array for each packet going through
 * __bpf_prog_run(), we check that the filter loaded by the user never tries
 * to read a cell that was not previously written, and we check all branches
 * to be sure a malicious user doesn't try to abuse us.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
	u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */
	int pc, ret = 0;

	BUILD_BUG_ON(BPF_MEMWORDS > 16);

	masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		return -ENOMEM;

	memset(masks, 0xff, flen * sizeof(*masks));

	for (pc = 0; pc < flen; pc++) {
		memvalid &= masks[pc];

		switch (filter[pc].code) {
		case BPF_ST:
		case BPF_STX:
			memvalid |= (1 << filter[pc].k);
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
			if (!(memvalid & (1 << filter[pc].k))) {
				ret = -EINVAL;
				goto error;
			}
			break;
		case BPF_JMP | BPF_JA:
			/* A jump must set masks on target */
			masks[pc + 1 + filter[pc].k] &= memvalid;
			memvalid = ~0;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* A jump must set masks on targets */
			masks[pc + 1 + filter[pc].jt] &= memvalid;
			masks[pc + 1 + filter[pc].jf] &= memvalid;
			memvalid = ~0;
			break;
		}
	}
error:
	kfree(masks);
	return ret;
}

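/* For illustration only: check_load_and_stores() above is what rejects a
 * program that reads a scratch cell before ever writing it, e.g.
 *
 *	BPF_STMT(BPF_LD | BPF_MEM, 0)		// A = mem[0], never initialized
 *	BPF_STMT(BPF_RET | BPF_A, 0)
 *
 * Attaching such a filter fails with -EINVAL, whereas storing to mem[0]
 * first (BPF_ST with k == 0) would make the same load acceptable.
 */
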
static bool chk_code_allowed(u16 code_to_probe)
{
	static const bool codes[] = {
		/* 32 bit ALU operations */
		[BPF_ALU | BPF_ADD | BPF_K] = true,
		[BPF_ALU | BPF_ADD | BPF_X] = true,
		[BPF_ALU | BPF_SUB | BPF_K] = true,
		[BPF_ALU | BPF_SUB | BPF_X] = true,
		[BPF_ALU | BPF_MUL | BPF_K] = true,
		[BPF_ALU | BPF_MUL | BPF_X] = true,
		[BPF_ALU | BPF_DIV | BPF_K] = true,
		[BPF_ALU | BPF_DIV | BPF_X] = true,
		[BPF_ALU | BPF_MOD | BPF_K] = true,
		[BPF_ALU | BPF_MOD | BPF_X] = true,
		[BPF_ALU | BPF_AND | BPF_K] = true,
		[BPF_ALU | BPF_AND | BPF_X] = true,
		[BPF_ALU | BPF_OR | BPF_K] = true,
		[BPF_ALU | BPF_OR | BPF_X] = true,
		[BPF_ALU | BPF_XOR | BPF_K] = true,
		[BPF_ALU | BPF_XOR | BPF_X] = true,
		[BPF_ALU | BPF_LSH | BPF_K] = true,
		[BPF_ALU | BPF_LSH | BPF_X] = true,
		[BPF_ALU | BPF_RSH | BPF_K] = true,
		[BPF_ALU | BPF_RSH | BPF_X] = true,
		[BPF_ALU | BPF_NEG] = true,
		/* Load instructions */
		[BPF_LD | BPF_W | BPF_ABS] = true,
		[BPF_LD | BPF_H | BPF_ABS] = true,
		[BPF_LD | BPF_B | BPF_ABS] = true,
		[BPF_LD | BPF_W | BPF_LEN] = true,
		[BPF_LD | BPF_W | BPF_IND] = true,
		[BPF_LD | BPF_H | BPF_IND] = true,
		[BPF_LD | BPF_B | BPF_IND] = true,
		[BPF_LD | BPF_IMM] = true,
		[BPF_LD | BPF_MEM] = true,
		[BPF_LDX | BPF_W | BPF_LEN] = true,
		[BPF_LDX | BPF_B | BPF_MSH] = true,
		[BPF_LDX | BPF_IMM] = true,
		[BPF_LDX | BPF_MEM] = true,
		/* Store instructions */
		[BPF_ST] = true,
		[BPF_STX] = true,
		/* Misc instructions */
		[BPF_MISC | BPF_TAX] = true,
		[BPF_MISC | BPF_TXA] = true,
		/* Return instructions */
		[BPF_RET | BPF_K] = true,
		[BPF_RET | BPF_A] = true,
		/* Jump instructions */
		[BPF_JMP | BPF_JA] = true,
		[BPF_JMP | BPF_JEQ | BPF_K] = true,
		[BPF_JMP | BPF_JEQ | BPF_X] = true,
		[BPF_JMP | BPF_JGE | BPF_K] = true,
		[BPF_JMP | BPF_JGE | BPF_X] = true,
		[BPF_JMP | BPF_JGT | BPF_K] = true,
		[BPF_JMP | BPF_JGT | BPF_X] = true,
		[BPF_JMP | BPF_JSET | BPF_K] = true,
		[BPF_JMP | BPF_JSET | BPF_X] = true,
	};

	if (code_to_probe >= ARRAY_SIZE(codes))
		return false;

	return codes[code_to_probe];
}

/**
 * bpf_check_classic - verify socket filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Check the user's filter code. If we let some ugly
 * filter code slip through, kaboom! The filter must contain
 * no references or jumps that are out of range, no illegal
 * instructions, and must end with a RET instruction.
 *
 * All jumps are forward as they are not signed.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int bpf_check_classic(const struct sock_filter *filter,
			     unsigned int flen)
{
	bool anc_found;
	int pc;

	if (flen == 0 || flen > BPF_MAXINSNS)
		return -EINVAL;

	/* Check the filter code now */
	for (pc = 0; pc < flen; pc++) {
		const struct sock_filter *ftest = &filter[pc];

		/* May we actually operate on this code? */
		if (!chk_code_allowed(ftest->code))
			return -EINVAL;

		/* Some instructions need special checks */
		switch (ftest->code) {
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_MOD | BPF_K:
			/* Check for division by zero */
			if (ftest->k == 0)
				return -EINVAL;
			break;
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
			/* Check for invalid memory addresses */
			if (ftest->k >= BPF_MEMWORDS)
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JA:
			/* Note, the large ftest->k might cause loops.
			 * Compare this with conditional jumps below,
			 * where offsets are limited. --ANK (981016)
			 */
			if (ftest->k >= (unsigned int)(flen - pc - 1))
				return -EINVAL;
			break;
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			/* Both conditionals must be safe */
			if (pc + ftest->jt + 1 >= flen ||
			    pc + ftest->jf + 1 >= flen)
				return -EINVAL;
			break;
		case BPF_LD | BPF_W | BPF_ABS:
		case BPF_LD | BPF_H | BPF_ABS:
		case BPF_LD | BPF_B | BPF_ABS:
			anc_found = false;
			if (bpf_anc_helper(ftest) & BPF_ANC)
				anc_found = true;
			/* Ancillary operation unknown or unsupported */
			if (anc_found == false && ftest->k >= SKF_AD_OFF)
				return -EINVAL;
		}
	}

	/* Last instruction must be a RET code */
	switch (filter[flen - 1].code) {
	case BPF_RET | BPF_K:
	case BPF_RET | BPF_A:
		return check_load_and_stores(filter, flen);
	}

	return -EINVAL;
}

static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
				      const struct sock_fprog *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct sock_fprog_kern *fkprog;

	fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
	if (!fp->orig_prog)
		return -ENOMEM;

	fkprog = fp->orig_prog;
	fkprog->len = fprog->len;

	fkprog->filter = kmemdup(fp->insns, fsize,
				 GFP_KERNEL | __GFP_NOWARN);
	if (!fkprog->filter) {
		kfree(fp->orig_prog);
		return -ENOMEM;
	}

	return 0;
}

static void bpf_release_orig_filter(struct bpf_prog *fp)
{
	struct sock_fprog_kern *fprog = fp->orig_prog;

	if (fprog) {
		kfree(fprog->filter);
		kfree(fprog);
	}
}

static void __bpf_prog_release(struct bpf_prog *prog)
{
	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
	} else {
		bpf_release_orig_filter(prog);
		bpf_prog_free(prog);
	}
}

static void __sk_filter_release(struct sk_filter *fp)
{
	__bpf_prog_release(fp->prog);
	kfree(fp);
}

/**
 * sk_filter_release_rcu - Release a socket filter by rcu_head
 * @rcu: rcu_head that contains the sk_filter to free
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);

	__sk_filter_release(fp);
}

/**
 * sk_filter_release - release a socket filter
 * @fp: filter to remove
 *
 * Remove a filter from a socket and release its resources.
 */
static void sk_filter_release(struct sk_filter *fp)
{
	if (atomic_dec_and_test(&fp->refcnt))
		call_rcu(&fp->rcu, sk_filter_release_rcu);
}

void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	atomic_sub(filter_size, &sk->sk_omem_alloc);
	sk_filter_release(fp);
}

/* try to charge the socket memory if there is space available
 * return true on success
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
	u32 filter_size = bpf_prog_size(fp->prog->len);

	/* same check as in sock_kmalloc() */
	if (filter_size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
		atomic_inc(&fp->refcnt);
		atomic_add(filter_size, &sk->sk_omem_alloc);
		return true;
	}
	return false;
}

static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
	struct sock_filter *old_prog;
	struct bpf_prog *old_fp;
	int err, new_len, old_len = fp->len;

	/* We are free to overwrite insns et al right here as it
	 * won't be used at this point in time anymore internally
	 * after the migration to the internal BPF instruction
	 * representation.
	 */
	BUILD_BUG_ON(sizeof(struct sock_filter) !=
		     sizeof(struct bpf_insn));

	/* Conversion cannot happen on overlapping memory areas,
	 * so we need to keep the user BPF around until the 2nd
	 * pass. At this time, the user BPF is stored in fp->insns.
	 */
	old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
			   GFP_KERNEL | __GFP_NOWARN);
	if (!old_prog) {
		err = -ENOMEM;
		goto out_err;
	}

	/* 1st pass: calculate the new program length. */
	err = bpf_convert_filter(old_prog, old_len, NULL, &new_len);
	if (err)
		goto out_err_free;

	/* Expand fp for appending the new filter representation. */
	old_fp = fp;
	fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
	if (!fp) {
		/* The old_fp is still around in case we couldn't
		 * allocate new memory, so uncharge on that one.
		 */
		fp = old_fp;
		err = -ENOMEM;
		goto out_err_free;
	}

	fp->len = new_len;

	/* 2nd pass: remap sock_filter insns into bpf_insn insns. */
	err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
	if (err)
		/* 2nd bpf_convert_filter() can fail only if it fails
		 * to allocate memory, remapping must succeed. Note
		 * that at this time old_fp has already been released
		 * by krealloc().
		 */
		goto out_err_free;

	bpf_prog_select_runtime(fp);

	kfree(old_prog);
	return fp;

out_err_free:
	kfree(old_prog);
out_err:
	__bpf_prog_release(fp);
	return ERR_PTR(err);
}

static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
					   bpf_aux_classic_check_t trans)
{
	int err;

	fp->bpf_func = NULL;
	fp->jited = false;

	err = bpf_check_classic(fp->insns, fp->len);
	if (err) {
		__bpf_prog_release(fp);
		return ERR_PTR(err);
	}

	/* There might be additional checks and transformations
	 * needed on classic filters, e.g. in the case of seccomp.
	 */
	if (trans) {
		err = trans(fp->insns, fp->len);
		if (err) {
			__bpf_prog_release(fp);
			return ERR_PTR(err);
		}
	}

	/* Probe if we can JIT compile the filter and if so, do
	 * the compilation of the filter.
	 */
	bpf_jit_compile(fp);

	/* JIT compiler couldn't process this filter, so do the
	 * internal BPF translation for the optimized interpreter.
	 */
	if (!fp->jited)
		fp = bpf_migrate_filter(fp);

	return fp;
}

/**
 * bpf_prog_create - create an unattached filter
 * @pfp: the unattached filter that is created
 * @fprog: the filter program
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure the new filter is there and of a sensible size. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	memcpy(fp->insns, fprog->filter, fsize);

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, NULL);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);

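/* For illustration only, a minimal sketch of an in-kernel caller (the
 * surrounding variables are made up, the API calls are the ones defined
 * here): build a classic program that accepts every packet, create an
 * unattached filter from it and drop it again:
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *	struct sock_fprog_kern fprog = {
 *		.len	= ARRAY_SIZE(insns),
 *		.filter	= insns,
 *	};
 *	struct bpf_prog *prog;
 *	int err;
 *
 *	err = bpf_prog_create(&prog, &fprog);
 *	if (!err)
 *		bpf_prog_destroy(prog);
 *
 * Real users (e.g. the PTP packet classifier) keep the prog around and run
 * it against skbs.
 */
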
/**
 * bpf_prog_create_from_user - create an unattached filter from user buffer
 * @pfp: the unattached filter that is created
 * @fprog: the filter program
 * @trans: post-classic verifier transformation handler
 *
 * This function effectively does the same as bpf_prog_create(), only
 * that it builds up its insns buffer from user space provided buffer.
 * It also allows for passing a bpf_aux_classic_check_t handler.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
			      bpf_aux_classic_check_t trans)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure the new filter is there and of a sensible size. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}

void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;
	atomic_set(&fp->refcnt, 0);

	if (!sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}

	old_fp = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}

/**
 * sk_attach_filter - attach a socket filter
 * @fprog: the filter program
 * @sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	unsigned int fsize = bpf_classic_proglen(fprog);
	unsigned int bpf_fsize = bpf_prog_size(fprog->len);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	/* Make sure the new filter is there and of a sensible size. */
	if (fprog->filter == NULL)
		return -EINVAL;

	prog = bpf_prog_alloc(bpf_fsize, 0);
	if (!prog)
		return -ENOMEM;

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return -EFAULT;
	}

	prog->len = fprog->len;

	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return -ENOMEM;
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	prog = bpf_prepare_filter(prog, NULL);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	prog = bpf_prog_get(ufd);
	if (IS_ERR(prog))
		return PTR_ERR(prog);

	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
		return -EINVAL;
	}

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

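/* For illustration only (user space, not part of the kernel build):
 * sk_attach_bpf() is reached via the SO_ATTACH_BPF socket option, which
 * takes a file descriptor of an already loaded eBPF program instead of a
 * classic instruction array, roughly:
 *
 *	int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 *	...
 *	setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_BPF,
 *		   &prog_fd, sizeof(prog_fd));
 *
 * where 'attr' describes the eBPF program; any program type other than
 * BPF_PROG_TYPE_SOCKET_FILTER is rejected above with -EINVAL.
 */
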
#define BPF_RECOMPUTE_CSUM(flags)	((flags) & 1)

static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	int offset = (int) r2;
	void *from = (void *) (long) r3;
	unsigned int len = (unsigned int) r4;
	char buf[16];
	void *ptr;

	/* bpf verifier guarantees that:
	 * 'from' pointer points to bpf program stack
	 * 'len' bytes of it were initialized
	 * 'len' > 0
	 * 'skb' is a valid pointer to 'struct sk_buff'
	 *
	 * so check for invalid 'offset' and too large 'len'
	 */
	if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
		return -EFAULT;

	if (unlikely(skb_cloned(skb) &&
		     !skb_clone_writable(skb, offset + len)))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, len, buf);
	if (unlikely(!ptr))
		return -EFAULT;

	if (BPF_RECOMPUTE_CSUM(flags))
		skb_postpull_rcsum(skb, ptr, len);

	memcpy(ptr, from, len);

	if (ptr == buf)
		/* skb_store_bits cannot return -EFAULT here */
		skb_store_bits(skb, offset, ptr, len);

	if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE)
		skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));
	return 0;
}

const struct bpf_func_proto bpf_skb_store_bytes_proto = {
	.func		= bpf_skb_store_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_STACK,
	.arg4_type	= ARG_CONST_STACK_SIZE,
	.arg5_type	= ARG_ANYTHING,
};

#define BPF_HEADER_FIELD_SIZE(flags)	((flags) & 0x0f)
#define BPF_IS_PSEUDO_HEADER(flags)	((flags) & 0x10)

static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	int offset = (int) r2;
	__sum16 sum, *ptr;

	if (unlikely((u32) offset > 0xffff))
		return -EFAULT;

	if (unlikely(skb_cloned(skb) &&
		     !skb_clone_writable(skb, offset + sizeof(sum))))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
	if (unlikely(!ptr))
		return -EFAULT;

	switch (BPF_HEADER_FIELD_SIZE(flags)) {
	case 2:
		csum_replace2(ptr, from, to);
		break;
	case 4:
		csum_replace4(ptr, from, to);
		break;
	default:
		return -EINVAL;
	}

	if (ptr == &sum)
		/* skb_store_bits guaranteed to not return -EFAULT here */
		skb_store_bits(skb, offset, ptr, sizeof(sum));

	return 0;
}

const struct bpf_func_proto bpf_l3_csum_replace_proto = {
	.func		= bpf_l3_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags);
	int offset = (int) r2;
	__sum16 sum, *ptr;

	if (unlikely((u32) offset > 0xffff))
		return -EFAULT;

	if (unlikely(skb_cloned(skb) &&
		     !skb_clone_writable(skb, offset + sizeof(sum))))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
	if (unlikely(!ptr))
		return -EFAULT;

	switch (BPF_HEADER_FIELD_SIZE(flags)) {
	case 2:
		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
		break;
	case 4:
		inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
		break;
	default:
		return -EINVAL;
	}

	if (ptr == &sum)
		/* skb_store_bits guaranteed to not return -EFAULT here */
		skb_store_bits(skb, offset, ptr, sizeof(sum));

	return 0;
}

const struct bpf_func_proto bpf_l4_csum_replace_proto = {
	.func		= bpf_l4_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

#define BPF_IS_REDIRECT_INGRESS(flags)	((flags) & 1)

static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
	struct net_device *dev;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
	if (unlikely(!dev))
		return -EINVAL;

	if (unlikely(!(dev->flags & IFF_UP)))
		return -EINVAL;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (unlikely(!skb2))
		return -ENOMEM;

	if (BPF_IS_REDIRECT_INGRESS(flags))
		return dev_forward_skb(dev, skb2);

	skb2->dev = dev;
	return dev_queue_xmit(skb2);
}

const struct bpf_func_proto bpf_clone_redirect_proto = {
	.func		= bpf_clone_redirect,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
};

static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_trace_printk:
		return bpf_get_trace_printk_proto();
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_store_bytes:
		return &bpf_skb_store_bytes_proto;
	case BPF_FUNC_l3_csum_replace:
		return &bpf_l3_csum_replace_proto;
	case BPF_FUNC_l4_csum_replace:
		return &bpf_l4_csum_replace_proto;
	case BPF_FUNC_clone_redirect:
		return &bpf_clone_redirect_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}

static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
	/* check bounds */
	if (off < 0 || off >= sizeof(struct __sk_buff))
		return false;

	/* disallow misaligned access */
	if (off % size != 0)
		return false;

	/* all __sk_buff fields are __u32 */
	if (size != 4)
		return false;

	return true;
}

static bool sk_filter_is_valid_access(int off, int size,
				      enum bpf_access_type type)
{
	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}

	return __is_valid_access(off, size, type);
}

static bool tc_cls_act_is_valid_access(int off, int size,
				       enum bpf_access_type type)
{
	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, mark):
		case offsetof(struct __sk_buff, tc_index):
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}
	return __is_valid_access(off, size, type);
}

static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				      int src_reg, int ctx_off,
				      struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, len):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, len));
		break;

	case offsetof(struct __sk_buff, protocol):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, protocol));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_proto));
		break;

	case offsetof(struct __sk_buff, priority):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, priority));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, skb_iif));
		break;

	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;

	case offsetof(struct __sk_buff, mark):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, queue_mapping):
		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_present):
		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_tci):
		return convert_skb_access(SKF_AD_VLAN_TAG,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetof(struct __sk_buff, cb[4]):
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);

		ctx_off -= offsetof(struct __sk_buff, cb[0]);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		break;
#else
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
		break;
#endif
	}

	return insn - insn_buf;
}

static const struct bpf_verifier_ops sk_filter_ops = {
	.get_func_proto = sk_filter_func_proto,
	.is_valid_access = sk_filter_is_valid_access,
	.convert_ctx_access = bpf_net_convert_ctx_access,
};

static const struct bpf_verifier_ops tc_cls_act_ops = {
	.get_func_proto = tc_cls_act_func_proto,
	.is_valid_access = tc_cls_act_is_valid_access,
	.convert_ctx_access = bpf_net_convert_ctx_access,
};

static struct bpf_prog_type_list sk_filter_type __read_mostly = {
	.ops = &sk_filter_ops,
	.type = BPF_PROG_TYPE_SOCKET_FILTER,
};

static struct bpf_prog_type_list sched_cls_type __read_mostly = {
	.ops = &tc_cls_act_ops,
	.type = BPF_PROG_TYPE_SCHED_CLS,
};

static struct bpf_prog_type_list sched_act_type __read_mostly = {
	.ops = &tc_cls_act_ops,
	.type = BPF_PROG_TYPE_SCHED_ACT,
};

static int __init register_sk_filter_ops(void)
{
	bpf_register_prog_type(&sk_filter_type);
	bpf_register_prog_type(&sched_cls_type);
	bpf_register_prog_type(&sched_act_type);

	return 0;
}
late_initcall(register_sk_filter_ops);

int sk_detach_filter(struct sock *sk)
{
	int ret = -ENOENT;
	struct sk_filter *filter;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return -EPERM;

	filter = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
	if (filter) {
		RCU_INIT_POINTER(sk->sk_filter, NULL);
		sk_filter_uncharge(sk, filter);
		ret = 0;
	}

	return ret;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
		  unsigned int len)
{
	struct sock_fprog_kern *fprog;
	struct sk_filter *filter;
	int ret = 0;

	lock_sock(sk);
	filter = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
	if (!filter)
		goto out;

	/* We're copying the filter that has been originally attached,
	 * so no conversion/decode needed anymore.
	 */
	fprog = filter->prog->orig_prog;

	ret = fprog->len;
	if (!len)
		/* User space only asks for the number of filter blocks. */
		goto out;

	ret = -EINVAL;
	if (len < fprog->len)
		goto out;

	ret = -EFAULT;
	if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
		goto out;

	/* Instead of bytes, the API requests to return the number
	 * of filter blocks.
	 */
	ret = fprog->len;
out:
	release_sock(sk);
	return ret;
}
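
/* For illustration only (user space): sk_get_filter() backs the
 * SO_GET_FILTER getsockopt(). Note that 'len' and the return value are
 * counted in filter blocks rather than bytes, a historical quirk of this
 * interface. The usual pattern is to query the block count first and then
 * fetch the originally attached program, roughly:
 *
 *	socklen_t cnt = 0;
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, NULL, &cnt);
 *
 *	struct sock_filter *code = calloc(cnt, sizeof(*code));
 *	getsockopt(fd, SOL_SOCKET, SO_GET_FILTER, code, &cnt);
 *
 * A non-zero count smaller than the attached program yields -EINVAL, as
 * implemented above.
 */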