1 /* 2 * Linux Socket Filter - Kernel level socket filtering 3 * 4 * Based on the design of the Berkeley Packet Filter. The new 5 * internal format has been designed by PLUMgrid: 6 * 7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com 8 * 9 * Authors: 10 * 11 * Jay Schulist <jschlst@samba.org> 12 * Alexei Starovoitov <ast@plumgrid.com> 13 * Daniel Borkmann <dborkman@redhat.com> 14 * 15 * This program is free software; you can redistribute it and/or 16 * modify it under the terms of the GNU General Public License 17 * as published by the Free Software Foundation; either version 18 * 2 of the License, or (at your option) any later version. 19 * 20 * Andi Kleen - Fix a few bad bugs and races. 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 22 */ 23 24 #include <linux/module.h> 25 #include <linux/types.h> 26 #include <linux/mm.h> 27 #include <linux/fcntl.h> 28 #include <linux/socket.h> 29 #include <linux/in.h> 30 #include <linux/inet.h> 31 #include <linux/netdevice.h> 32 #include <linux/if_packet.h> 33 #include <linux/gfp.h> 34 #include <net/ip.h> 35 #include <net/protocol.h> 36 #include <net/netlink.h> 37 #include <linux/skbuff.h> 38 #include <net/sock.h> 39 #include <net/flow_dissector.h> 40 #include <linux/errno.h> 41 #include <linux/timer.h> 42 #include <asm/uaccess.h> 43 #include <asm/unaligned.h> 44 #include <linux/filter.h> 45 #include <linux/ratelimit.h> 46 #include <linux/seccomp.h> 47 #include <linux/if_vlan.h> 48 #include <linux/bpf.h> 49 #include <net/sch_generic.h> 50 #include <net/cls_cgroup.h> 51 #include <net/dst_metadata.h> 52 #include <net/dst.h> 53 #include <net/sock_reuseport.h> 54 55 /** 56 * sk_filter - run a packet through a socket filter 57 * @sk: sock associated with &sk_buff 58 * @skb: buffer to filter 59 * 60 * Run the eBPF program and then cut skb->data to correct size returned by 61 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller 62 * than pkt_len we keep whole skb->data. 
This is the socket level 63 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should 64 * be accepted or -EPERM if the packet should be tossed. 65 * 66 */ 67 int sk_filter(struct sock *sk, struct sk_buff *skb) 68 { 69 int err; 70 struct sk_filter *filter; 71 72 /* 73 * If the skb was allocated from pfmemalloc reserves, only 74 * allow SOCK_MEMALLOC sockets to use it as this socket is 75 * helping free memory 76 */ 77 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) 78 return -ENOMEM; 79 80 err = security_sock_rcv_skb(sk, skb); 81 if (err) 82 return err; 83 84 rcu_read_lock(); 85 filter = rcu_dereference(sk->sk_filter); 86 if (filter) { 87 unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); 88 89 err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; 90 } 91 rcu_read_unlock(); 92 93 return err; 94 } 95 EXPORT_SYMBOL(sk_filter); 96 97 static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 98 { 99 return skb_get_poff((struct sk_buff *)(unsigned long) ctx); 100 } 101 102 static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 103 { 104 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; 105 struct nlattr *nla; 106 107 if (skb_is_nonlinear(skb)) 108 return 0; 109 110 if (skb->len < sizeof(struct nlattr)) 111 return 0; 112 113 if (a > skb->len - sizeof(struct nlattr)) 114 return 0; 115 116 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); 117 if (nla) 118 return (void *) nla - (void *) skb->data; 119 120 return 0; 121 } 122 123 static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 124 { 125 struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; 126 struct nlattr *nla; 127 128 if (skb_is_nonlinear(skb)) 129 return 0; 130 131 if (skb->len < sizeof(struct nlattr)) 132 return 0; 133 134 if (a > skb->len - sizeof(struct nlattr)) 135 return 0; 136 137 nla = (struct nlattr *) &skb->data[a]; 138 if (nla->nla_len > skb->len - a) 139 return 0; 140 141 nla = nla_find_nested(nla, x); 142 
if (nla) 143 return (void *) nla - (void *) skb->data; 144 145 return 0; 146 } 147 148 static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) 149 { 150 return raw_smp_processor_id(); 151 } 152 153 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, 154 struct bpf_insn *insn_buf) 155 { 156 struct bpf_insn *insn = insn_buf; 157 158 switch (skb_field) { 159 case SKF_AD_MARK: 160 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 161 162 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 163 offsetof(struct sk_buff, mark)); 164 break; 165 166 case SKF_AD_PKTTYPE: 167 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); 168 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); 169 #ifdef __BIG_ENDIAN_BITFIELD 170 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); 171 #endif 172 break; 173 174 case SKF_AD_QUEUE: 175 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); 176 177 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 178 offsetof(struct sk_buff, queue_mapping)); 179 break; 180 181 case SKF_AD_VLAN_TAG: 182 case SKF_AD_VLAN_TAG_PRESENT: 183 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); 184 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); 185 186 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ 187 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 188 offsetof(struct sk_buff, vlan_tci)); 189 if (skb_field == SKF_AD_VLAN_TAG) { 190 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 191 ~VLAN_TAG_PRESENT); 192 } else { 193 /* dst_reg >>= 12 */ 194 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); 195 /* dst_reg &= 1 */ 196 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); 197 } 198 break; 199 } 200 201 return insn - insn_buf; 202 } 203 204 static bool convert_bpf_extensions(struct sock_filter *fp, 205 struct bpf_insn **insnp) 206 { 207 struct bpf_insn *insn = *insnp; 208 u32 cnt; 209 210 switch (fp->k) { 211 case SKF_AD_OFF + SKF_AD_PROTOCOL: 212 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 213 214 /* A = 
*(u16 *) (CTX + offsetof(protocol)) */ 215 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 216 offsetof(struct sk_buff, protocol)); 217 /* A = ntohs(A) [emitting a nop or swap16] */ 218 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 219 break; 220 221 case SKF_AD_OFF + SKF_AD_PKTTYPE: 222 cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); 223 insn += cnt - 1; 224 break; 225 226 case SKF_AD_OFF + SKF_AD_IFINDEX: 227 case SKF_AD_OFF + SKF_AD_HATYPE: 228 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 229 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); 230 BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); 231 232 *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), 233 BPF_REG_TMP, BPF_REG_CTX, 234 offsetof(struct sk_buff, dev)); 235 /* if (tmp != 0) goto pc + 1 */ 236 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); 237 *insn++ = BPF_EXIT_INSN(); 238 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) 239 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, 240 offsetof(struct net_device, ifindex)); 241 else 242 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, 243 offsetof(struct net_device, type)); 244 break; 245 246 case SKF_AD_OFF + SKF_AD_MARK: 247 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); 248 insn += cnt - 1; 249 break; 250 251 case SKF_AD_OFF + SKF_AD_RXHASH: 252 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 253 254 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, 255 offsetof(struct sk_buff, hash)); 256 break; 257 258 case SKF_AD_OFF + SKF_AD_QUEUE: 259 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); 260 insn += cnt - 1; 261 break; 262 263 case SKF_AD_OFF + SKF_AD_VLAN_TAG: 264 cnt = convert_skb_access(SKF_AD_VLAN_TAG, 265 BPF_REG_A, BPF_REG_CTX, insn); 266 insn += cnt - 1; 267 break; 268 269 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: 270 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 271 BPF_REG_A, BPF_REG_CTX, 
insn); 272 insn += cnt - 1; 273 break; 274 275 case SKF_AD_OFF + SKF_AD_VLAN_TPID: 276 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 277 278 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ 279 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 280 offsetof(struct sk_buff, vlan_proto)); 281 /* A = ntohs(A) [emitting a nop or swap16] */ 282 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 283 break; 284 285 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 286 case SKF_AD_OFF + SKF_AD_NLATTR: 287 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 288 case SKF_AD_OFF + SKF_AD_CPU: 289 case SKF_AD_OFF + SKF_AD_RANDOM: 290 /* arg1 = CTX */ 291 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); 292 /* arg2 = A */ 293 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); 294 /* arg3 = X */ 295 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); 296 /* Emit call(arg1=CTX, arg2=A, arg3=X) */ 297 switch (fp->k) { 298 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 299 *insn = BPF_EMIT_CALL(__skb_get_pay_offset); 300 break; 301 case SKF_AD_OFF + SKF_AD_NLATTR: 302 *insn = BPF_EMIT_CALL(__skb_get_nlattr); 303 break; 304 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 305 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); 306 break; 307 case SKF_AD_OFF + SKF_AD_CPU: 308 *insn = BPF_EMIT_CALL(__get_raw_cpu_id); 309 break; 310 case SKF_AD_OFF + SKF_AD_RANDOM: 311 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); 312 bpf_user_rnd_init_once(); 313 break; 314 } 315 break; 316 317 case SKF_AD_OFF + SKF_AD_ALU_XOR_X: 318 /* A ^= X */ 319 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); 320 break; 321 322 default: 323 /* This is just a dummy call to avoid letting the compiler 324 * evict __bpf_call_base() as an optimization. Placed here 325 * where no-one bothers. 
326 */ 327 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); 328 return false; 329 } 330 331 *insnp = insn; 332 return true; 333 } 334 335 /** 336 * bpf_convert_filter - convert filter program 337 * @prog: the user passed filter program 338 * @len: the length of the user passed filter program 339 * @new_prog: buffer where converted program will be stored 340 * @new_len: pointer to store length of converted program 341 * 342 * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. 343 * Conversion workflow: 344 * 345 * 1) First pass for calculating the new program length: 346 * bpf_convert_filter(old_prog, old_len, NULL, &new_len) 347 * 348 * 2) 2nd pass to remap in two passes: 1st pass finds new 349 * jump offsets, 2nd pass remapping: 350 * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); 351 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); 352 */ 353 static int bpf_convert_filter(struct sock_filter *prog, int len, 354 struct bpf_insn *new_prog, int *new_len) 355 { 356 int new_flen = 0, pass = 0, target, i; 357 struct bpf_insn *new_insn; 358 struct sock_filter *fp; 359 int *addrs = NULL; 360 u8 bpf_src; 361 362 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); 363 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 364 365 if (len <= 0 || len > BPF_MAXINSNS) 366 return -EINVAL; 367 368 if (new_prog) { 369 addrs = kcalloc(len, sizeof(*addrs), 370 GFP_KERNEL | __GFP_NOWARN); 371 if (!addrs) 372 return -ENOMEM; 373 } 374 375 do_pass: 376 new_insn = new_prog; 377 fp = prog; 378 379 /* Classic BPF related prologue emission. */ 380 if (new_insn) { 381 /* Classic BPF expects A and X to be reset first. These need 382 * to be guaranteed to be the first two instructions. 383 */ 384 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 385 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); 386 387 /* All programs must keep CTX in callee saved BPF_REG_CTX. 
388 * In eBPF case it's done by the compiler, here we need to 389 * do this ourself. Initial CTX is present in BPF_REG_ARG1. 390 */ 391 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); 392 } else { 393 new_insn += 3; 394 } 395 396 for (i = 0; i < len; fp++, i++) { 397 struct bpf_insn tmp_insns[6] = { }; 398 struct bpf_insn *insn = tmp_insns; 399 400 if (addrs) 401 addrs[i] = new_insn - new_prog; 402 403 switch (fp->code) { 404 /* All arithmetic insns and skb loads map as-is. */ 405 case BPF_ALU | BPF_ADD | BPF_X: 406 case BPF_ALU | BPF_ADD | BPF_K: 407 case BPF_ALU | BPF_SUB | BPF_X: 408 case BPF_ALU | BPF_SUB | BPF_K: 409 case BPF_ALU | BPF_AND | BPF_X: 410 case BPF_ALU | BPF_AND | BPF_K: 411 case BPF_ALU | BPF_OR | BPF_X: 412 case BPF_ALU | BPF_OR | BPF_K: 413 case BPF_ALU | BPF_LSH | BPF_X: 414 case BPF_ALU | BPF_LSH | BPF_K: 415 case BPF_ALU | BPF_RSH | BPF_X: 416 case BPF_ALU | BPF_RSH | BPF_K: 417 case BPF_ALU | BPF_XOR | BPF_X: 418 case BPF_ALU | BPF_XOR | BPF_K: 419 case BPF_ALU | BPF_MUL | BPF_X: 420 case BPF_ALU | BPF_MUL | BPF_K: 421 case BPF_ALU | BPF_DIV | BPF_X: 422 case BPF_ALU | BPF_DIV | BPF_K: 423 case BPF_ALU | BPF_MOD | BPF_X: 424 case BPF_ALU | BPF_MOD | BPF_K: 425 case BPF_ALU | BPF_NEG: 426 case BPF_LD | BPF_ABS | BPF_W: 427 case BPF_LD | BPF_ABS | BPF_H: 428 case BPF_LD | BPF_ABS | BPF_B: 429 case BPF_LD | BPF_IND | BPF_W: 430 case BPF_LD | BPF_IND | BPF_H: 431 case BPF_LD | BPF_IND | BPF_B: 432 /* Check for overloaded BPF extension and 433 * directly convert it if found, otherwise 434 * just move on with mapping. 435 */ 436 if (BPF_CLASS(fp->code) == BPF_LD && 437 BPF_MODE(fp->code) == BPF_ABS && 438 convert_bpf_extensions(fp, &insn)) 439 break; 440 441 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); 442 break; 443 444 /* Jump transformation cannot use BPF block macros 445 * everywhere as offset calculation and target updates 446 * require a bit more work than the rest, i.e. 
jump 447 * opcodes map as-is, but offsets need adjustment. 448 */ 449 450 #define BPF_EMIT_JMP \ 451 do { \ 452 if (target >= len || target < 0) \ 453 goto err; \ 454 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ 455 /* Adjust pc relative offset for 2nd or 3rd insn. */ \ 456 insn->off -= insn - tmp_insns; \ 457 } while (0) 458 459 case BPF_JMP | BPF_JA: 460 target = i + fp->k + 1; 461 insn->code = fp->code; 462 BPF_EMIT_JMP; 463 break; 464 465 case BPF_JMP | BPF_JEQ | BPF_K: 466 case BPF_JMP | BPF_JEQ | BPF_X: 467 case BPF_JMP | BPF_JSET | BPF_K: 468 case BPF_JMP | BPF_JSET | BPF_X: 469 case BPF_JMP | BPF_JGT | BPF_K: 470 case BPF_JMP | BPF_JGT | BPF_X: 471 case BPF_JMP | BPF_JGE | BPF_K: 472 case BPF_JMP | BPF_JGE | BPF_X: 473 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { 474 /* BPF immediates are signed, zero extend 475 * immediate into tmp register and use it 476 * in compare insn. 477 */ 478 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); 479 480 insn->dst_reg = BPF_REG_A; 481 insn->src_reg = BPF_REG_TMP; 482 bpf_src = BPF_X; 483 } else { 484 insn->dst_reg = BPF_REG_A; 485 insn->imm = fp->k; 486 bpf_src = BPF_SRC(fp->code); 487 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; 488 } 489 490 /* Common case where 'jump_false' is next insn. */ 491 if (fp->jf == 0) { 492 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 493 target = i + fp->jt + 1; 494 BPF_EMIT_JMP; 495 break; 496 } 497 498 /* Convert JEQ into JNE when 'jump_true' is next insn. */ 499 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { 500 insn->code = BPF_JMP | BPF_JNE | bpf_src; 501 target = i + fp->jf + 1; 502 BPF_EMIT_JMP; 503 break; 504 } 505 506 /* Other jumps are mapped into two insns: Jxx and JA. */ 507 target = i + fp->jt + 1; 508 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 509 BPF_EMIT_JMP; 510 insn++; 511 512 insn->code = BPF_JMP | BPF_JA; 513 target = i + fp->jf + 1; 514 BPF_EMIT_JMP; 515 break; 516 517 /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. 
*/ 518 case BPF_LDX | BPF_MSH | BPF_B: 519 /* tmp = A */ 520 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); 521 /* A = BPF_R0 = *(u8 *) (skb->data + K) */ 522 *insn++ = BPF_LD_ABS(BPF_B, fp->k); 523 /* A &= 0xf */ 524 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); 525 /* A <<= 2 */ 526 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); 527 /* X = A */ 528 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 529 /* A = tmp */ 530 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); 531 break; 532 533 /* RET_K, RET_A are remaped into 2 insns. */ 534 case BPF_RET | BPF_A: 535 case BPF_RET | BPF_K: 536 *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? 537 BPF_K : BPF_X, BPF_REG_0, 538 BPF_REG_A, fp->k); 539 *insn = BPF_EXIT_INSN(); 540 break; 541 542 /* Store to stack. */ 543 case BPF_ST: 544 case BPF_STX: 545 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == 546 BPF_ST ? BPF_REG_A : BPF_REG_X, 547 -(BPF_MEMWORDS - fp->k) * 4); 548 break; 549 550 /* Load from stack. */ 551 case BPF_LD | BPF_MEM: 552 case BPF_LDX | BPF_MEM: 553 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 554 BPF_REG_A : BPF_REG_X, BPF_REG_FP, 555 -(BPF_MEMWORDS - fp->k) * 4); 556 break; 557 558 /* A = K or X = K */ 559 case BPF_LD | BPF_IMM: 560 case BPF_LDX | BPF_IMM: 561 *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ? 562 BPF_REG_A : BPF_REG_X, fp->k); 563 break; 564 565 /* X = A */ 566 case BPF_MISC | BPF_TAX: 567 *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 568 break; 569 570 /* A = X */ 571 case BPF_MISC | BPF_TXA: 572 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X); 573 break; 574 575 /* A = skb->len or X = skb->len */ 576 case BPF_LD | BPF_W | BPF_LEN: 577 case BPF_LDX | BPF_W | BPF_LEN: 578 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 579 BPF_REG_A : BPF_REG_X, BPF_REG_CTX, 580 offsetof(struct sk_buff, len)); 581 break; 582 583 /* Access seccomp_data fields. 
*/ 584 case BPF_LDX | BPF_ABS | BPF_W: 585 /* A = *(u32 *) (ctx + K) */ 586 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k); 587 break; 588 589 /* Unknown instruction. */ 590 default: 591 goto err; 592 } 593 594 insn++; 595 if (new_prog) 596 memcpy(new_insn, tmp_insns, 597 sizeof(*insn) * (insn - tmp_insns)); 598 new_insn += insn - tmp_insns; 599 } 600 601 if (!new_prog) { 602 /* Only calculating new length. */ 603 *new_len = new_insn - new_prog; 604 return 0; 605 } 606 607 pass++; 608 if (new_flen != new_insn - new_prog) { 609 new_flen = new_insn - new_prog; 610 if (pass > 2) 611 goto err; 612 goto do_pass; 613 } 614 615 kfree(addrs); 616 BUG_ON(*new_len != new_flen); 617 return 0; 618 err: 619 kfree(addrs); 620 return -EINVAL; 621 } 622 623 /* Security: 624 * 625 * As we dont want to clear mem[] array for each packet going through 626 * __bpf_prog_run(), we check that filter loaded by user never try to read 627 * a cell if not previously written, and we check all branches to be sure 628 * a malicious user doesn't try to abuse us. 
629 */ 630 static int check_load_and_stores(const struct sock_filter *filter, int flen) 631 { 632 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ 633 int pc, ret = 0; 634 635 BUILD_BUG_ON(BPF_MEMWORDS > 16); 636 637 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); 638 if (!masks) 639 return -ENOMEM; 640 641 memset(masks, 0xff, flen * sizeof(*masks)); 642 643 for (pc = 0; pc < flen; pc++) { 644 memvalid &= masks[pc]; 645 646 switch (filter[pc].code) { 647 case BPF_ST: 648 case BPF_STX: 649 memvalid |= (1 << filter[pc].k); 650 break; 651 case BPF_LD | BPF_MEM: 652 case BPF_LDX | BPF_MEM: 653 if (!(memvalid & (1 << filter[pc].k))) { 654 ret = -EINVAL; 655 goto error; 656 } 657 break; 658 case BPF_JMP | BPF_JA: 659 /* A jump must set masks on target */ 660 masks[pc + 1 + filter[pc].k] &= memvalid; 661 memvalid = ~0; 662 break; 663 case BPF_JMP | BPF_JEQ | BPF_K: 664 case BPF_JMP | BPF_JEQ | BPF_X: 665 case BPF_JMP | BPF_JGE | BPF_K: 666 case BPF_JMP | BPF_JGE | BPF_X: 667 case BPF_JMP | BPF_JGT | BPF_K: 668 case BPF_JMP | BPF_JGT | BPF_X: 669 case BPF_JMP | BPF_JSET | BPF_K: 670 case BPF_JMP | BPF_JSET | BPF_X: 671 /* A jump must set masks on targets */ 672 masks[pc + 1 + filter[pc].jt] &= memvalid; 673 masks[pc + 1 + filter[pc].jf] &= memvalid; 674 memvalid = ~0; 675 break; 676 } 677 } 678 error: 679 kfree(masks); 680 return ret; 681 } 682 683 static bool chk_code_allowed(u16 code_to_probe) 684 { 685 static const bool codes[] = { 686 /* 32 bit ALU operations */ 687 [BPF_ALU | BPF_ADD | BPF_K] = true, 688 [BPF_ALU | BPF_ADD | BPF_X] = true, 689 [BPF_ALU | BPF_SUB | BPF_K] = true, 690 [BPF_ALU | BPF_SUB | BPF_X] = true, 691 [BPF_ALU | BPF_MUL | BPF_K] = true, 692 [BPF_ALU | BPF_MUL | BPF_X] = true, 693 [BPF_ALU | BPF_DIV | BPF_K] = true, 694 [BPF_ALU | BPF_DIV | BPF_X] = true, 695 [BPF_ALU | BPF_MOD | BPF_K] = true, 696 [BPF_ALU | BPF_MOD | BPF_X] = true, 697 [BPF_ALU | BPF_AND | BPF_K] = true, 698 [BPF_ALU | BPF_AND | BPF_X] = true, 699 [BPF_ALU | 
BPF_OR | BPF_K] = true, 700 [BPF_ALU | BPF_OR | BPF_X] = true, 701 [BPF_ALU | BPF_XOR | BPF_K] = true, 702 [BPF_ALU | BPF_XOR | BPF_X] = true, 703 [BPF_ALU | BPF_LSH | BPF_K] = true, 704 [BPF_ALU | BPF_LSH | BPF_X] = true, 705 [BPF_ALU | BPF_RSH | BPF_K] = true, 706 [BPF_ALU | BPF_RSH | BPF_X] = true, 707 [BPF_ALU | BPF_NEG] = true, 708 /* Load instructions */ 709 [BPF_LD | BPF_W | BPF_ABS] = true, 710 [BPF_LD | BPF_H | BPF_ABS] = true, 711 [BPF_LD | BPF_B | BPF_ABS] = true, 712 [BPF_LD | BPF_W | BPF_LEN] = true, 713 [BPF_LD | BPF_W | BPF_IND] = true, 714 [BPF_LD | BPF_H | BPF_IND] = true, 715 [BPF_LD | BPF_B | BPF_IND] = true, 716 [BPF_LD | BPF_IMM] = true, 717 [BPF_LD | BPF_MEM] = true, 718 [BPF_LDX | BPF_W | BPF_LEN] = true, 719 [BPF_LDX | BPF_B | BPF_MSH] = true, 720 [BPF_LDX | BPF_IMM] = true, 721 [BPF_LDX | BPF_MEM] = true, 722 /* Store instructions */ 723 [BPF_ST] = true, 724 [BPF_STX] = true, 725 /* Misc instructions */ 726 [BPF_MISC | BPF_TAX] = true, 727 [BPF_MISC | BPF_TXA] = true, 728 /* Return instructions */ 729 [BPF_RET | BPF_K] = true, 730 [BPF_RET | BPF_A] = true, 731 /* Jump instructions */ 732 [BPF_JMP | BPF_JA] = true, 733 [BPF_JMP | BPF_JEQ | BPF_K] = true, 734 [BPF_JMP | BPF_JEQ | BPF_X] = true, 735 [BPF_JMP | BPF_JGE | BPF_K] = true, 736 [BPF_JMP | BPF_JGE | BPF_X] = true, 737 [BPF_JMP | BPF_JGT | BPF_K] = true, 738 [BPF_JMP | BPF_JGT | BPF_X] = true, 739 [BPF_JMP | BPF_JSET | BPF_K] = true, 740 [BPF_JMP | BPF_JSET | BPF_X] = true, 741 }; 742 743 if (code_to_probe >= ARRAY_SIZE(codes)) 744 return false; 745 746 return codes[code_to_probe]; 747 } 748 749 /** 750 * bpf_check_classic - verify socket filter code 751 * @filter: filter to verify 752 * @flen: length of filter 753 * 754 * Check the user's filter code. If we let some ugly 755 * filter code slip through kaboom! The filter must contain 756 * no references or jumps that are out of range, no illegal 757 * instructions, and must end with a RET instruction. 
758 * 759 * All jumps are forward as they are not signed. 760 * 761 * Returns 0 if the rule set is legal or -EINVAL if not. 762 */ 763 static int bpf_check_classic(const struct sock_filter *filter, 764 unsigned int flen) 765 { 766 bool anc_found; 767 int pc; 768 769 if (flen == 0 || flen > BPF_MAXINSNS) 770 return -EINVAL; 771 772 /* Check the filter code now */ 773 for (pc = 0; pc < flen; pc++) { 774 const struct sock_filter *ftest = &filter[pc]; 775 776 /* May we actually operate on this code? */ 777 if (!chk_code_allowed(ftest->code)) 778 return -EINVAL; 779 780 /* Some instructions need special checks */ 781 switch (ftest->code) { 782 case BPF_ALU | BPF_DIV | BPF_K: 783 case BPF_ALU | BPF_MOD | BPF_K: 784 /* Check for division by zero */ 785 if (ftest->k == 0) 786 return -EINVAL; 787 break; 788 case BPF_ALU | BPF_LSH | BPF_K: 789 case BPF_ALU | BPF_RSH | BPF_K: 790 if (ftest->k >= 32) 791 return -EINVAL; 792 break; 793 case BPF_LD | BPF_MEM: 794 case BPF_LDX | BPF_MEM: 795 case BPF_ST: 796 case BPF_STX: 797 /* Check for invalid memory addresses */ 798 if (ftest->k >= BPF_MEMWORDS) 799 return -EINVAL; 800 break; 801 case BPF_JMP | BPF_JA: 802 /* Note, the large ftest->k might cause loops. 803 * Compare this with conditional jumps below, 804 * where offsets are limited. 
--ANK (981016) 805 */ 806 if (ftest->k >= (unsigned int)(flen - pc - 1)) 807 return -EINVAL; 808 break; 809 case BPF_JMP | BPF_JEQ | BPF_K: 810 case BPF_JMP | BPF_JEQ | BPF_X: 811 case BPF_JMP | BPF_JGE | BPF_K: 812 case BPF_JMP | BPF_JGE | BPF_X: 813 case BPF_JMP | BPF_JGT | BPF_K: 814 case BPF_JMP | BPF_JGT | BPF_X: 815 case BPF_JMP | BPF_JSET | BPF_K: 816 case BPF_JMP | BPF_JSET | BPF_X: 817 /* Both conditionals must be safe */ 818 if (pc + ftest->jt + 1 >= flen || 819 pc + ftest->jf + 1 >= flen) 820 return -EINVAL; 821 break; 822 case BPF_LD | BPF_W | BPF_ABS: 823 case BPF_LD | BPF_H | BPF_ABS: 824 case BPF_LD | BPF_B | BPF_ABS: 825 anc_found = false; 826 if (bpf_anc_helper(ftest) & BPF_ANC) 827 anc_found = true; 828 /* Ancillary operation unknown or unsupported */ 829 if (anc_found == false && ftest->k >= SKF_AD_OFF) 830 return -EINVAL; 831 } 832 } 833 834 /* Last instruction must be a RET code */ 835 switch (filter[flen - 1].code) { 836 case BPF_RET | BPF_K: 837 case BPF_RET | BPF_A: 838 return check_load_and_stores(filter, flen); 839 } 840 841 return -EINVAL; 842 } 843 844 static int bpf_prog_store_orig_filter(struct bpf_prog *fp, 845 const struct sock_fprog *fprog) 846 { 847 unsigned int fsize = bpf_classic_proglen(fprog); 848 struct sock_fprog_kern *fkprog; 849 850 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); 851 if (!fp->orig_prog) 852 return -ENOMEM; 853 854 fkprog = fp->orig_prog; 855 fkprog->len = fprog->len; 856 857 fkprog->filter = kmemdup(fp->insns, fsize, 858 GFP_KERNEL | __GFP_NOWARN); 859 if (!fkprog->filter) { 860 kfree(fp->orig_prog); 861 return -ENOMEM; 862 } 863 864 return 0; 865 } 866 867 static void bpf_release_orig_filter(struct bpf_prog *fp) 868 { 869 struct sock_fprog_kern *fprog = fp->orig_prog; 870 871 if (fprog) { 872 kfree(fprog->filter); 873 kfree(fprog); 874 } 875 } 876 877 static void __bpf_prog_release(struct bpf_prog *prog) 878 { 879 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { 880 bpf_prog_put(prog); 881 } else { 
882 bpf_release_orig_filter(prog); 883 bpf_prog_free(prog); 884 } 885 } 886 887 static void __sk_filter_release(struct sk_filter *fp) 888 { 889 __bpf_prog_release(fp->prog); 890 kfree(fp); 891 } 892 893 /** 894 * sk_filter_release_rcu - Release a socket filter by rcu_head 895 * @rcu: rcu_head that contains the sk_filter to free 896 */ 897 static void sk_filter_release_rcu(struct rcu_head *rcu) 898 { 899 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 900 901 __sk_filter_release(fp); 902 } 903 904 /** 905 * sk_filter_release - release a socket filter 906 * @fp: filter to remove 907 * 908 * Remove a filter from a socket and release its resources. 909 */ 910 static void sk_filter_release(struct sk_filter *fp) 911 { 912 if (atomic_dec_and_test(&fp->refcnt)) 913 call_rcu(&fp->rcu, sk_filter_release_rcu); 914 } 915 916 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 917 { 918 u32 filter_size = bpf_prog_size(fp->prog->len); 919 920 atomic_sub(filter_size, &sk->sk_omem_alloc); 921 sk_filter_release(fp); 922 } 923 924 /* try to charge the socket memory if there is space available 925 * return true on success 926 */ 927 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) 928 { 929 u32 filter_size = bpf_prog_size(fp->prog->len); 930 931 /* same check as in sock_kmalloc() */ 932 if (filter_size <= sysctl_optmem_max && 933 atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { 934 atomic_inc(&fp->refcnt); 935 atomic_add(filter_size, &sk->sk_omem_alloc); 936 return true; 937 } 938 return false; 939 } 940 941 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) 942 { 943 struct sock_filter *old_prog; 944 struct bpf_prog *old_fp; 945 int err, new_len, old_len = fp->len; 946 947 /* We are free to overwrite insns et al right here as it 948 * won't be used at this point in time anymore internally 949 * after the migration to the internal BPF instruction 950 * representation. 
951 */ 952 BUILD_BUG_ON(sizeof(struct sock_filter) != 953 sizeof(struct bpf_insn)); 954 955 /* Conversion cannot happen on overlapping memory areas, 956 * so we need to keep the user BPF around until the 2nd 957 * pass. At this time, the user BPF is stored in fp->insns. 958 */ 959 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), 960 GFP_KERNEL | __GFP_NOWARN); 961 if (!old_prog) { 962 err = -ENOMEM; 963 goto out_err; 964 } 965 966 /* 1st pass: calculate the new program length. */ 967 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); 968 if (err) 969 goto out_err_free; 970 971 /* Expand fp for appending the new filter representation. */ 972 old_fp = fp; 973 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); 974 if (!fp) { 975 /* The old_fp is still around in case we couldn't 976 * allocate new memory, so uncharge on that one. 977 */ 978 fp = old_fp; 979 err = -ENOMEM; 980 goto out_err_free; 981 } 982 983 fp->len = new_len; 984 985 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ 986 err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); 987 if (err) 988 /* 2nd bpf_convert_filter() can fail only if it fails 989 * to allocate memory, remapping must succeed. Note, 990 * that at this time old_fp has already been released 991 * by krealloc(). 992 */ 993 goto out_err_free; 994 995 bpf_prog_select_runtime(fp); 996 997 kfree(old_prog); 998 return fp; 999 1000 out_err_free: 1001 kfree(old_prog); 1002 out_err: 1003 __bpf_prog_release(fp); 1004 return ERR_PTR(err); 1005 } 1006 1007 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, 1008 bpf_aux_classic_check_t trans) 1009 { 1010 int err; 1011 1012 fp->bpf_func = NULL; 1013 fp->jited = 0; 1014 1015 err = bpf_check_classic(fp->insns, fp->len); 1016 if (err) { 1017 __bpf_prog_release(fp); 1018 return ERR_PTR(err); 1019 } 1020 1021 /* There might be additional checks and transformations 1022 * needed on classic filters, f.e. in case of seccomp. 
*/
	if (trans) {
		err = trans(fp->insns, fp->len);
		if (err) {
			__bpf_prog_release(fp);
			return ERR_PTR(err);
		}
	}

	/* Probe if we can JIT compile the filter and if so, do
	 * the compilation of the filter.
	 */
	bpf_jit_compile(fp);

	/* JIT compiler couldn't process this filter, so do the
	 * internal BPF translation for the optimized interpreter.
	 */
	if (!fp->jited)
		fp = bpf_migrate_filter(fp);

	return fp;
}

/**
 *	bpf_prog_create - create an unattached filter
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *
 * Create a filter independent of any socket. We first run some
 * sanity checks on it to make sure it does not explode on us later.
 * If an error occurs or there is insufficient memory for the filter
 * a negative errno code is returned. On success the return is zero.
 *
 * The instructions are copied from the kernel buffer @fprog->filter
 * with memcpy(), and the program is then handed to bpf_prepare_filter()
 * without a transformation handler.
 *
 * Return: 0 on success with *@pfp set; -EINVAL if @fprog->filter is
 * NULL; -ENOMEM if bpf_prog_alloc() fails; otherwise the error encoded
 * in the pointer returned by bpf_prepare_filter().
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
	/* Size in bytes of the classic instruction array to copy. */
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;

	/* Make sure an instruction array was actually supplied. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	memcpy(fp->insns, fprog->filter, fsize);

	fp->len = fprog->len;
	/* Since unattached filters are not copied back to user
	 * space through sk_get_filter(), we do not need to hold
	 * a copy here, and can spare us the work.
	 */
	fp->orig_prog = NULL;

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, NULL);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);

/**
 *	bpf_prog_create_from_user - create an unattached filter from user buffer
 *	@pfp: the unattached filter that is created
 *	@fprog: the filter program
 *	@trans: post-classic verifier transformation handler
 *	@save_orig: save classic BPF program
 *
 * This function effectively does the same as bpf_prog_create(), only
 * that it builds up its insns buffer from user space provided buffer.
 * It also allows for passing a bpf_aux_classic_check_t handler.
 *
 * Return: 0 on success with *@pfp set; -EINVAL if @fprog->filter is
 * NULL; -ENOMEM if allocation or bpf_prog_store_orig_filter() fails;
 * -EFAULT if copy_from_user() fails; otherwise the error encoded in the
 * pointer returned by bpf_prepare_filter().
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
			      bpf_aux_classic_check_t trans, bool save_orig)
{
	/* Size in bytes of the classic instruction array to copy. */
	unsigned int fsize = bpf_classic_proglen(fprog);
	struct bpf_prog *fp;
	int err;

	/* Make sure an instruction array was actually supplied. */
	if (fprog->filter == NULL)
		return -EINVAL;

	fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
	if (!fp)
		return -ENOMEM;

	/* Nothing but the bare program exists yet, so a plain free is
	 * used on this error path.
	 */
	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
		__bpf_prog_free(fp);
		return -EFAULT;
	}

	fp->len = fprog->len;
	fp->orig_prog = NULL;

	/* Optionally keep a copy of the classic program; any failure of
	 * bpf_prog_store_orig_filter() is reported as -ENOMEM.
	 */
	if (save_orig) {
		err = bpf_prog_store_orig_filter(fp, fprog);
		if (err) {
			__bpf_prog_free(fp);
			return -ENOMEM;
		}
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	fp = bpf_prepare_filter(fp, trans);
	if (IS_ERR(fp))
		return PTR_ERR(fp);

	*pfp = fp;
	return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

/**
 *	bpf_prog_destroy - release an unattached filter
 *	@fp: the program to release
 *
 * Thin exported wrapper around __bpf_prog_release().
 */
void bpf_prog_destroy(struct bpf_prog *fp)
{
	__bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

/* Wrap @prog in a freshly allocated struct sk_filter and install it as
 * the socket filter of @sk, replacing any previous one.
 *
 * The wrapper starts with a zero refcount and is passed to
 * sk_filter_charge(); if that returns false the wrapper is freed and
 * -ENOMEM is returned. @prog itself is never released here: the callers
 * in this file drop it themselves when this function fails.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct sk_filter *fp, *old_fp;

	fp = kmalloc(sizeof(*fp), GFP_KERNEL);
	if (!fp)
		return -ENOMEM;

	fp->prog = prog;
	atomic_set(&fp->refcnt, 0);

	if (!sk_filter_charge(sk, fp)) {
		kfree(fp);
		return -ENOMEM;
	}

	/* The old pointer is read with sock_owned_by_user(sk) as the
	 * protection condition, then the new filter is published.
	 */
	old_fp = rcu_dereference_protected(sk->sk_filter,
					   sock_owned_by_user(sk));
	rcu_assign_pointer(sk->sk_filter, fp);

	/* Only uncharge the previous filter after the new one is visible. */
	if (old_fp)
		sk_filter_uncharge(sk, old_fp);

	return 0;
}

/* Attach @prog as the reuseport program of @sk.
 *
 * Returns -ENOMEM when bpf_prog_size(prog->len) exceeds
 * sysctl_optmem_max, the error from reuseport_alloc() for an unhashed
 * socket, -EINVAL for a hashed socket that has no sk_reuseport_cb, and
 * 0 on success. A program previously attached (as returned by
 * reuseport_attach_prog()) is destroyed.
 */
static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
	struct bpf_prog *old_prog;
	int err;

	if (bpf_prog_size(prog->len) > sysctl_optmem_max)
		return -ENOMEM;

	if (sk_unhashed(sk)) {
		err = reuseport_alloc(sk);
		if (err)
			return err;
	} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
		/* The socket wasn't bound with SO_REUSEPORT */
		return -EINVAL;
	}

	old_prog = reuseport_attach_prog(sk, prog);
	if (old_prog)
		bpf_prog_destroy(old_prog);

	return 0;
}

/* Build a program from the user space classic filter @fprog on behalf
 * of socket @sk.
 *
 * Returns the prepared program, or an ERR_PTR(): -EPERM if the socket
 * has SOCK_FILTER_LOCKED set, -EINVAL if @fprog->filter is NULL,
 * -ENOMEM on allocation failure or if storing the original filter
 * fails, -EFAULT if copy_from_user() fails, or whatever
 * bpf_prepare_filter() returns.
 */
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
	/* Bytes of classic instructions to copy in, and the allocation
	 * size passed to bpf_prog_alloc(), respectively.
	 */
	unsigned int fsize = bpf_classic_proglen(fprog);
	unsigned int bpf_fsize = bpf_prog_size(fprog->len);
	struct bpf_prog *prog;
	int err;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	/* Make sure an instruction array was actually supplied. */
	if (fprog->filter == NULL)
		return ERR_PTR(-EINVAL);

	prog = bpf_prog_alloc(bpf_fsize, 0);
	if (!prog)
		return ERR_PTR(-ENOMEM);

	if (copy_from_user(prog->insns, fprog->filter, fsize)) {
		__bpf_prog_free(prog);
		return ERR_PTR(-EFAULT);
	}

	prog->len = fprog->len;

	/* Unlike bpf_prog_create_from_user(), the original classic
	 * program is always stored here.
	 */
	err = bpf_prog_store_orig_filter(prog, fprog);
	if (err) {
		__bpf_prog_free(prog);
		return ERR_PTR(-ENOMEM);
	}

	/* bpf_prepare_filter() already takes care of freeing
	 * memory in case something goes wrong.
	 */
	return bpf_prepare_filter(prog, NULL);
}

/**
 *	sk_attach_filter - attach a socket filter
 *	@fprog: the filter program
 *	@sk: the socket to use
 *
 * Attach the user's filter code. We first run some sanity checks on
 * it to make sure it does not explode on us later. If an error
 * occurs or there is insufficient memory for the filter a negative
 * errno code is returned. On success the return is zero.
 *
 * If attaching fails after the program was built, the program is
 * released with __bpf_prog_release().
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

/* Same as sk_attach_filter(), but the program built from @fprog is
 * attached through __reuseport_attach_prog() instead of being installed
 * as the socket filter. Returns 0 or a negative errno.
 */
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
	struct bpf_prog *prog = __get_filter(fprog, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		__bpf_prog_release(prog);
		return err;
	}

	return 0;
}

/* Look up the program behind file descriptor @ufd for use on @sk.
 *
 * Returns the program obtained from bpf_prog_get(), or an ERR_PTR():
 * -EPERM if the socket has SOCK_FILTER_LOCKED set, the error from
 * bpf_prog_get(), or -EINVAL (after dropping the program with
 * bpf_prog_put()) if its type is not BPF_PROG_TYPE_SOCKET_FILTER.
 */
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog;

	if (sock_flag(sk, SOCK_FILTER_LOCKED))
		return ERR_PTR(-EPERM);

	prog = bpf_prog_get(ufd);
	if (IS_ERR(prog))
		return prog;

	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

/* Attach the program referenced by @ufd as the socket filter of @sk.
 * On attach failure the program obtained by __get_bpf() is dropped
 * again with bpf_prog_put(). Returns 0 or a negative errno.
 */
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __sk_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

/* Attach the program referenced by @ufd as the reuseport program of
 * @sk. On attach failure the program obtained by __get_bpf() is dropped
 * again with bpf_prog_put(). Returns 0 or a negative errno.
 */
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
	struct bpf_prog *prog = __get_bpf(ufd, sk);
	int err;

	if (IS_ERR(prog))
		return PTR_ERR(prog);

	err = __reuseport_attach_prog(prog, sk);
	if (err < 0) {
		bpf_prog_put(prog);
		return err;
	}

	return 0;
}

/* Upper bound, in bytes, on a single bpf_skb_store_bytes() or
 * bpf_skb_load_bytes() transfer; also the size of the on-stack buffer
 * used by bpf_skb_store_bytes().
 */
#define BPF_LDST_LEN 16U

/* BPF helper: copy @r4 bytes from the pointer in @r3 into the packet at
 * offset @r2.
 *
 * @r1 is the context, used as the struct sk_buff to modify. The only
 * flag accepted is BPF_F_RECOMPUTE_CSUM.
 *
 * Returns 0 on success; -EINVAL if unknown flag bits are set; -EFAULT
 * if the offset, viewed as u32, exceeds 0xffff (which also rejects
 * negative offsets), if the length exceeds BPF_LDST_LEN, if the skb is
 * cloned and skb_clone_writable() refuses offset + len, or if
 * skb_header_pointer() returns NULL.
 */
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	int offset = (int) r2;
	void *from = (void *) (long) r3;
	unsigned int len = (unsigned int) r4;
	char buf[BPF_LDST_LEN];
	void *ptr;

	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
		return -EINVAL;

	/* bpf verifier guarantees that:
	 * 'from' pointer points to bpf program stack
	 * 'len' bytes of it were initialized
	 * 'len' > 0
	 * 'skb' is a valid pointer to 'struct sk_buff'
	 *
	 * so check for invalid 'offset' and too large 'len'
	 */
	if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
		return -EFAULT;

	if (unlikely(skb_cloned(skb) &&
		     !skb_clone_writable(skb, offset + len)))
		return -EFAULT;

	ptr = skb_header_pointer(skb, offset, len, buf);
	if (unlikely(!ptr))
		return -EFAULT;

	/* Hand the bytes about to be overwritten to
	 * skb_postpull_rcsum() before they are replaced.
	 */
	if (flags & BPF_F_RECOMPUTE_CSUM)
		skb_postpull_rcsum(skb, ptr, len);

	memcpy(ptr, from, len);

	/* When the returned pointer is the local buffer, the new bytes
	 * still have to be written back into the skb.
	 */
	if (ptr == buf)
		/* skb_store_bits cannot return -EFAULT here */
		skb_store_bits(skb, offset, ptr, len);

	/* Hand the newly written bytes to skb_postpush_rcsum(). */
	if (flags & BPF_F_RECOMPUTE_CSUM)
		skb_postpush_rcsum(skb, ptr, len);

	return 0;
}

/* Verifier prototype for bpf_skb_store_bytes(): context, offset, stack
 * pointer with constant size, flags.
 */
const struct bpf_func_proto bpf_skb_store_bytes_proto = {
	.func		= bpf_skb_store_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_STACK,
	.arg4_type	= ARG_CONST_STACK_SIZE,
	.arg5_type	= ARG_ANYTHING,
};

/* BPF helper: copy @r4 bytes from the packet at offset @r2 to the
 * pointer in @r3.
 *
 * @r1 is the context, used as a const struct sk_buff. @r5 is unused.
 *
 * Returns 0 on success; -EFAULT if the offset, viewed as u32, exceeds
 * 0xffff, if the length exceeds BPF_LDST_LEN, or if
 * skb_header_pointer() returns NULL.
 */
static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
	int offset = (int) r2;
	void *to = (void *)(unsigned long) r3;
	unsigned int len = (unsigned int) r4;
	void *ptr;

	if (unlikely((u32) offset > 0xffff || len > BPF_LDST_LEN))
		return -EFAULT;

	/* The destination doubles as the buffer handed to
	 * skb_header_pointer(); a copy is only needed when the returned
	 * pointer is something other than that buffer.
	 */
	ptr = skb_header_pointer(skb, offset, len, to);
	if (unlikely(!ptr))
		return -EFAULT;
	if (ptr != to)
		memcpy(to, ptr, len);

	return 0;
}

/* Verifier prototype for bpf_skb_load_bytes(): context, offset, stack
 * pointer with constant size.
 */
const struct bpf_func_proto bpf_skb_load_bytes_proto = {
	.func		= bpf_skb_load_bytes,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_STACK,
	.arg4_type	= ARG_CONST_STACK_SIZE,
};

/* BPF helper: update the 16-bit checksum stored at packet offset @r2
 * for a field changing from @from to @to.
 *
 * The bits of @flags within BPF_F_HDR_FIELD_MASK give the width of the
 * changed field: 2 uses csum_replace2(), 4 uses csum_replace4().
 *
 * Returns 0 on success; -EINVAL if bits outside BPF_F_HDR_FIELD_MASK
 * are set or the width is neither 2 nor 4; -EFAULT if the offset,
 * viewed as u32, exceeds 0xffff, if the skb is cloned and
 * skb_clone_writable() refuses the checksum location, or if
 * skb_header_pointer() returns NULL.
 */
static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	int offset = (int) r2;
	__sum16 sum, *ptr;

	if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely((u32) offset > 0xffff))
		return -EFAULT;

	if (unlikely(skb_cloned(skb) &&
		     !skb_clone_writable(skb, offset + sizeof(sum))))
		return -EFAULT;

	/* 'sum' serves as the local buffer for skb_header_pointer(). */
	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
	if (unlikely(!ptr))
		return -EFAULT;

	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 2:
		csum_replace2(ptr, from, to);
		break;
	case 4:
		csum_replace4(ptr, from, to);
		break;
	default:
		return -EINVAL;
	}

	/* The checksum was updated in the local copy; write it back. */
	if (ptr == &sum)
		/* skb_store_bits guaranteed to not return -EFAULT here */
		skb_store_bits(skb, offset, ptr, sizeof(sum));

	return 0;
}

/* Verifier prototype for bpf_l3_csum_replace(): context followed by
 * four unconstrained arguments.
 */
const struct bpf_func_proto bpf_l3_csum_replace_proto = {
	.func		= bpf_l3_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* BPF helper: like bpf_l3_csum_replace(), but the update is done with
 * inet_proto_csum_replace2()/inet_proto_csum_replace4(), which also
 * receive the skb and whether BPF_F_PSEUDO_HDR was set in @flags.
 *
 * Returns 0 on success; -EINVAL if bits outside BPF_F_PSEUDO_HDR and
 * BPF_F_HDR_FIELD_MASK are set or the width is neither 2 nor 4;
 * -EFAULT under the same conditions as bpf_l3_csum_replace().
 */
static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
	int offset = (int) r2;
	__sum16 sum, *ptr;

	if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
		return -EINVAL;
	if (unlikely((u32) offset > 0xffff))
		return -EFAULT;

	if (unlikely(skb_cloned(skb) &&
		     !skb_clone_writable(skb, offset + sizeof(sum))))
		return -EFAULT;

	/* 'sum' serves as the local buffer for skb_header_pointer(). */
	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
	if (unlikely(!ptr))
		return -EFAULT;

	switch (flags & BPF_F_HDR_FIELD_MASK) {
	case 2:
		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
		break;
	case 4:
		inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo);
		break;
	default:
		return -EINVAL;
	}

	/* The checksum was updated in the local copy; write it back. */
	if (ptr == &sum)
		/* skb_store_bits guaranteed to not return -EFAULT here */
		skb_store_bits(skb, offset, ptr, sizeof(sum));

	return 0;
}

/* Verifier prototype for bpf_l4_csum_replace(): context followed by
 * four unconstrained arguments.
 */
const struct bpf_func_proto bpf_l4_csum_replace_proto = {
	.func		= bpf_l4_csum_replace,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_ANYTHING,
	.arg5_type	= ARG_ANYTHING,
};

/* BPF helper: clone the packet and send the clone to the device with
 * index @ifindex, looked up in the namespace of skb->dev.
 *
 * With BPF_F_INGRESS in @flags the clone is handed to
 * dev_forward_skb(); otherwise its device is set and it is queued with
 * dev_queue_xmit(). @r4 and @r5 are unused.
 *
 * Returns -EINVAL for unknown flag bits or an unknown device, -ENOMEM
 * if skb_clone() fails, otherwise the return value of
 * dev_forward_skb() or dev_queue_xmit().
 */
static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
	struct net_device *dev;

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return -EINVAL;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
	if (unlikely(!dev))
		return -EINVAL;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (unlikely(!skb2))
		return -ENOMEM;

	if (flags & BPF_F_INGRESS) {
		/* At tc ingress the MAC header bytes are passed to
		 * skb_postpush_rcsum() before forwarding.
		 */
		if (skb_at_tc_ingress(skb2))
			skb_postpush_rcsum(skb2, skb_mac_header(skb2),
					   skb2->mac_len);
		return dev_forward_skb(dev, skb2);
	}

	skb2->dev = dev;
	skb_sender_cpu_clear(skb2);
	return dev_queue_xmit(skb2);
}

/* Verifier prototype for bpf_clone_redirect(): context, ifindex,
 * flags.
 */
const struct bpf_func_proto bpf_clone_redirect_proto = {
	.func           = bpf_clone_redirect,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_ANYTHING,
};

/* Redirect target recorded by bpf_redirect() and consumed by
 * skb_do_redirect(); one instance exists per CPU.
 */
struct redirect_info {
	u32 ifindex;
	u32 flags;
};

static DEFINE_PER_CPU(struct redirect_info, redirect_info);

/* BPF helper: record @ifindex and @flags in this CPU's redirect_info
 * and return TC_ACT_REDIRECT. Nothing is sent here; the recorded
 * target is acted upon by skb_do_redirect().
 *
 * Returns TC_ACT_SHOT, without recording anything, if bits other than
 * BPF_F_INGRESS are set in @flags. @r3 to @r5 are unused.
 */
static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);

	if (unlikely(flags & ~(BPF_F_INGRESS)))
		return TC_ACT_SHOT;

	ri->ifindex = ifindex;
	ri->flags = flags;

	return TC_ACT_REDIRECT;
}

/* Send @skb to the device recorded in this CPU's redirect_info.
 *
 * The recorded ifindex is reset to 0 once it has been looked up. If no
 * such device exists the skb is freed and -EINVAL is returned.
 * Otherwise the skb goes to dev_forward_skb() when BPF_F_INGRESS was
 * recorded, or to dev_queue_xmit() if not, and that function's return
 * value is passed on.
 */
int skb_do_redirect(struct sk_buff *skb)
{
	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
	struct net_device *dev;

	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
	ri->ifindex = 0;
	if (unlikely(!dev)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	if (ri->flags & BPF_F_INGRESS) {
		/* At tc ingress the MAC header bytes are passed to
		 * skb_postpush_rcsum() before forwarding.
		 */
		if (skb_at_tc_ingress(skb))
			skb_postpush_rcsum(skb, skb_mac_header(skb),
					   skb->mac_len);
		return dev_forward_skb(dev, skb);
	}

	skb->dev = dev;
	skb_sender_cpu_clear(skb);
	return dev_queue_xmit(skb);
}

/* Verifier prototype for bpf_redirect(): ifindex, flags. */
const struct bpf_func_proto bpf_redirect_proto = {
	.func           = bpf_redirect,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_ANYTHING,
	.arg2_type      = ARG_ANYTHING,
};

/* BPF helper: return task_get_classid() for the context skb in @r1.
 * @r2 to @r5 are unused.
 */
static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	return task_get_classid((struct sk_buff *) (unsigned long) r1);
}

/* Verifier prototype for bpf_get_cgroup_classid(): context only. */
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
	.func           = bpf_get_cgroup_classid,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: return the tclassid of the dst entry attached to the
 * context skb in @r1. Returns 0 when the skb has no dst entry or when
 * the kernel is built without CONFIG_IP_ROUTE_CLASSID. @r2 to @r5 are
 * unused.
 */
static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
	const struct dst_entry *dst;

	dst = skb_dst((struct sk_buff *) (unsigned long) r1);
	if (dst)
		return dst->tclassid;
#endif
	return 0;
}

/* Verifier prototype for bpf_get_route_realm(): context only. */
static const struct bpf_func_proto bpf_get_route_realm_proto = {
	.func           = bpf_get_route_realm,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: push a VLAN tag with protocol @r2 and TCI @vlan_tci onto
 * the context skb in @r1 and return the result of skb_vlan_push().
 *
 * A protocol other than ETH_P_8021Q or ETH_P_8021AD (in network byte
 * order) is not rejected; it is replaced by ETH_P_8021Q. @r4 and @r5
 * are unused.
 */
static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	__be16 vlan_proto = (__force __be16) r2;

	if (unlikely(vlan_proto != htons(ETH_P_8021Q) &&
		     vlan_proto != htons(ETH_P_8021AD)))
		vlan_proto = htons(ETH_P_8021Q);

	return skb_vlan_push(skb, vlan_proto, vlan_tci);
}

/* Verifier prototype for bpf_skb_vlan_push(): context, protocol, TCI. */
const struct bpf_func_proto bpf_skb_vlan_push_proto = {
	.func           = bpf_skb_vlan_push,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
	.arg2_type      = ARG_ANYTHING,
	.arg3_type      = ARG_ANYTHING,
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto);

/* BPF helper: return the result of skb_vlan_pop() on the context skb
 * in @r1. @r2 to @r5 are unused.
 */
static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;

	return skb_vlan_pop(skb);
}

/* Verifier prototype for bpf_skb_vlan_pop(): context only. */
const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
	.func           = bpf_skb_vlan_pop,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_PTR_TO_CTX,
};
EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto);

/* Report whether helper @func is one of the helpers listed here as
 * changing skb data: bpf_skb_vlan_push() and bpf_skb_vlan_pop().
 */
bool bpf_helper_changes_skb_data(void *func)
{
	if (func == bpf_skb_vlan_push)
		return true;
	if (func == bpf_skb_vlan_pop)
		return true;
	return false;
}

/* Map the BPF_F_TUNINFO_IPV6 bit of @flags to AF_INET6, and its
 * absence to AF_INET.
 */
static unsigned short bpf_tunnel_key_af(u64 flags)
{
	return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
}

/* BPF helper: fill the struct bpf_tunnel_key at @r2 from the tunnel
 * info of the context skb in @r1.
 *
 * @size must be sizeof(struct bpf_tunnel_key), or the deprecated
 * shorter layout ending right after remote_ipv6[0], which is accepted
 * for AF_INET tunnel info only. @r5 is unused.
 *
 * Returns 0 on success; -EINVAL if the skb has no tunnel info, if bits
 * other than BPF_F_TUNINFO_IPV6 are set, or if @size is unsupported;
 * -EPROTO if the address family of the tunnel info differs from the
 * one selected by @flags.
 */
static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
	/* Full-size scratch copy used when the caller's buffer is the
	 * shorter deprecated layout.
	 */
	u8 compat[sizeof(struct bpf_tunnel_key)];

	if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6))))
		return -EINVAL;
	if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags))
		return -EPROTO;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			if (ip_tunnel_info_af(info) != AF_INET)
				return -EINVAL;
			to = (struct bpf_tunnel_key *)compat;
			break;
		default:
			return -EINVAL;
		}
	}

	to->tunnel_id = be64_to_cpu(info->key.tun_id);
	to->tunnel_tos = info->key.tos;
	to->tunnel_ttl = info->key.ttl;

	if (flags & BPF_F_TUNINFO_IPV6)
		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
		       sizeof(to->remote_ipv6));
	else
		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);

	/* For the deprecated layout, copy only the caller's @size bytes
	 * out of the scratch buffer.
	 */
	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
		memcpy((void *)(long) r2, to, size);

	return 0;
}

/* Verifier prototype for bpf_skb_get_tunnel_key(): context, stack
 * pointer with constant size, flags.
 */
const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
	.func		= bpf_skb_get_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/* Per-CPU metadata dst used by bpf_skb_set_tunnel_key(); allocated on
 * first use by bpf_get_skb_set_tunnel_key_proto().
 */
static struct metadata_dst __percpu *md_dst;

/* BPF helper: attach this CPU's metadata dst to the context skb in @r1
 * and fill its tunnel info from the struct bpf_tunnel_key at @r2.
 *
 * @size must be sizeof(struct bpf_tunnel_key), or the deprecated
 * shorter layout ending right after remote_ipv6[0]; the latter is
 * copied into a local buffer whose remaining bytes are zeroed. @r5 is
 * unused.
 *
 * Returns 0 on success; -EINVAL if bits other than BPF_F_TUNINFO_IPV6
 * are set or if @size is unsupported.
 */
static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
	struct sk_buff *skb = (struct sk_buff *) (long) r1;
	struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
	struct metadata_dst *md = this_cpu_ptr(md_dst);
	u8 compat[sizeof(struct bpf_tunnel_key)];
	struct ip_tunnel_info *info;

	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6)))
		return -EINVAL;
	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
		switch (size) {
		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
			/* Fixup deprecated structure layouts here, so we have
			 * a common path later on.
			 */
			memcpy(compat, from, size);
			memset(compat + size, 0, sizeof(compat) - size);
			from = (struct bpf_tunnel_key *)compat;
			break;
		default:
			return -EINVAL;
		}
	}

	/* Replace whatever dst the skb had with a held reference to the
	 * per-CPU metadata dst.
	 */
	skb_dst_drop(skb);
	dst_hold((struct dst_entry *) md);
	skb_dst_set(skb, (struct dst_entry *) md);

	info = &md->u.tun_info;
	info->mode = IP_TUNNEL_INFO_TX;

	info->key.tun_flags = TUNNEL_KEY;
	info->key.tun_id = cpu_to_be64(from->tunnel_id);
	info->key.tos = from->tunnel_tos;
	info->key.ttl = from->tunnel_ttl;

	if (flags & BPF_F_TUNINFO_IPV6) {
		info->mode |= IP_TUNNEL_INFO_IPV6;
		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
		       sizeof(from->remote_ipv6));
	} else {
		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
	}

	return 0;
}

/* Verifier prototype for bpf_skb_set_tunnel_key(): context, stack
 * pointer with constant size, flags.
 */
const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
	.func		= bpf_skb_set_tunnel_key,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_STACK,
	.arg3_type	= ARG_CONST_STACK_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/* Return the prototype of bpf_skb_set_tunnel_key(), allocating the
 * per-CPU metadata dst it depends on the first time this is called.
 * Returns NULL if that allocation fails.
 */
static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
{
	if (!md_dst) {
		/* race is not possible, since it's called from
		 * verifier that is holding verifier mutex
		 */
		md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
		if (!md_dst)
			return NULL;
	}
	return &bpf_skb_set_tunnel_key_proto;
}

/* Map a helper id to its prototype for socket filter programs.
 * Returns NULL for helpers that are not available to them.
 */
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
		/* Without CAP_SYS_ADMIN this falls through to the
		 * default case and yields NULL.
		 */
	default:
		return NULL;
	}
}

/* Map a helper id to its prototype for the program types that use
 * tc_cls_act_ops. Helpers not listed here are resolved through
 * sk_filter_func_proto().
 */
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id)
{
	switch (func_id) {
	case BPF_FUNC_skb_store_bytes:
		return &bpf_skb_store_bytes_proto;
	case BPF_FUNC_skb_load_bytes:
		return &bpf_skb_load_bytes_proto;
	case BPF_FUNC_l3_csum_replace:
		return &bpf_l3_csum_replace_proto;
	case BPF_FUNC_l4_csum_replace:
		return &bpf_l4_csum_replace_proto;
	case BPF_FUNC_clone_redirect:
		return &bpf_clone_redirect_proto;
	case BPF_FUNC_get_cgroup_classid:
		return &bpf_get_cgroup_classid_proto;
	case BPF_FUNC_skb_vlan_push:
		return &bpf_skb_vlan_push_proto;
	case BPF_FUNC_skb_vlan_pop:
		return &bpf_skb_vlan_pop_proto;
	case BPF_FUNC_skb_get_tunnel_key:
		return &bpf_skb_get_tunnel_key_proto;
	case BPF_FUNC_skb_set_tunnel_key:
		/* May return NULL if the per-CPU dst cannot be allocated. */
		return bpf_get_skb_set_tunnel_key_proto();
	case BPF_FUNC_redirect:
		return &bpf_redirect_proto;
	case BPF_FUNC_get_route_realm:
		return &bpf_get_route_realm_proto;
	default:
		return sk_filter_func_proto(func_id);
	}
}

/* Checks shared by all program types in this file for an access of
 * @size bytes at offset @off into struct __sk_buff: the offset must lie
 * inside the structure, be a multiple of @size, and @size must be 4.
 * @type is not examined here.
 */
static bool __is_valid_access(int off, int size, enum bpf_access_type type)
{
	/* check bounds */
	if (off < 0 || off >= sizeof(struct __sk_buff))
		return false;

	/* disallow misaligned access */
	if (off % size != 0)
		return false;

	/* all __sk_buff fields are __u32 */
	if (size != 4)
		return false;

	return true;
}

/* Context access rules for socket filter programs: tc_classid may not
 * be accessed at all, writes are limited to cb[0] through cb[4], and
 * everything else is subject to __is_valid_access().
 */
static bool sk_filter_is_valid_access(int off, int size,
				      enum bpf_access_type type)
{
	if (off == offsetof(struct __sk_buff, tc_classid))
		return false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}

	return __is_valid_access(off, size, type);
}

/* Context access rules for the program types that use tc_cls_act_ops:
 * tc_classid is write-only, writes are otherwise limited to mark,
 * tc_index, priority and cb[0] through cb[4], and everything else is
 * subject to __is_valid_access().
 */
static bool tc_cls_act_is_valid_access(int off, int size,
				       enum bpf_access_type type)
{
	/* NOTE(review): this returns before __is_valid_access() runs, so
	 * @size is not checked for tc_classid here - confirm intended.
	 */
	if (off == offsetof(struct __sk_buff, tc_classid))
		return type == BPF_WRITE ? true : false;

	if (type == BPF_WRITE) {
		switch (off) {
		case offsetof(struct __sk_buff, mark):
		case offsetof(struct __sk_buff, tc_index):
		case offsetof(struct __sk_buff, priority):
		case offsetof(struct __sk_buff, cb[0]) ...
		     offsetof(struct __sk_buff, cb[4]):
			break;
		default:
			return false;
		}
	}
	return __is_valid_access(off, size, type);
}

/* Translate one access to struct __sk_buff at offset @ctx_off into
 * instruction(s) operating on struct sk_buff, written to @insn_buf.
 *
 * @type selects between load and store for the fields that support
 * both. @dst_reg and @src_reg are passed to the emitted instructions.
 * @prog is only touched to set cb_access when cb[] is accessed.
 *
 * Returns the number of instructions written, or, for the fields
 * handled by convert_skb_access(), that function's return value
 * (presumably also an instruction count - defined outside this view).
 * An offset that matches no case emits nothing and returns 0.
 */
static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
				      int src_reg, int ctx_off,
				      struct bpf_insn *insn_buf,
				      struct bpf_prog *prog)
{
	struct bpf_insn *insn = insn_buf;

	switch (ctx_off) {
	case offsetof(struct __sk_buff, len):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, len));
		break;

	case offsetof(struct __sk_buff, protocol):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, protocol));
		break;

	case offsetof(struct __sk_buff, vlan_proto):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);

		*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
				      offsetof(struct sk_buff, vlan_proto));
		break;

	case offsetof(struct __sk_buff, priority):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, priority));
		break;

	case offsetof(struct __sk_buff, ingress_ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, skb_iif));
		break;

	case offsetof(struct __sk_buff, ifindex):
		BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);

		/* Load skb->dev, skip the next instruction if it is
		 * NULL (leaving 0 in dst_reg), else load dev->ifindex.
		 */
		*insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)),
				      dst_reg, src_reg,
				      offsetof(struct sk_buff, dev));
		*insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
				      offsetof(struct net_device, ifindex));
		break;

	case offsetof(struct __sk_buff, hash):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);

		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
				      offsetof(struct sk_buff, hash));
		break;

	case offsetof(struct __sk_buff, mark):
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
					      offsetof(struct sk_buff, mark));
		break;

	case offsetof(struct __sk_buff, pkt_type):
		return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, queue_mapping):
		return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_present):
		return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, vlan_tci):
		return convert_skb_access(SKF_AD_VLAN_TAG,
					  dst_reg, src_reg, insn);

	case offsetof(struct __sk_buff, cb[0]) ...
	     offsetof(struct __sk_buff, cb[4]):
		BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);

		/* Record that the program touches cb[], then rebase the
		 * offset from __sk_buff.cb[] onto the data member of
		 * the qdisc_skb_cb stored in sk_buff.cb.
		 */
		prog->cb_access = 1;
		ctx_off -= offsetof(struct __sk_buff, cb[0]);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, data);
		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		else
			*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_classid):
		/* Rebase onto qdisc_skb_cb.tc_classid inside sk_buff.cb.
		 * Only a half-word store is emitted; any other access
		 * type triggers the WARN_ON().
		 */
		ctx_off -= offsetof(struct __sk_buff, tc_classid);
		ctx_off += offsetof(struct sk_buff, cb);
		ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
		WARN_ON(type != BPF_WRITE);
		*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
		break;

	case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2);

		if (type == BPF_WRITE)
			*insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		else
			*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
					      offsetof(struct sk_buff, tc_index));
		break;
#else
		/* Without CONFIG_NET_SCHED a write becomes a register
		 * move onto itself and a read yields 0.
		 */
		if (type == BPF_WRITE)
			*insn++ = BPF_MOV64_REG(dst_reg, dst_reg);
		else
			*insn++ = BPF_MOV64_IMM(dst_reg, 0);
		break;
#endif
	}

	return insn - insn_buf;
}

/* Verifier callbacks for socket filter programs. */
static const struct bpf_verifier_ops sk_filter_ops = {
	.get_func_proto = sk_filter_func_proto,
	.is_valid_access = sk_filter_is_valid_access,
	.convert_ctx_access = bpf_net_convert_ctx_access,
};

/* Verifier callbacks shared by the program types that reference
 * tc_cls_act_ops.
 */
static const struct bpf_verifier_ops tc_cls_act_ops = {
	.get_func_proto = tc_cls_act_func_proto,
	.is_valid_access = tc_cls_act_is_valid_access,
	.convert_ctx_access = bpf_net_convert_ctx_access,
};

/* Registration entry binding BPF_PROG_TYPE_SOCKET_FILTER to
 * sk_filter_ops.
 */
static struct bpf_prog_type_list sk_filter_type __read_mostly = {
	.ops = &sk_filter_ops,
	.type = BPF_PROG_TYPE_SOCKET_FILTER,
};

static struct bpf_prog_type_list sched_cls_type __read_mostly = { 2087 .ops = &tc_cls_act_ops, 2088 .type = BPF_PROG_TYPE_SCHED_CLS, 2089 }; 2090 2091 static struct bpf_prog_type_list sched_act_type __read_mostly = { 2092 .ops = &tc_cls_act_ops, 2093 .type = BPF_PROG_TYPE_SCHED_ACT, 2094 }; 2095 2096 static int __init register_sk_filter_ops(void) 2097 { 2098 bpf_register_prog_type(&sk_filter_type); 2099 bpf_register_prog_type(&sched_cls_type); 2100 bpf_register_prog_type(&sched_act_type); 2101 2102 return 0; 2103 } 2104 late_initcall(register_sk_filter_ops); 2105 2106 int sk_detach_filter(struct sock *sk) 2107 { 2108 int ret = -ENOENT; 2109 struct sk_filter *filter; 2110 2111 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 2112 return -EPERM; 2113 2114 filter = rcu_dereference_protected(sk->sk_filter, 2115 sock_owned_by_user(sk)); 2116 if (filter) { 2117 RCU_INIT_POINTER(sk->sk_filter, NULL); 2118 sk_filter_uncharge(sk, filter); 2119 ret = 0; 2120 } 2121 2122 return ret; 2123 } 2124 EXPORT_SYMBOL_GPL(sk_detach_filter); 2125 2126 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, 2127 unsigned int len) 2128 { 2129 struct sock_fprog_kern *fprog; 2130 struct sk_filter *filter; 2131 int ret = 0; 2132 2133 lock_sock(sk); 2134 filter = rcu_dereference_protected(sk->sk_filter, 2135 sock_owned_by_user(sk)); 2136 if (!filter) 2137 goto out; 2138 2139 /* We're copying the filter that has been originally attached, 2140 * so no conversion/decode needed anymore. eBPF programs that 2141 * have no original program cannot be dumped through this. 2142 */ 2143 ret = -EACCES; 2144 fprog = filter->prog->orig_prog; 2145 if (!fprog) 2146 goto out; 2147 2148 ret = fprog->len; 2149 if (!len) 2150 /* User space only enquires number of filter blocks. 
*/ 2151 goto out; 2152 2153 ret = -EINVAL; 2154 if (len < fprog->len) 2155 goto out; 2156 2157 ret = -EFAULT; 2158 if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) 2159 goto out; 2160 2161 /* Instead of bytes, the API requests to return the number 2162 * of filter blocks. 2163 */ 2164 ret = fprog->len; 2165 out: 2166 release_sock(sk); 2167 return ret; 2168 } 2169