1 /* 2 * Linux Socket Filter - Kernel level socket filtering 3 * 4 * Based on the design of the Berkeley Packet Filter. The new 5 * internal format has been designed by PLUMgrid: 6 * 7 * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com 8 * 9 * Authors: 10 * 11 * Jay Schulist <jschlst@samba.org> 12 * Alexei Starovoitov <ast@plumgrid.com> 13 * Daniel Borkmann <dborkman@redhat.com> 14 * 15 * This program is free software; you can redistribute it and/or 16 * modify it under the terms of the GNU General Public License 17 * as published by the Free Software Foundation; either version 18 * 2 of the License, or (at your option) any later version. 19 * 20 * Andi Kleen - Fix a few bad bugs and races. 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 22 */ 23 24 #include <linux/module.h> 25 #include <linux/types.h> 26 #include <linux/mm.h> 27 #include <linux/fcntl.h> 28 #include <linux/socket.h> 29 #include <linux/sock_diag.h> 30 #include <linux/in.h> 31 #include <linux/inet.h> 32 #include <linux/netdevice.h> 33 #include <linux/if_packet.h> 34 #include <linux/if_arp.h> 35 #include <linux/gfp.h> 36 #include <net/ip.h> 37 #include <net/protocol.h> 38 #include <net/netlink.h> 39 #include <linux/skbuff.h> 40 #include <net/sock.h> 41 #include <net/flow_dissector.h> 42 #include <linux/errno.h> 43 #include <linux/timer.h> 44 #include <linux/uaccess.h> 45 #include <asm/unaligned.h> 46 #include <linux/filter.h> 47 #include <linux/ratelimit.h> 48 #include <linux/seccomp.h> 49 #include <linux/if_vlan.h> 50 #include <linux/bpf.h> 51 #include <net/sch_generic.h> 52 #include <net/cls_cgroup.h> 53 #include <net/dst_metadata.h> 54 #include <net/dst.h> 55 #include <net/sock_reuseport.h> 56 #include <net/busy_poll.h> 57 58 /** 59 * sk_filter_trim_cap - run a packet through a socket filter 60 * @sk: sock associated with &sk_buff 61 * @skb: buffer to filter 62 * @cap: limit on how short the eBPF program may trim the packet 63 * 64 * Run the eBPF program and then cut skb->data to correct size returned by 65 * the program. If pkt_len is 0 we toss packet. If skb->len is smaller 66 * than pkt_len we keep whole skb->data. This is the socket level 67 * wrapper to BPF_PROG_RUN. It returns 0 if the packet should 68 * be accepted or -EPERM if the packet should be tossed. 69 * 70 */ 71 int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) 72 { 73 int err; 74 struct sk_filter *filter; 75 76 /* 77 * If the skb was allocated from pfmemalloc reserves, only 78 * allow SOCK_MEMALLOC sockets to use it as this socket is 79 * helping free memory 80 */ 81 if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { 82 NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); 83 return -ENOMEM; 84 } 85 err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); 86 if (err) 87 return err; 88 89 err = security_sock_rcv_skb(sk, skb); 90 if (err) 91 return err; 92 93 rcu_read_lock(); 94 filter = rcu_dereference(sk->sk_filter); 95 if (filter) { 96 struct sock *save_sk = skb->sk; 97 unsigned int pkt_len; 98 99 skb->sk = sk; 100 pkt_len = bpf_prog_run_save_cb(filter->prog, skb); 101 skb->sk = save_sk; 102 err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; 103 } 104 rcu_read_unlock(); 105 106 return err; 107 } 108 EXPORT_SYMBOL(sk_filter_trim_cap); 109 110 BPF_CALL_1(__skb_get_pay_offset, struct sk_buff *, skb) 111 { 112 return skb_get_poff(skb); 113 } 114 115 BPF_CALL_3(__skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) 116 { 117 struct nlattr *nla; 118 119 if (skb_is_nonlinear(skb)) 120 return 0; 121 122 if (skb->len < sizeof(struct nlattr)) 123 return 0; 124 125 if (a > skb->len - sizeof(struct nlattr)) 126 return 0; 127 128 nla = nla_find((struct nlattr *) &skb->data[a], skb->len - a, x); 129 if (nla) 130 return (void *) nla - (void *) skb->data; 131 132 return 0; 133 } 134 135 BPF_CALL_3(__skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) 136 { 137 struct nlattr *nla; 138 139 if (skb_is_nonlinear(skb)) 140 return 0; 141 142 if (skb->len < sizeof(struct nlattr)) 143 return 0; 144 145 if (a > skb->len - sizeof(struct nlattr)) 146 return 0; 147 148 nla = (struct nlattr *) &skb->data[a]; 149 if (nla->nla_len > skb->len - a) 150 return 0; 151 152 nla = nla_find_nested(nla, x); 153 if (nla) 154 return (void *) nla - (void *) skb->data; 155 156 return 0; 157 } 158 159 BPF_CALL_0(__get_raw_cpu_id) 160 { 161 return raw_smp_processor_id(); 162 } 163 164 static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { 165 .func = __get_raw_cpu_id, 166 .gpl_only = false, 167 .ret_type = RET_INTEGER, 168 }; 169 170 static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, 171 struct bpf_insn *insn_buf) 172 { 173 struct bpf_insn *insn = insn_buf; 174 175 switch (skb_field) { 176 case SKF_AD_MARK: 177 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 178 179 *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, 180 offsetof(struct sk_buff, mark)); 181 break; 182 183 case SKF_AD_PKTTYPE: 184 *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); 185 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); 186 #ifdef __BIG_ENDIAN_BITFIELD 187 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); 188 #endif 189 break; 190 191 case SKF_AD_QUEUE: 192 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); 193 194 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 195 offsetof(struct sk_buff, queue_mapping)); 196 break; 197 198 case SKF_AD_VLAN_TAG: 199 case SKF_AD_VLAN_TAG_PRESENT: 200 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); 201 BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); 202 203 /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ 204 *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, 205 offsetof(struct sk_buff, vlan_tci)); 206 if (skb_field == SKF_AD_VLAN_TAG) { 207 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 208 ~VLAN_TAG_PRESENT); 209 } else { 210 /* dst_reg >>= 12 */ 211 *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); 212 /* dst_reg &= 1 */ 213 *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); 214 } 215 break; 216 } 217 218 return insn - insn_buf; 219 } 220 221 static bool convert_bpf_extensions(struct sock_filter *fp, 222 struct bpf_insn **insnp) 223 { 224 struct bpf_insn *insn = *insnp; 225 u32 cnt; 226 227 switch (fp->k) { 228 case SKF_AD_OFF + SKF_AD_PROTOCOL: 229 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 230 231 /* A = *(u16 *) (CTX + offsetof(protocol)) */ 232 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 233 offsetof(struct sk_buff, protocol)); 234 /* A = ntohs(A) [emitting a nop or swap16] */ 235 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 236 break; 237 238 case SKF_AD_OFF + SKF_AD_PKTTYPE: 239 cnt = 
convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); 240 insn += cnt - 1; 241 break; 242 243 case SKF_AD_OFF + SKF_AD_IFINDEX: 244 case SKF_AD_OFF + SKF_AD_HATYPE: 245 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 246 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); 247 248 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 249 BPF_REG_TMP, BPF_REG_CTX, 250 offsetof(struct sk_buff, dev)); 251 /* if (tmp != 0) goto pc + 1 */ 252 *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1); 253 *insn++ = BPF_EXIT_INSN(); 254 if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) 255 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP, 256 offsetof(struct net_device, ifindex)); 257 else 258 *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP, 259 offsetof(struct net_device, type)); 260 break; 261 262 case SKF_AD_OFF + SKF_AD_MARK: 263 cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); 264 insn += cnt - 1; 265 break; 266 267 case SKF_AD_OFF + SKF_AD_RXHASH: 268 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 269 270 *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, 271 offsetof(struct sk_buff, hash)); 272 break; 273 274 case SKF_AD_OFF + SKF_AD_QUEUE: 275 cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); 276 insn += cnt - 1; 277 break; 278 279 case SKF_AD_OFF + SKF_AD_VLAN_TAG: 280 cnt = convert_skb_access(SKF_AD_VLAN_TAG, 281 BPF_REG_A, BPF_REG_CTX, insn); 282 insn += cnt - 1; 283 break; 284 285 case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: 286 cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 287 BPF_REG_A, BPF_REG_CTX, insn); 288 insn += cnt - 1; 289 break; 290 291 case SKF_AD_OFF + SKF_AD_VLAN_TPID: 292 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 293 294 /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */ 295 *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, 296 offsetof(struct sk_buff, vlan_proto)); 297 /* A = ntohs(A) [emitting a nop or swap16] */ 298 *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16); 299 break; 300 301 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 302 case SKF_AD_OFF + SKF_AD_NLATTR: 303 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 304 case SKF_AD_OFF + SKF_AD_CPU: 305 case SKF_AD_OFF + SKF_AD_RANDOM: 306 /* arg1 = CTX */ 307 *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); 308 /* arg2 = A */ 309 *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A); 310 /* arg3 = X */ 311 *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X); 312 /* Emit call(arg1=CTX, arg2=A, arg3=X) */ 313 switch (fp->k) { 314 case SKF_AD_OFF + SKF_AD_PAY_OFFSET: 315 *insn = BPF_EMIT_CALL(__skb_get_pay_offset); 316 break; 317 case SKF_AD_OFF + SKF_AD_NLATTR: 318 *insn = BPF_EMIT_CALL(__skb_get_nlattr); 319 break; 320 case SKF_AD_OFF + SKF_AD_NLATTR_NEST: 321 *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); 322 break; 323 case SKF_AD_OFF + SKF_AD_CPU: 324 *insn = BPF_EMIT_CALL(__get_raw_cpu_id); 325 break; 326 case SKF_AD_OFF + SKF_AD_RANDOM: 327 *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); 328 bpf_user_rnd_init_once(); 329 break; 330 } 331 break; 332 333 case SKF_AD_OFF + SKF_AD_ALU_XOR_X: 334 /* A ^= X */ 335 *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X); 336 break; 337 338 default: 339 /* This is just a dummy call to avoid letting the compiler 340 * evict __bpf_call_base() as an optimization. Placed here 341 * where no-one bothers. 
342 */ 343 BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); 344 return false; 345 } 346 347 *insnp = insn; 348 return true; 349 } 350 351 /** 352 * bpf_convert_filter - convert filter program 353 * @prog: the user passed filter program 354 * @len: the length of the user passed filter program 355 * @new_prog: buffer where converted program will be stored 356 * @new_len: pointer to store length of converted program 357 * 358 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' 359 * style extended BPF (eBPF). 360 * Conversion workflow: 361 * 362 * 1) First pass for calculating the new program length: 363 * bpf_convert_filter(old_prog, old_len, NULL, &new_len) 364 * 365 * 2) 2nd pass to remap in two passes: 1st pass finds new 366 * jump offsets, 2nd pass remapping: 367 * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); 368 * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); 369 */ 370 static int bpf_convert_filter(struct sock_filter *prog, int len, 371 struct bpf_insn *new_prog, int *new_len) 372 { 373 int new_flen = 0, pass = 0, target, i; 374 struct bpf_insn *new_insn; 375 struct sock_filter *fp; 376 int *addrs = NULL; 377 u8 bpf_src; 378 379 BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); 380 BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); 381 382 if (len <= 0 || len > BPF_MAXINSNS) 383 return -EINVAL; 384 385 if (new_prog) { 386 addrs = kcalloc(len, sizeof(*addrs), 387 GFP_KERNEL | __GFP_NOWARN); 388 if (!addrs) 389 return -ENOMEM; 390 } 391 392 do_pass: 393 new_insn = new_prog; 394 fp = prog; 395 396 /* Classic BPF related prologue emission. */ 397 if (new_insn) { 398 /* Classic BPF expects A and X to be reset first. These need 399 * to be guaranteed to be the first two instructions. 400 */ 401 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); 402 *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); 403 404 /* All programs must keep CTX in callee saved BPF_REG_CTX. 405 * In eBPF case it's done by the compiler, here we need to 406 * do this ourself. Initial CTX is present in BPF_REG_ARG1. 407 */ 408 *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); 409 } else { 410 new_insn += 3; 411 } 412 413 for (i = 0; i < len; fp++, i++) { 414 struct bpf_insn tmp_insns[6] = { }; 415 struct bpf_insn *insn = tmp_insns; 416 417 if (addrs) 418 addrs[i] = new_insn - new_prog; 419 420 switch (fp->code) { 421 /* All arithmetic insns and skb loads map as-is. */ 422 case BPF_ALU | BPF_ADD | BPF_X: 423 case BPF_ALU | BPF_ADD | BPF_K: 424 case BPF_ALU | BPF_SUB | BPF_X: 425 case BPF_ALU | BPF_SUB | BPF_K: 426 case BPF_ALU | BPF_AND | BPF_X: 427 case BPF_ALU | BPF_AND | BPF_K: 428 case BPF_ALU | BPF_OR | BPF_X: 429 case BPF_ALU | BPF_OR | BPF_K: 430 case BPF_ALU | BPF_LSH | BPF_X: 431 case BPF_ALU | BPF_LSH | BPF_K: 432 case BPF_ALU | BPF_RSH | BPF_X: 433 case BPF_ALU | BPF_RSH | BPF_K: 434 case BPF_ALU | BPF_XOR | BPF_X: 435 case BPF_ALU | BPF_XOR | BPF_K: 436 case BPF_ALU | BPF_MUL | BPF_X: 437 case BPF_ALU | BPF_MUL | BPF_K: 438 case BPF_ALU | BPF_DIV | BPF_X: 439 case BPF_ALU | BPF_DIV | BPF_K: 440 case BPF_ALU | BPF_MOD | BPF_X: 441 case BPF_ALU | BPF_MOD | BPF_K: 442 case BPF_ALU | BPF_NEG: 443 case BPF_LD | BPF_ABS | BPF_W: 444 case BPF_LD | BPF_ABS | BPF_H: 445 case BPF_LD | BPF_ABS | BPF_B: 446 case BPF_LD | BPF_IND | BPF_W: 447 case BPF_LD | BPF_IND | BPF_H: 448 case BPF_LD | BPF_IND | BPF_B: 449 /* Check for overloaded BPF extension and 450 * directly convert it if found, otherwise 451 * just move on with mapping. 
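			 *
			 * As an illustration (a hypothetical user filter, not
			 * an instruction taken from this file), a classic
			 * statement such as
			 *
			 *	BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 *		 SKF_AD_OFF + SKF_AD_PROTOCOL)
			 *
			 * is not a real packet load: convert_bpf_extensions()
			 * spots the SKF_AD_OFF offset and emits a direct load
			 * of skb->protocol plus an endian conversion instead
			 * of the generic LD_ABS mapping below.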
452 */ 453 if (BPF_CLASS(fp->code) == BPF_LD && 454 BPF_MODE(fp->code) == BPF_ABS && 455 convert_bpf_extensions(fp, &insn)) 456 break; 457 458 *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); 459 break; 460 461 /* Jump transformation cannot use BPF block macros 462 * everywhere as offset calculation and target updates 463 * require a bit more work than the rest, i.e. jump 464 * opcodes map as-is, but offsets need adjustment. 465 */ 466 467 #define BPF_EMIT_JMP \ 468 do { \ 469 if (target >= len || target < 0) \ 470 goto err; \ 471 insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ 472 /* Adjust pc relative offset for 2nd or 3rd insn. */ \ 473 insn->off -= insn - tmp_insns; \ 474 } while (0) 475 476 case BPF_JMP | BPF_JA: 477 target = i + fp->k + 1; 478 insn->code = fp->code; 479 BPF_EMIT_JMP; 480 break; 481 482 case BPF_JMP | BPF_JEQ | BPF_K: 483 case BPF_JMP | BPF_JEQ | BPF_X: 484 case BPF_JMP | BPF_JSET | BPF_K: 485 case BPF_JMP | BPF_JSET | BPF_X: 486 case BPF_JMP | BPF_JGT | BPF_K: 487 case BPF_JMP | BPF_JGT | BPF_X: 488 case BPF_JMP | BPF_JGE | BPF_K: 489 case BPF_JMP | BPF_JGE | BPF_X: 490 if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { 491 /* BPF immediates are signed, zero extend 492 * immediate into tmp register and use it 493 * in compare insn. 494 */ 495 *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k); 496 497 insn->dst_reg = BPF_REG_A; 498 insn->src_reg = BPF_REG_TMP; 499 bpf_src = BPF_X; 500 } else { 501 insn->dst_reg = BPF_REG_A; 502 insn->imm = fp->k; 503 bpf_src = BPF_SRC(fp->code); 504 insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0; 505 } 506 507 /* Common case where 'jump_false' is next insn. */ 508 if (fp->jf == 0) { 509 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 510 target = i + fp->jt + 1; 511 BPF_EMIT_JMP; 512 break; 513 } 514 515 /* Convert JEQ into JNE when 'jump_true' is next insn. */ 516 if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { 517 insn->code = BPF_JMP | BPF_JNE | bpf_src; 518 target = i + fp->jf + 1; 519 BPF_EMIT_JMP; 520 break; 521 } 522 523 /* Other jumps are mapped into two insns: Jxx and JA. */ 524 target = i + fp->jt + 1; 525 insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; 526 BPF_EMIT_JMP; 527 insn++; 528 529 insn->code = BPF_JMP | BPF_JA; 530 target = i + fp->jf + 1; 531 BPF_EMIT_JMP; 532 break; 533 534 /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ 535 case BPF_LDX | BPF_MSH | BPF_B: 536 /* tmp = A */ 537 *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); 538 /* A = BPF_R0 = *(u8 *) (skb->data + K) */ 539 *insn++ = BPF_LD_ABS(BPF_B, fp->k); 540 /* A &= 0xf */ 541 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); 542 /* A <<= 2 */ 543 *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); 544 /* X = A */ 545 *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); 546 /* A = tmp */ 547 *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); 548 break; 549 550 /* RET_K is remaped into 2 insns. RET_A case doesn't need an 551 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. 552 */ 553 case BPF_RET | BPF_A: 554 case BPF_RET | BPF_K: 555 if (BPF_RVAL(fp->code) == BPF_K) 556 *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, 557 0, fp->k); 558 *insn = BPF_EXIT_INSN(); 559 break; 560 561 /* Store to stack. */ 562 case BPF_ST: 563 case BPF_STX: 564 *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == 565 BPF_ST ? BPF_REG_A : BPF_REG_X, 566 -(BPF_MEMWORDS - fp->k) * 4); 567 break; 568 569 /* Load from stack. */ 570 case BPF_LD | BPF_MEM: 571 case BPF_LDX | BPF_MEM: 572 *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? 
					    BPF_REG_A : BPF_REG_X, BPF_REG_FP,
					    -(BPF_MEMWORDS - fp->k) * 4);
			break;

		/* A = K or X = K */
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
			*insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
					      BPF_REG_A : BPF_REG_X, fp->k);
			break;

		/* X = A */
		case BPF_MISC | BPF_TAX:
			*insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
			break;

		/* A = X */
		case BPF_MISC | BPF_TXA:
			*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
			break;

		/* A = skb->len or X = skb->len */
		case BPF_LD | BPF_W | BPF_LEN:
		case BPF_LDX | BPF_W | BPF_LEN:
			*insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
					    BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
					    offsetof(struct sk_buff, len));
			break;

		/* Access seccomp_data fields. */
		case BPF_LDX | BPF_ABS | BPF_W:
			/* A = *(u32 *) (ctx + K) */
			*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
			break;

		/* Unknown instruction. */
		default:
			goto err;
		}

		insn++;
		if (new_prog)
			memcpy(new_insn, tmp_insns,
			       sizeof(*insn) * (insn - tmp_insns));
		new_insn += insn - tmp_insns;
	}

	if (!new_prog) {
		/* Only calculating new length. */
		*new_len = new_insn - new_prog;
		return 0;
	}

	pass++;
	if (new_flen != new_insn - new_prog) {
		new_flen = new_insn - new_prog;
		if (pass > 2)
			goto err;
		goto do_pass;
	}

	kfree(addrs);
	BUG_ON(*new_len != new_flen);
	return 0;
err:
	kfree(addrs);
	return -EINVAL;
}

/* Security:
 *
 * As we don't want to clear the mem[] array for each packet going through
 * __bpf_prog_run(), we check that a filter loaded by a user never tries to
 * read a cell that has not previously been written to, and we check all
 * branches to be sure a malicious user doesn't try to abuse us.
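 *
 * As an illustration (a hypothetical filter, not one taken from this file),
 * the following two-instruction program reads scratch cell M[0] without ever
 * storing to it and is therefore rejected with -EINVAL:
 *
 *	BPF_STMT(BPF_LD | BPF_MEM, 0),
 *	BPF_STMT(BPF_RET | BPF_A, 0),
 *
 * check_load_and_stores() keeps one validity bit per mem[] cell and ANDs
 * those bits across both targets of every jump, so no execution path can
 * observe a cell that was not written first.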
648 */ 649 static int check_load_and_stores(const struct sock_filter *filter, int flen) 650 { 651 u16 *masks, memvalid = 0; /* One bit per cell, 16 cells */ 652 int pc, ret = 0; 653 654 BUILD_BUG_ON(BPF_MEMWORDS > 16); 655 656 masks = kmalloc_array(flen, sizeof(*masks), GFP_KERNEL); 657 if (!masks) 658 return -ENOMEM; 659 660 memset(masks, 0xff, flen * sizeof(*masks)); 661 662 for (pc = 0; pc < flen; pc++) { 663 memvalid &= masks[pc]; 664 665 switch (filter[pc].code) { 666 case BPF_ST: 667 case BPF_STX: 668 memvalid |= (1 << filter[pc].k); 669 break; 670 case BPF_LD | BPF_MEM: 671 case BPF_LDX | BPF_MEM: 672 if (!(memvalid & (1 << filter[pc].k))) { 673 ret = -EINVAL; 674 goto error; 675 } 676 break; 677 case BPF_JMP | BPF_JA: 678 /* A jump must set masks on target */ 679 masks[pc + 1 + filter[pc].k] &= memvalid; 680 memvalid = ~0; 681 break; 682 case BPF_JMP | BPF_JEQ | BPF_K: 683 case BPF_JMP | BPF_JEQ | BPF_X: 684 case BPF_JMP | BPF_JGE | BPF_K: 685 case BPF_JMP | BPF_JGE | BPF_X: 686 case BPF_JMP | BPF_JGT | BPF_K: 687 case BPF_JMP | BPF_JGT | BPF_X: 688 case BPF_JMP | BPF_JSET | BPF_K: 689 case BPF_JMP | BPF_JSET | BPF_X: 690 /* A jump must set masks on targets */ 691 masks[pc + 1 + filter[pc].jt] &= memvalid; 692 masks[pc + 1 + filter[pc].jf] &= memvalid; 693 memvalid = ~0; 694 break; 695 } 696 } 697 error: 698 kfree(masks); 699 return ret; 700 } 701 702 static bool chk_code_allowed(u16 code_to_probe) 703 { 704 static const bool codes[] = { 705 /* 32 bit ALU operations */ 706 [BPF_ALU | BPF_ADD | BPF_K] = true, 707 [BPF_ALU | BPF_ADD | BPF_X] = true, 708 [BPF_ALU | BPF_SUB | BPF_K] = true, 709 [BPF_ALU | BPF_SUB | BPF_X] = true, 710 [BPF_ALU | BPF_MUL | BPF_K] = true, 711 [BPF_ALU | BPF_MUL | BPF_X] = true, 712 [BPF_ALU | BPF_DIV | BPF_K] = true, 713 [BPF_ALU | BPF_DIV | BPF_X] = true, 714 [BPF_ALU | BPF_MOD | BPF_K] = true, 715 [BPF_ALU | BPF_MOD | BPF_X] = true, 716 [BPF_ALU | BPF_AND | BPF_K] = true, 717 [BPF_ALU | BPF_AND | BPF_X] = true, 718 [BPF_ALU | BPF_OR | BPF_K] = true, 719 [BPF_ALU | BPF_OR | BPF_X] = true, 720 [BPF_ALU | BPF_XOR | BPF_K] = true, 721 [BPF_ALU | BPF_XOR | BPF_X] = true, 722 [BPF_ALU | BPF_LSH | BPF_K] = true, 723 [BPF_ALU | BPF_LSH | BPF_X] = true, 724 [BPF_ALU | BPF_RSH | BPF_K] = true, 725 [BPF_ALU | BPF_RSH | BPF_X] = true, 726 [BPF_ALU | BPF_NEG] = true, 727 /* Load instructions */ 728 [BPF_LD | BPF_W | BPF_ABS] = true, 729 [BPF_LD | BPF_H | BPF_ABS] = true, 730 [BPF_LD | BPF_B | BPF_ABS] = true, 731 [BPF_LD | BPF_W | BPF_LEN] = true, 732 [BPF_LD | BPF_W | BPF_IND] = true, 733 [BPF_LD | BPF_H | BPF_IND] = true, 734 [BPF_LD | BPF_B | BPF_IND] = true, 735 [BPF_LD | BPF_IMM] = true, 736 [BPF_LD | BPF_MEM] = true, 737 [BPF_LDX | BPF_W | BPF_LEN] = true, 738 [BPF_LDX | BPF_B | BPF_MSH] = true, 739 [BPF_LDX | BPF_IMM] = true, 740 [BPF_LDX | BPF_MEM] = true, 741 /* Store instructions */ 742 [BPF_ST] = true, 743 [BPF_STX] = true, 744 /* Misc instructions */ 745 [BPF_MISC | BPF_TAX] = true, 746 [BPF_MISC | BPF_TXA] = true, 747 /* Return instructions */ 748 [BPF_RET | BPF_K] = true, 749 [BPF_RET | BPF_A] = true, 750 /* Jump instructions */ 751 [BPF_JMP | BPF_JA] = true, 752 [BPF_JMP | BPF_JEQ | BPF_K] = true, 753 [BPF_JMP | BPF_JEQ | BPF_X] = true, 754 [BPF_JMP | BPF_JGE | BPF_K] = true, 755 [BPF_JMP | BPF_JGE | BPF_X] = true, 756 [BPF_JMP | BPF_JGT | BPF_K] = true, 757 [BPF_JMP | BPF_JGT | BPF_X] = true, 758 [BPF_JMP | BPF_JSET | BPF_K] = true, 759 [BPF_JMP | BPF_JSET | BPF_X] = true, 760 }; 761 762 if (code_to_probe >= ARRAY_SIZE(codes)) 763 return false; 764 
765 return codes[code_to_probe]; 766 } 767 768 static bool bpf_check_basics_ok(const struct sock_filter *filter, 769 unsigned int flen) 770 { 771 if (filter == NULL) 772 return false; 773 if (flen == 0 || flen > BPF_MAXINSNS) 774 return false; 775 776 return true; 777 } 778 779 /** 780 * bpf_check_classic - verify socket filter code 781 * @filter: filter to verify 782 * @flen: length of filter 783 * 784 * Check the user's filter code. If we let some ugly 785 * filter code slip through kaboom! The filter must contain 786 * no references or jumps that are out of range, no illegal 787 * instructions, and must end with a RET instruction. 788 * 789 * All jumps are forward as they are not signed. 790 * 791 * Returns 0 if the rule set is legal or -EINVAL if not. 792 */ 793 static int bpf_check_classic(const struct sock_filter *filter, 794 unsigned int flen) 795 { 796 bool anc_found; 797 int pc; 798 799 /* Check the filter code now */ 800 for (pc = 0; pc < flen; pc++) { 801 const struct sock_filter *ftest = &filter[pc]; 802 803 /* May we actually operate on this code? */ 804 if (!chk_code_allowed(ftest->code)) 805 return -EINVAL; 806 807 /* Some instructions need special checks */ 808 switch (ftest->code) { 809 case BPF_ALU | BPF_DIV | BPF_K: 810 case BPF_ALU | BPF_MOD | BPF_K: 811 /* Check for division by zero */ 812 if (ftest->k == 0) 813 return -EINVAL; 814 break; 815 case BPF_ALU | BPF_LSH | BPF_K: 816 case BPF_ALU | BPF_RSH | BPF_K: 817 if (ftest->k >= 32) 818 return -EINVAL; 819 break; 820 case BPF_LD | BPF_MEM: 821 case BPF_LDX | BPF_MEM: 822 case BPF_ST: 823 case BPF_STX: 824 /* Check for invalid memory addresses */ 825 if (ftest->k >= BPF_MEMWORDS) 826 return -EINVAL; 827 break; 828 case BPF_JMP | BPF_JA: 829 /* Note, the large ftest->k might cause loops. 830 * Compare this with conditional jumps below, 831 * where offsets are limited. 
--ANK (981016) 832 */ 833 if (ftest->k >= (unsigned int)(flen - pc - 1)) 834 return -EINVAL; 835 break; 836 case BPF_JMP | BPF_JEQ | BPF_K: 837 case BPF_JMP | BPF_JEQ | BPF_X: 838 case BPF_JMP | BPF_JGE | BPF_K: 839 case BPF_JMP | BPF_JGE | BPF_X: 840 case BPF_JMP | BPF_JGT | BPF_K: 841 case BPF_JMP | BPF_JGT | BPF_X: 842 case BPF_JMP | BPF_JSET | BPF_K: 843 case BPF_JMP | BPF_JSET | BPF_X: 844 /* Both conditionals must be safe */ 845 if (pc + ftest->jt + 1 >= flen || 846 pc + ftest->jf + 1 >= flen) 847 return -EINVAL; 848 break; 849 case BPF_LD | BPF_W | BPF_ABS: 850 case BPF_LD | BPF_H | BPF_ABS: 851 case BPF_LD | BPF_B | BPF_ABS: 852 anc_found = false; 853 if (bpf_anc_helper(ftest) & BPF_ANC) 854 anc_found = true; 855 /* Ancillary operation unknown or unsupported */ 856 if (anc_found == false && ftest->k >= SKF_AD_OFF) 857 return -EINVAL; 858 } 859 } 860 861 /* Last instruction must be a RET code */ 862 switch (filter[flen - 1].code) { 863 case BPF_RET | BPF_K: 864 case BPF_RET | BPF_A: 865 return check_load_and_stores(filter, flen); 866 } 867 868 return -EINVAL; 869 } 870 871 static int bpf_prog_store_orig_filter(struct bpf_prog *fp, 872 const struct sock_fprog *fprog) 873 { 874 unsigned int fsize = bpf_classic_proglen(fprog); 875 struct sock_fprog_kern *fkprog; 876 877 fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL); 878 if (!fp->orig_prog) 879 return -ENOMEM; 880 881 fkprog = fp->orig_prog; 882 fkprog->len = fprog->len; 883 884 fkprog->filter = kmemdup(fp->insns, fsize, 885 GFP_KERNEL | __GFP_NOWARN); 886 if (!fkprog->filter) { 887 kfree(fp->orig_prog); 888 return -ENOMEM; 889 } 890 891 return 0; 892 } 893 894 static void bpf_release_orig_filter(struct bpf_prog *fp) 895 { 896 struct sock_fprog_kern *fprog = fp->orig_prog; 897 898 if (fprog) { 899 kfree(fprog->filter); 900 kfree(fprog); 901 } 902 } 903 904 static void __bpf_prog_release(struct bpf_prog *prog) 905 { 906 if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { 907 bpf_prog_put(prog); 908 } else { 909 bpf_release_orig_filter(prog); 910 bpf_prog_free(prog); 911 } 912 } 913 914 static void __sk_filter_release(struct sk_filter *fp) 915 { 916 __bpf_prog_release(fp->prog); 917 kfree(fp); 918 } 919 920 /** 921 * sk_filter_release_rcu - Release a socket filter by rcu_head 922 * @rcu: rcu_head that contains the sk_filter to free 923 */ 924 static void sk_filter_release_rcu(struct rcu_head *rcu) 925 { 926 struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); 927 928 __sk_filter_release(fp); 929 } 930 931 /** 932 * sk_filter_release - release a socket filter 933 * @fp: filter to remove 934 * 935 * Remove a filter from a socket and release its resources. 
936 */ 937 static void sk_filter_release(struct sk_filter *fp) 938 { 939 if (refcount_dec_and_test(&fp->refcnt)) 940 call_rcu(&fp->rcu, sk_filter_release_rcu); 941 } 942 943 void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) 944 { 945 u32 filter_size = bpf_prog_size(fp->prog->len); 946 947 atomic_sub(filter_size, &sk->sk_omem_alloc); 948 sk_filter_release(fp); 949 } 950 951 /* try to charge the socket memory if there is space available 952 * return true on success 953 */ 954 static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) 955 { 956 u32 filter_size = bpf_prog_size(fp->prog->len); 957 958 /* same check as in sock_kmalloc() */ 959 if (filter_size <= sysctl_optmem_max && 960 atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { 961 atomic_add(filter_size, &sk->sk_omem_alloc); 962 return true; 963 } 964 return false; 965 } 966 967 bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) 968 { 969 bool ret = __sk_filter_charge(sk, fp); 970 if (ret) 971 refcount_inc(&fp->refcnt); 972 return ret; 973 } 974 975 static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) 976 { 977 struct sock_filter *old_prog; 978 struct bpf_prog *old_fp; 979 int err, new_len, old_len = fp->len; 980 981 /* We are free to overwrite insns et al right here as it 982 * won't be used at this point in time anymore internally 983 * after the migration to the internal BPF instruction 984 * representation. 985 */ 986 BUILD_BUG_ON(sizeof(struct sock_filter) != 987 sizeof(struct bpf_insn)); 988 989 /* Conversion cannot happen on overlapping memory areas, 990 * so we need to keep the user BPF around until the 2nd 991 * pass. At this time, the user BPF is stored in fp->insns. 992 */ 993 old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), 994 GFP_KERNEL | __GFP_NOWARN); 995 if (!old_prog) { 996 err = -ENOMEM; 997 goto out_err; 998 } 999 1000 /* 1st pass: calculate the new program length. */ 1001 err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); 1002 if (err) 1003 goto out_err_free; 1004 1005 /* Expand fp for appending the new filter representation. */ 1006 old_fp = fp; 1007 fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0); 1008 if (!fp) { 1009 /* The old_fp is still around in case we couldn't 1010 * allocate new memory, so uncharge on that one. 1011 */ 1012 fp = old_fp; 1013 err = -ENOMEM; 1014 goto out_err_free; 1015 } 1016 1017 fp->len = new_len; 1018 1019 /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ 1020 err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); 1021 if (err) 1022 /* 2nd bpf_convert_filter() can fail only if it fails 1023 * to allocate memory, remapping must succeed. Note, 1024 * that at this time old_fp has already been released 1025 * by krealloc(). 1026 */ 1027 goto out_err_free; 1028 1029 /* We are guaranteed to never error here with cBPF to eBPF 1030 * transitions, since there's no issue with type compatibility 1031 * checks on program arrays. 
1032 */ 1033 fp = bpf_prog_select_runtime(fp, &err); 1034 1035 kfree(old_prog); 1036 return fp; 1037 1038 out_err_free: 1039 kfree(old_prog); 1040 out_err: 1041 __bpf_prog_release(fp); 1042 return ERR_PTR(err); 1043 } 1044 1045 static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp, 1046 bpf_aux_classic_check_t trans) 1047 { 1048 int err; 1049 1050 fp->bpf_func = NULL; 1051 fp->jited = 0; 1052 1053 err = bpf_check_classic(fp->insns, fp->len); 1054 if (err) { 1055 __bpf_prog_release(fp); 1056 return ERR_PTR(err); 1057 } 1058 1059 /* There might be additional checks and transformations 1060 * needed on classic filters, f.e. in case of seccomp. 1061 */ 1062 if (trans) { 1063 err = trans(fp->insns, fp->len); 1064 if (err) { 1065 __bpf_prog_release(fp); 1066 return ERR_PTR(err); 1067 } 1068 } 1069 1070 /* Probe if we can JIT compile the filter and if so, do 1071 * the compilation of the filter. 1072 */ 1073 bpf_jit_compile(fp); 1074 1075 /* JIT compiler couldn't process this filter, so do the 1076 * internal BPF translation for the optimized interpreter. 1077 */ 1078 if (!fp->jited) 1079 fp = bpf_migrate_filter(fp); 1080 1081 return fp; 1082 } 1083 1084 /** 1085 * bpf_prog_create - create an unattached filter 1086 * @pfp: the unattached filter that is created 1087 * @fprog: the filter program 1088 * 1089 * Create a filter independent of any socket. We first run some 1090 * sanity checks on it to make sure it does not explode on us later. 1091 * If an error occurs or there is insufficient memory for the filter 1092 * a negative errno code is returned. On success the return is zero. 1093 */ 1094 int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog) 1095 { 1096 unsigned int fsize = bpf_classic_proglen(fprog); 1097 struct bpf_prog *fp; 1098 1099 /* Make sure new filter is there and in the right amounts. */ 1100 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1101 return -EINVAL; 1102 1103 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1104 if (!fp) 1105 return -ENOMEM; 1106 1107 memcpy(fp->insns, fprog->filter, fsize); 1108 1109 fp->len = fprog->len; 1110 /* Since unattached filters are not copied back to user 1111 * space through sk_get_filter(), we do not need to hold 1112 * a copy here, and can spare us the work. 1113 */ 1114 fp->orig_prog = NULL; 1115 1116 /* bpf_prepare_filter() already takes care of freeing 1117 * memory in case something goes wrong. 1118 */ 1119 fp = bpf_prepare_filter(fp, NULL); 1120 if (IS_ERR(fp)) 1121 return PTR_ERR(fp); 1122 1123 *pfp = fp; 1124 return 0; 1125 } 1126 EXPORT_SYMBOL_GPL(bpf_prog_create); 1127 1128 /** 1129 * bpf_prog_create_from_user - create an unattached filter from user buffer 1130 * @pfp: the unattached filter that is created 1131 * @fprog: the filter program 1132 * @trans: post-classic verifier transformation handler 1133 * @save_orig: save classic BPF program 1134 * 1135 * This function effectively does the same as bpf_prog_create(), only 1136 * that it builds up its insns buffer from user space provided buffer. 1137 * It also allows for passing a bpf_aux_classic_check_t handler. 1138 */ 1139 int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, 1140 bpf_aux_classic_check_t trans, bool save_orig) 1141 { 1142 unsigned int fsize = bpf_classic_proglen(fprog); 1143 struct bpf_prog *fp; 1144 int err; 1145 1146 /* Make sure new filter is there and in the right amounts. 
*/ 1147 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1148 return -EINVAL; 1149 1150 fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1151 if (!fp) 1152 return -ENOMEM; 1153 1154 if (copy_from_user(fp->insns, fprog->filter, fsize)) { 1155 __bpf_prog_free(fp); 1156 return -EFAULT; 1157 } 1158 1159 fp->len = fprog->len; 1160 fp->orig_prog = NULL; 1161 1162 if (save_orig) { 1163 err = bpf_prog_store_orig_filter(fp, fprog); 1164 if (err) { 1165 __bpf_prog_free(fp); 1166 return -ENOMEM; 1167 } 1168 } 1169 1170 /* bpf_prepare_filter() already takes care of freeing 1171 * memory in case something goes wrong. 1172 */ 1173 fp = bpf_prepare_filter(fp, trans); 1174 if (IS_ERR(fp)) 1175 return PTR_ERR(fp); 1176 1177 *pfp = fp; 1178 return 0; 1179 } 1180 EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); 1181 1182 void bpf_prog_destroy(struct bpf_prog *fp) 1183 { 1184 __bpf_prog_release(fp); 1185 } 1186 EXPORT_SYMBOL_GPL(bpf_prog_destroy); 1187 1188 static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) 1189 { 1190 struct sk_filter *fp, *old_fp; 1191 1192 fp = kmalloc(sizeof(*fp), GFP_KERNEL); 1193 if (!fp) 1194 return -ENOMEM; 1195 1196 fp->prog = prog; 1197 1198 if (!__sk_filter_charge(sk, fp)) { 1199 kfree(fp); 1200 return -ENOMEM; 1201 } 1202 refcount_set(&fp->refcnt, 1); 1203 1204 old_fp = rcu_dereference_protected(sk->sk_filter, 1205 lockdep_sock_is_held(sk)); 1206 rcu_assign_pointer(sk->sk_filter, fp); 1207 1208 if (old_fp) 1209 sk_filter_uncharge(sk, old_fp); 1210 1211 return 0; 1212 } 1213 1214 static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk) 1215 { 1216 struct bpf_prog *old_prog; 1217 int err; 1218 1219 if (bpf_prog_size(prog->len) > sysctl_optmem_max) 1220 return -ENOMEM; 1221 1222 if (sk_unhashed(sk) && sk->sk_reuseport) { 1223 err = reuseport_alloc(sk); 1224 if (err) 1225 return err; 1226 } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) { 1227 /* The socket wasn't bound with SO_REUSEPORT */ 1228 return -EINVAL; 1229 } 1230 1231 old_prog = reuseport_attach_prog(sk, prog); 1232 if (old_prog) 1233 bpf_prog_destroy(old_prog); 1234 1235 return 0; 1236 } 1237 1238 static 1239 struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) 1240 { 1241 unsigned int fsize = bpf_classic_proglen(fprog); 1242 struct bpf_prog *prog; 1243 int err; 1244 1245 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1246 return ERR_PTR(-EPERM); 1247 1248 /* Make sure new filter is there and in the right amounts. */ 1249 if (!bpf_check_basics_ok(fprog->filter, fprog->len)) 1250 return ERR_PTR(-EINVAL); 1251 1252 prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); 1253 if (!prog) 1254 return ERR_PTR(-ENOMEM); 1255 1256 if (copy_from_user(prog->insns, fprog->filter, fsize)) { 1257 __bpf_prog_free(prog); 1258 return ERR_PTR(-EFAULT); 1259 } 1260 1261 prog->len = fprog->len; 1262 1263 err = bpf_prog_store_orig_filter(prog, fprog); 1264 if (err) { 1265 __bpf_prog_free(prog); 1266 return ERR_PTR(-ENOMEM); 1267 } 1268 1269 /* bpf_prepare_filter() already takes care of freeing 1270 * memory in case something goes wrong. 1271 */ 1272 return bpf_prepare_filter(prog, NULL); 1273 } 1274 1275 /** 1276 * sk_attach_filter - attach a socket filter 1277 * @fprog: the filter program 1278 * @sk: the socket to use 1279 * 1280 * Attach the user's filter code. We first run some sanity checks on 1281 * it to make sure it does not explode on us later. If an error 1282 * occurs or there is insufficient memory for the filter a negative 1283 * errno code is returned. 
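 *
 * For context only, a minimal and purely illustrative user space attach of
 * an "accept everything" classic filter reaches this function roughly as:
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 0xffffffff),
 *	};
 *	struct sock_fprog bpf = {
 *		.len	= 1,
 *		.filter	= code,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
 *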
On success the return is zero. 1284 */ 1285 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1286 { 1287 struct bpf_prog *prog = __get_filter(fprog, sk); 1288 int err; 1289 1290 if (IS_ERR(prog)) 1291 return PTR_ERR(prog); 1292 1293 err = __sk_attach_prog(prog, sk); 1294 if (err < 0) { 1295 __bpf_prog_release(prog); 1296 return err; 1297 } 1298 1299 return 0; 1300 } 1301 EXPORT_SYMBOL_GPL(sk_attach_filter); 1302 1303 int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) 1304 { 1305 struct bpf_prog *prog = __get_filter(fprog, sk); 1306 int err; 1307 1308 if (IS_ERR(prog)) 1309 return PTR_ERR(prog); 1310 1311 err = __reuseport_attach_prog(prog, sk); 1312 if (err < 0) { 1313 __bpf_prog_release(prog); 1314 return err; 1315 } 1316 1317 return 0; 1318 } 1319 1320 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) 1321 { 1322 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 1323 return ERR_PTR(-EPERM); 1324 1325 return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); 1326 } 1327 1328 int sk_attach_bpf(u32 ufd, struct sock *sk) 1329 { 1330 struct bpf_prog *prog = __get_bpf(ufd, sk); 1331 int err; 1332 1333 if (IS_ERR(prog)) 1334 return PTR_ERR(prog); 1335 1336 err = __sk_attach_prog(prog, sk); 1337 if (err < 0) { 1338 bpf_prog_put(prog); 1339 return err; 1340 } 1341 1342 return 0; 1343 } 1344 1345 int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk) 1346 { 1347 struct bpf_prog *prog = __get_bpf(ufd, sk); 1348 int err; 1349 1350 if (IS_ERR(prog)) 1351 return PTR_ERR(prog); 1352 1353 err = __reuseport_attach_prog(prog, sk); 1354 if (err < 0) { 1355 bpf_prog_put(prog); 1356 return err; 1357 } 1358 1359 return 0; 1360 } 1361 1362 struct bpf_scratchpad { 1363 union { 1364 __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; 1365 u8 buff[MAX_BPF_STACK]; 1366 }; 1367 }; 1368 1369 static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); 1370 1371 static inline int __bpf_try_make_writable(struct sk_buff *skb, 1372 unsigned int write_len) 1373 { 1374 return skb_ensure_writable(skb, write_len); 1375 } 1376 1377 static inline int bpf_try_make_writable(struct sk_buff *skb, 1378 unsigned int write_len) 1379 { 1380 int err = __bpf_try_make_writable(skb, write_len); 1381 1382 bpf_compute_data_end(skb); 1383 return err; 1384 } 1385 1386 static int bpf_try_make_head_writable(struct sk_buff *skb) 1387 { 1388 return bpf_try_make_writable(skb, skb_headlen(skb)); 1389 } 1390 1391 static inline void bpf_push_mac_rcsum(struct sk_buff *skb) 1392 { 1393 if (skb_at_tc_ingress(skb)) 1394 skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1395 } 1396 1397 static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) 1398 { 1399 if (skb_at_tc_ingress(skb)) 1400 skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); 1401 } 1402 1403 BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, 1404 const void *, from, u32, len, u64, flags) 1405 { 1406 void *ptr; 1407 1408 if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) 1409 return -EINVAL; 1410 if (unlikely(offset > 0xffff)) 1411 return -EFAULT; 1412 if (unlikely(bpf_try_make_writable(skb, offset + len))) 1413 return -EFAULT; 1414 1415 ptr = skb->data + offset; 1416 if (flags & BPF_F_RECOMPUTE_CSUM) 1417 __skb_postpull_rcsum(skb, ptr, len, offset); 1418 1419 memcpy(ptr, from, len); 1420 1421 if (flags & BPF_F_RECOMPUTE_CSUM) 1422 __skb_postpush_rcsum(skb, ptr, len, offset); 1423 if (flags & BPF_F_INVALIDATE_HASH) 1424 skb_clear_hash(skb); 1425 1426 return 0; 1427 } 1428 1429 static const struct bpf_func_proto 
bpf_skb_store_bytes_proto = { 1430 .func = bpf_skb_store_bytes, 1431 .gpl_only = false, 1432 .ret_type = RET_INTEGER, 1433 .arg1_type = ARG_PTR_TO_CTX, 1434 .arg2_type = ARG_ANYTHING, 1435 .arg3_type = ARG_PTR_TO_MEM, 1436 .arg4_type = ARG_CONST_SIZE, 1437 .arg5_type = ARG_ANYTHING, 1438 }; 1439 1440 BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, 1441 void *, to, u32, len) 1442 { 1443 void *ptr; 1444 1445 if (unlikely(offset > 0xffff)) 1446 goto err_clear; 1447 1448 ptr = skb_header_pointer(skb, offset, len, to); 1449 if (unlikely(!ptr)) 1450 goto err_clear; 1451 if (ptr != to) 1452 memcpy(to, ptr, len); 1453 1454 return 0; 1455 err_clear: 1456 memset(to, 0, len); 1457 return -EFAULT; 1458 } 1459 1460 static const struct bpf_func_proto bpf_skb_load_bytes_proto = { 1461 .func = bpf_skb_load_bytes, 1462 .gpl_only = false, 1463 .ret_type = RET_INTEGER, 1464 .arg1_type = ARG_PTR_TO_CTX, 1465 .arg2_type = ARG_ANYTHING, 1466 .arg3_type = ARG_PTR_TO_UNINIT_MEM, 1467 .arg4_type = ARG_CONST_SIZE, 1468 }; 1469 1470 BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) 1471 { 1472 /* Idea is the following: should the needed direct read/write 1473 * test fail during runtime, we can pull in more data and redo 1474 * again, since implicitly, we invalidate previous checks here. 1475 * 1476 * Or, since we know how much we need to make read/writeable, 1477 * this can be done once at the program beginning for direct 1478 * access case. By this we overcome limitations of only current 1479 * headroom being accessible. 1480 */ 1481 return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); 1482 } 1483 1484 static const struct bpf_func_proto bpf_skb_pull_data_proto = { 1485 .func = bpf_skb_pull_data, 1486 .gpl_only = false, 1487 .ret_type = RET_INTEGER, 1488 .arg1_type = ARG_PTR_TO_CTX, 1489 .arg2_type = ARG_ANYTHING, 1490 }; 1491 1492 BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, 1493 u64, from, u64, to, u64, flags) 1494 { 1495 __sum16 *ptr; 1496 1497 if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) 1498 return -EINVAL; 1499 if (unlikely(offset > 0xffff || offset & 1)) 1500 return -EFAULT; 1501 if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) 1502 return -EFAULT; 1503 1504 ptr = (__sum16 *)(skb->data + offset); 1505 switch (flags & BPF_F_HDR_FIELD_MASK) { 1506 case 0: 1507 if (unlikely(from != 0)) 1508 return -EINVAL; 1509 1510 csum_replace_by_diff(ptr, to); 1511 break; 1512 case 2: 1513 csum_replace2(ptr, from, to); 1514 break; 1515 case 4: 1516 csum_replace4(ptr, from, to); 1517 break; 1518 default: 1519 return -EINVAL; 1520 } 1521 1522 return 0; 1523 } 1524 1525 static const struct bpf_func_proto bpf_l3_csum_replace_proto = { 1526 .func = bpf_l3_csum_replace, 1527 .gpl_only = false, 1528 .ret_type = RET_INTEGER, 1529 .arg1_type = ARG_PTR_TO_CTX, 1530 .arg2_type = ARG_ANYTHING, 1531 .arg3_type = ARG_ANYTHING, 1532 .arg4_type = ARG_ANYTHING, 1533 .arg5_type = ARG_ANYTHING, 1534 }; 1535 1536 BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, 1537 u64, from, u64, to, u64, flags) 1538 { 1539 bool is_pseudo = flags & BPF_F_PSEUDO_HDR; 1540 bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; 1541 bool do_mforce = flags & BPF_F_MARK_ENFORCE; 1542 __sum16 *ptr; 1543 1544 if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | 1545 BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) 1546 return -EINVAL; 1547 if (unlikely(offset > 0xffff || offset & 1)) 1548 return -EFAULT; 1549 if (unlikely(bpf_try_make_writable(skb, offset + 
sizeof(*ptr)))) 1550 return -EFAULT; 1551 1552 ptr = (__sum16 *)(skb->data + offset); 1553 if (is_mmzero && !do_mforce && !*ptr) 1554 return 0; 1555 1556 switch (flags & BPF_F_HDR_FIELD_MASK) { 1557 case 0: 1558 if (unlikely(from != 0)) 1559 return -EINVAL; 1560 1561 inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); 1562 break; 1563 case 2: 1564 inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); 1565 break; 1566 case 4: 1567 inet_proto_csum_replace4(ptr, skb, from, to, is_pseudo); 1568 break; 1569 default: 1570 return -EINVAL; 1571 } 1572 1573 if (is_mmzero && !*ptr) 1574 *ptr = CSUM_MANGLED_0; 1575 return 0; 1576 } 1577 1578 static const struct bpf_func_proto bpf_l4_csum_replace_proto = { 1579 .func = bpf_l4_csum_replace, 1580 .gpl_only = false, 1581 .ret_type = RET_INTEGER, 1582 .arg1_type = ARG_PTR_TO_CTX, 1583 .arg2_type = ARG_ANYTHING, 1584 .arg3_type = ARG_ANYTHING, 1585 .arg4_type = ARG_ANYTHING, 1586 .arg5_type = ARG_ANYTHING, 1587 }; 1588 1589 BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, 1590 __be32 *, to, u32, to_size, __wsum, seed) 1591 { 1592 struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); 1593 u32 diff_size = from_size + to_size; 1594 int i, j = 0; 1595 1596 /* This is quite flexible, some examples: 1597 * 1598 * from_size == 0, to_size > 0, seed := csum --> pushing data 1599 * from_size > 0, to_size == 0, seed := csum --> pulling data 1600 * from_size > 0, to_size > 0, seed := 0 --> diffing data 1601 * 1602 * Even for diffing, from_size and to_size don't need to be equal. 1603 */ 1604 if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || 1605 diff_size > sizeof(sp->diff))) 1606 return -EINVAL; 1607 1608 for (i = 0; i < from_size / sizeof(__be32); i++, j++) 1609 sp->diff[j] = ~from[i]; 1610 for (i = 0; i < to_size / sizeof(__be32); i++, j++) 1611 sp->diff[j] = to[i]; 1612 1613 return csum_partial(sp->diff, diff_size, seed); 1614 } 1615 1616 static const struct bpf_func_proto bpf_csum_diff_proto = { 1617 .func = bpf_csum_diff, 1618 .gpl_only = false, 1619 .pkt_access = true, 1620 .ret_type = RET_INTEGER, 1621 .arg1_type = ARG_PTR_TO_MEM, 1622 .arg2_type = ARG_CONST_SIZE_OR_ZERO, 1623 .arg3_type = ARG_PTR_TO_MEM, 1624 .arg4_type = ARG_CONST_SIZE_OR_ZERO, 1625 .arg5_type = ARG_ANYTHING, 1626 }; 1627 1628 BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) 1629 { 1630 /* The interface is to be used in combination with bpf_csum_diff() 1631 * for direct packet writes. csum rotation for alignment as well 1632 * as emulating csum_sub() can be done from the eBPF program. 
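	 *
	 * A rough usage sketch from the program side (variable names here are
	 * illustrative, not part of any API): after a direct write changed
	 * old_word to new_word, skb->csum can be kept in sync for
	 * CHECKSUM_COMPLETE devices with:
	 *
	 *	diff = bpf_csum_diff(&old_word, 4, &new_word, 4, 0);
	 *	bpf_csum_update(skb, diff);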
1633 */ 1634 if (skb->ip_summed == CHECKSUM_COMPLETE) 1635 return (skb->csum = csum_add(skb->csum, csum)); 1636 1637 return -ENOTSUPP; 1638 } 1639 1640 static const struct bpf_func_proto bpf_csum_update_proto = { 1641 .func = bpf_csum_update, 1642 .gpl_only = false, 1643 .ret_type = RET_INTEGER, 1644 .arg1_type = ARG_PTR_TO_CTX, 1645 .arg2_type = ARG_ANYTHING, 1646 }; 1647 1648 static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) 1649 { 1650 return dev_forward_skb(dev, skb); 1651 } 1652 1653 static inline int __bpf_rx_skb_no_mac(struct net_device *dev, 1654 struct sk_buff *skb) 1655 { 1656 int ret = ____dev_forward_skb(dev, skb); 1657 1658 if (likely(!ret)) { 1659 skb->dev = dev; 1660 ret = netif_rx(skb); 1661 } 1662 1663 return ret; 1664 } 1665 1666 static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) 1667 { 1668 int ret; 1669 1670 if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { 1671 net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); 1672 kfree_skb(skb); 1673 return -ENETDOWN; 1674 } 1675 1676 skb->dev = dev; 1677 1678 __this_cpu_inc(xmit_recursion); 1679 ret = dev_queue_xmit(skb); 1680 __this_cpu_dec(xmit_recursion); 1681 1682 return ret; 1683 } 1684 1685 static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, 1686 u32 flags) 1687 { 1688 /* skb->mac_len is not set on normal egress */ 1689 unsigned int mlen = skb->network_header - skb->mac_header; 1690 1691 __skb_pull(skb, mlen); 1692 1693 /* At ingress, the mac header has already been pulled once. 1694 * At egress, skb_pospull_rcsum has to be done in case that 1695 * the skb is originated from ingress (i.e. a forwarded skb) 1696 * to ensure that rcsum starts at net header. 1697 */ 1698 if (!skb_at_tc_ingress(skb)) 1699 skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); 1700 skb_pop_mac_header(skb); 1701 skb_reset_mac_len(skb); 1702 return flags & BPF_F_INGRESS ? 1703 __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); 1704 } 1705 1706 static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, 1707 u32 flags) 1708 { 1709 /* Verify that a link layer header is carried */ 1710 if (unlikely(skb->mac_header >= skb->network_header)) { 1711 kfree_skb(skb); 1712 return -ERANGE; 1713 } 1714 1715 bpf_push_mac_rcsum(skb); 1716 return flags & BPF_F_INGRESS ? 1717 __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); 1718 } 1719 1720 static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, 1721 u32 flags) 1722 { 1723 if (dev_is_mac_header_xmit(dev)) 1724 return __bpf_redirect_common(skb, dev, flags); 1725 else 1726 return __bpf_redirect_no_mac(skb, dev, flags); 1727 } 1728 1729 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) 1730 { 1731 struct net_device *dev; 1732 struct sk_buff *clone; 1733 int ret; 1734 1735 if (unlikely(flags & ~(BPF_F_INGRESS))) 1736 return -EINVAL; 1737 1738 dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); 1739 if (unlikely(!dev)) 1740 return -EINVAL; 1741 1742 clone = skb_clone(skb, GFP_ATOMIC); 1743 if (unlikely(!clone)) 1744 return -ENOMEM; 1745 1746 /* For direct write, we need to keep the invariant that the skbs 1747 * we're dealing with need to be uncloned. Should uncloning fail 1748 * here, we need to free the just generated clone to unclone once 1749 * again. 
1750 */ 1751 ret = bpf_try_make_head_writable(skb); 1752 if (unlikely(ret)) { 1753 kfree_skb(clone); 1754 return -ENOMEM; 1755 } 1756 1757 return __bpf_redirect(clone, dev, flags); 1758 } 1759 1760 static const struct bpf_func_proto bpf_clone_redirect_proto = { 1761 .func = bpf_clone_redirect, 1762 .gpl_only = false, 1763 .ret_type = RET_INTEGER, 1764 .arg1_type = ARG_PTR_TO_CTX, 1765 .arg2_type = ARG_ANYTHING, 1766 .arg3_type = ARG_ANYTHING, 1767 }; 1768 1769 struct redirect_info { 1770 u32 ifindex; 1771 u32 flags; 1772 }; 1773 1774 static DEFINE_PER_CPU(struct redirect_info, redirect_info); 1775 1776 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) 1777 { 1778 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1779 1780 if (unlikely(flags & ~(BPF_F_INGRESS))) 1781 return TC_ACT_SHOT; 1782 1783 ri->ifindex = ifindex; 1784 ri->flags = flags; 1785 1786 return TC_ACT_REDIRECT; 1787 } 1788 1789 int skb_do_redirect(struct sk_buff *skb) 1790 { 1791 struct redirect_info *ri = this_cpu_ptr(&redirect_info); 1792 struct net_device *dev; 1793 1794 dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); 1795 ri->ifindex = 0; 1796 if (unlikely(!dev)) { 1797 kfree_skb(skb); 1798 return -EINVAL; 1799 } 1800 1801 return __bpf_redirect(skb, dev, ri->flags); 1802 } 1803 1804 static const struct bpf_func_proto bpf_redirect_proto = { 1805 .func = bpf_redirect, 1806 .gpl_only = false, 1807 .ret_type = RET_INTEGER, 1808 .arg1_type = ARG_ANYTHING, 1809 .arg2_type = ARG_ANYTHING, 1810 }; 1811 1812 BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) 1813 { 1814 return task_get_classid(skb); 1815 } 1816 1817 static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { 1818 .func = bpf_get_cgroup_classid, 1819 .gpl_only = false, 1820 .ret_type = RET_INTEGER, 1821 .arg1_type = ARG_PTR_TO_CTX, 1822 }; 1823 1824 BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb) 1825 { 1826 return dst_tclassid(skb); 1827 } 1828 1829 static const struct bpf_func_proto bpf_get_route_realm_proto = { 1830 .func = bpf_get_route_realm, 1831 .gpl_only = false, 1832 .ret_type = RET_INTEGER, 1833 .arg1_type = ARG_PTR_TO_CTX, 1834 }; 1835 1836 BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) 1837 { 1838 /* If skb_clear_hash() was called due to mangling, we can 1839 * trigger SW recalculation here. Later access to hash 1840 * can then use the inline skb->hash via context directly 1841 * instead of calling this helper again. 1842 */ 1843 return skb_get_hash(skb); 1844 } 1845 1846 static const struct bpf_func_proto bpf_get_hash_recalc_proto = { 1847 .func = bpf_get_hash_recalc, 1848 .gpl_only = false, 1849 .ret_type = RET_INTEGER, 1850 .arg1_type = ARG_PTR_TO_CTX, 1851 }; 1852 1853 BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) 1854 { 1855 /* After all direct packet write, this can be used once for 1856 * triggering a lazy recalc on next skb_get_hash() invocation. 
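	 *
	 * Illustrative program-side sequence (arguments simplified):
	 *
	 *	bpf_skb_store_bytes(skb, off, &val, sizeof(val), 0);
	 *	bpf_set_hash_invalid(skb);
	 *	...
	 *	hash = bpf_get_hash_recalc(skb);
	 *
	 * so the stale flow hash is dropped right away and only recomputed
	 * when it is actually needed again.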
1857 */ 1858 skb_clear_hash(skb); 1859 return 0; 1860 } 1861 1862 static const struct bpf_func_proto bpf_set_hash_invalid_proto = { 1863 .func = bpf_set_hash_invalid, 1864 .gpl_only = false, 1865 .ret_type = RET_INTEGER, 1866 .arg1_type = ARG_PTR_TO_CTX, 1867 }; 1868 1869 BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, 1870 u16, vlan_tci) 1871 { 1872 int ret; 1873 1874 if (unlikely(vlan_proto != htons(ETH_P_8021Q) && 1875 vlan_proto != htons(ETH_P_8021AD))) 1876 vlan_proto = htons(ETH_P_8021Q); 1877 1878 bpf_push_mac_rcsum(skb); 1879 ret = skb_vlan_push(skb, vlan_proto, vlan_tci); 1880 bpf_pull_mac_rcsum(skb); 1881 1882 bpf_compute_data_end(skb); 1883 return ret; 1884 } 1885 1886 const struct bpf_func_proto bpf_skb_vlan_push_proto = { 1887 .func = bpf_skb_vlan_push, 1888 .gpl_only = false, 1889 .ret_type = RET_INTEGER, 1890 .arg1_type = ARG_PTR_TO_CTX, 1891 .arg2_type = ARG_ANYTHING, 1892 .arg3_type = ARG_ANYTHING, 1893 }; 1894 EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); 1895 1896 BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) 1897 { 1898 int ret; 1899 1900 bpf_push_mac_rcsum(skb); 1901 ret = skb_vlan_pop(skb); 1902 bpf_pull_mac_rcsum(skb); 1903 1904 bpf_compute_data_end(skb); 1905 return ret; 1906 } 1907 1908 const struct bpf_func_proto bpf_skb_vlan_pop_proto = { 1909 .func = bpf_skb_vlan_pop, 1910 .gpl_only = false, 1911 .ret_type = RET_INTEGER, 1912 .arg1_type = ARG_PTR_TO_CTX, 1913 }; 1914 EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); 1915 1916 static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) 1917 { 1918 /* Caller already did skb_cow() with len as headroom, 1919 * so no need to do it here. 1920 */ 1921 skb_push(skb, len); 1922 memmove(skb->data, skb->data + len, off); 1923 memset(skb->data + off, 0, len); 1924 1925 /* No skb_postpush_rcsum(skb, skb->data + off, len) 1926 * needed here as it does not change the skb->csum 1927 * result for checksum complete when summing over 1928 * zeroed blocks. 1929 */ 1930 return 0; 1931 } 1932 1933 static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) 1934 { 1935 /* skb_ensure_writable() is not needed here, as we're 1936 * already working on an uncloned skb. 1937 */ 1938 if (unlikely(!pskb_may_pull(skb, off + len))) 1939 return -ENOMEM; 1940 1941 skb_postpull_rcsum(skb, skb->data + off, len); 1942 memmove(skb->data + len, skb->data, off); 1943 __skb_pull(skb, len); 1944 1945 return 0; 1946 } 1947 1948 static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len) 1949 { 1950 bool trans_same = skb->transport_header == skb->network_header; 1951 int ret; 1952 1953 /* There's no need for __skb_push()/__skb_pull() pair to 1954 * get to the start of the mac header as we're guaranteed 1955 * to always start from here under eBPF. 1956 */ 1957 ret = bpf_skb_generic_push(skb, off, len); 1958 if (likely(!ret)) { 1959 skb->mac_header -= len; 1960 skb->network_header -= len; 1961 if (trans_same) 1962 skb->transport_header = skb->network_header; 1963 } 1964 1965 return ret; 1966 } 1967 1968 static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) 1969 { 1970 bool trans_same = skb->transport_header == skb->network_header; 1971 int ret; 1972 1973 /* Same here, __skb_push()/__skb_pull() pair not needed. 
*/ 1974 ret = bpf_skb_generic_pop(skb, off, len); 1975 if (likely(!ret)) { 1976 skb->mac_header += len; 1977 skb->network_header += len; 1978 if (trans_same) 1979 skb->transport_header = skb->network_header; 1980 } 1981 1982 return ret; 1983 } 1984 1985 static int bpf_skb_proto_4_to_6(struct sk_buff *skb) 1986 { 1987 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 1988 u32 off = skb->network_header - skb->mac_header; 1989 int ret; 1990 1991 ret = skb_cow(skb, len_diff); 1992 if (unlikely(ret < 0)) 1993 return ret; 1994 1995 ret = bpf_skb_net_hdr_push(skb, off, len_diff); 1996 if (unlikely(ret < 0)) 1997 return ret; 1998 1999 if (skb_is_gso(skb)) { 2000 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV4 needs to 2001 * be changed into SKB_GSO_TCPV6. 2002 */ 2003 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 2004 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV4; 2005 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; 2006 } 2007 2008 /* Due to IPv6 header, MSS needs to be downgraded. */ 2009 skb_shinfo(skb)->gso_size -= len_diff; 2010 /* Header must be checked, and gso_segs recomputed. */ 2011 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2012 skb_shinfo(skb)->gso_segs = 0; 2013 } 2014 2015 skb->protocol = htons(ETH_P_IPV6); 2016 skb_clear_hash(skb); 2017 2018 return 0; 2019 } 2020 2021 static int bpf_skb_proto_6_to_4(struct sk_buff *skb) 2022 { 2023 const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); 2024 u32 off = skb->network_header - skb->mac_header; 2025 int ret; 2026 2027 ret = skb_unclone(skb, GFP_ATOMIC); 2028 if (unlikely(ret < 0)) 2029 return ret; 2030 2031 ret = bpf_skb_net_hdr_pop(skb, off, len_diff); 2032 if (unlikely(ret < 0)) 2033 return ret; 2034 2035 if (skb_is_gso(skb)) { 2036 /* SKB_GSO_UDP stays as is. SKB_GSO_TCPV6 needs to 2037 * be changed into SKB_GSO_TCPV4. 2038 */ 2039 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) { 2040 skb_shinfo(skb)->gso_type &= ~SKB_GSO_TCPV6; 2041 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; 2042 } 2043 2044 /* Due to IPv4 header, MSS can be upgraded. */ 2045 skb_shinfo(skb)->gso_size += len_diff; 2046 /* Header must be checked, and gso_segs recomputed. */ 2047 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; 2048 skb_shinfo(skb)->gso_segs = 0; 2049 } 2050 2051 skb->protocol = htons(ETH_P_IP); 2052 skb_clear_hash(skb); 2053 2054 return 0; 2055 } 2056 2057 static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto) 2058 { 2059 __be16 from_proto = skb->protocol; 2060 2061 if (from_proto == htons(ETH_P_IP) && 2062 to_proto == htons(ETH_P_IPV6)) 2063 return bpf_skb_proto_4_to_6(skb); 2064 2065 if (from_proto == htons(ETH_P_IPV6) && 2066 to_proto == htons(ETH_P_IP)) 2067 return bpf_skb_proto_6_to_4(skb); 2068 2069 return -ENOTSUPP; 2070 } 2071 2072 BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto, 2073 u64, flags) 2074 { 2075 int ret; 2076 2077 if (unlikely(flags)) 2078 return -EINVAL; 2079 2080 /* General idea is that this helper does the basic groundwork 2081 * needed for changing the protocol, and eBPF program fills the 2082 * rest through bpf_skb_store_bytes(), bpf_lX_csum_replace() 2083 * and other helpers, rather than passing a raw buffer here. 2084 * 2085 * The rationale is to keep this minimal and without a need to 2086 * deal with raw packet data. F.e. even if we would pass buffers 2087 * here, the program still needs to call the bpf_lX_csum_replace() 2088 * helpers anyway. Plus, this way we keep also separation of 2089 * concerns, since f.e. 
bpf_skb_store_bytes() should only take 2090 * care of stores. 2091 * 2092 * Currently, additional options and extension header space are 2093 * not supported, but flags register is reserved so we can adapt 2094 * that. For offloads, we mark packet as dodgy, so that headers 2095 * need to be verified first. 2096 */ 2097 ret = bpf_skb_proto_xlat(skb, proto); 2098 bpf_compute_data_end(skb); 2099 return ret; 2100 } 2101 2102 static const struct bpf_func_proto bpf_skb_change_proto_proto = { 2103 .func = bpf_skb_change_proto, 2104 .gpl_only = false, 2105 .ret_type = RET_INTEGER, 2106 .arg1_type = ARG_PTR_TO_CTX, 2107 .arg2_type = ARG_ANYTHING, 2108 .arg3_type = ARG_ANYTHING, 2109 }; 2110 2111 BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) 2112 { 2113 /* We only allow a restricted subset to be changed for now. */ 2114 if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || 2115 !skb_pkt_type_ok(pkt_type))) 2116 return -EINVAL; 2117 2118 skb->pkt_type = pkt_type; 2119 return 0; 2120 } 2121 2122 static const struct bpf_func_proto bpf_skb_change_type_proto = { 2123 .func = bpf_skb_change_type, 2124 .gpl_only = false, 2125 .ret_type = RET_INTEGER, 2126 .arg1_type = ARG_PTR_TO_CTX, 2127 .arg2_type = ARG_ANYTHING, 2128 }; 2129 2130 static u32 __bpf_skb_min_len(const struct sk_buff *skb) 2131 { 2132 u32 min_len = skb_network_offset(skb); 2133 2134 if (skb_transport_header_was_set(skb)) 2135 min_len = skb_transport_offset(skb); 2136 if (skb->ip_summed == CHECKSUM_PARTIAL) 2137 min_len = skb_checksum_start_offset(skb) + 2138 skb->csum_offset + sizeof(__sum16); 2139 return min_len; 2140 } 2141 2142 static u32 __bpf_skb_max_len(const struct sk_buff *skb) 2143 { 2144 return skb->dev->mtu + skb->dev->hard_header_len; 2145 } 2146 2147 static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) 2148 { 2149 unsigned int old_len = skb->len; 2150 int ret; 2151 2152 ret = __skb_grow_rcsum(skb, new_len); 2153 if (!ret) 2154 memset(skb->data + old_len, 0, new_len - old_len); 2155 return ret; 2156 } 2157 2158 static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) 2159 { 2160 return __skb_trim_rcsum(skb, new_len); 2161 } 2162 2163 BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, 2164 u64, flags) 2165 { 2166 u32 max_len = __bpf_skb_max_len(skb); 2167 u32 min_len = __bpf_skb_min_len(skb); 2168 int ret; 2169 2170 if (unlikely(flags || new_len > max_len || new_len < min_len)) 2171 return -EINVAL; 2172 if (skb->encapsulation) 2173 return -ENOTSUPP; 2174 2175 /* The basic idea of this helper is that it's performing the 2176 * needed work to either grow or trim an skb, and eBPF program 2177 * rewrites the rest via helpers like bpf_skb_store_bytes(), 2178 * bpf_lX_csum_replace() and others rather than passing a raw 2179 * buffer here. This one is a slow path helper and intended 2180 * for replies with control messages. 2181 * 2182 * Like in bpf_skb_change_proto(), we want to keep this rather 2183 * minimal and without protocol specifics so that we are able 2184 * to separate concerns as in bpf_skb_store_bytes() should only 2185 * be the one responsible for writing buffers. 2186 * 2187 * It's really expected to be a slow path operation here for 2188 * control message replies, so we're implicitly linearizing, 2189 * uncloning and drop offloads from the skb by this. 
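 *
 * Minimal usage sketch from the program side (hypothetical tc program;
 * reply_len is chosen by the program and not part of this file):
 *
 *	if (bpf_skb_change_tail(skb, reply_len, 0))
 *		return TC_ACT_SHOT;
 *	// then rewrite the payload via bpf_skb_store_bytes() and fix
 *	// checksums via bpf_l3_csum_replace()/bpf_l4_csum_replace()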
2190 */ 2191 ret = __bpf_try_make_writable(skb, skb->len); 2192 if (!ret) { 2193 if (new_len > skb->len) 2194 ret = bpf_skb_grow_rcsum(skb, new_len); 2195 else if (new_len < skb->len) 2196 ret = bpf_skb_trim_rcsum(skb, new_len); 2197 if (!ret && skb_is_gso(skb)) 2198 skb_gso_reset(skb); 2199 } 2200 2201 bpf_compute_data_end(skb); 2202 return ret; 2203 } 2204 2205 static const struct bpf_func_proto bpf_skb_change_tail_proto = { 2206 .func = bpf_skb_change_tail, 2207 .gpl_only = false, 2208 .ret_type = RET_INTEGER, 2209 .arg1_type = ARG_PTR_TO_CTX, 2210 .arg2_type = ARG_ANYTHING, 2211 .arg3_type = ARG_ANYTHING, 2212 }; 2213 2214 BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, 2215 u64, flags) 2216 { 2217 u32 max_len = __bpf_skb_max_len(skb); 2218 u32 new_len = skb->len + head_room; 2219 int ret; 2220 2221 if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || 2222 new_len < skb->len)) 2223 return -EINVAL; 2224 2225 ret = skb_cow(skb, head_room); 2226 if (likely(!ret)) { 2227 /* Idea for this helper is that we currently only 2228 * allow to expand on mac header. This means that 2229 * skb->protocol network header, etc, stay as is. 2230 * Compared to bpf_skb_change_tail(), we're more 2231 * flexible due to not needing to linearize or 2232 * reset GSO. Intention for this helper is to be 2233 * used by an L3 skb that needs to push mac header 2234 * for redirection into L2 device. 2235 */ 2236 __skb_push(skb, head_room); 2237 memset(skb->data, 0, head_room); 2238 skb_reset_mac_header(skb); 2239 } 2240 2241 bpf_compute_data_end(skb); 2242 return 0; 2243 } 2244 2245 static const struct bpf_func_proto bpf_skb_change_head_proto = { 2246 .func = bpf_skb_change_head, 2247 .gpl_only = false, 2248 .ret_type = RET_INTEGER, 2249 .arg1_type = ARG_PTR_TO_CTX, 2250 .arg2_type = ARG_ANYTHING, 2251 .arg3_type = ARG_ANYTHING, 2252 }; 2253 2254 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset) 2255 { 2256 void *data = xdp->data + offset; 2257 2258 if (unlikely(data < xdp->data_hard_start || 2259 data > xdp->data_end - ETH_HLEN)) 2260 return -EINVAL; 2261 2262 xdp->data = data; 2263 2264 return 0; 2265 } 2266 2267 static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { 2268 .func = bpf_xdp_adjust_head, 2269 .gpl_only = false, 2270 .ret_type = RET_INTEGER, 2271 .arg1_type = ARG_PTR_TO_CTX, 2272 .arg2_type = ARG_ANYTHING, 2273 }; 2274 2275 bool bpf_helper_changes_pkt_data(void *func) 2276 { 2277 if (func == bpf_skb_vlan_push || 2278 func == bpf_skb_vlan_pop || 2279 func == bpf_skb_store_bytes || 2280 func == bpf_skb_change_proto || 2281 func == bpf_skb_change_head || 2282 func == bpf_skb_change_tail || 2283 func == bpf_skb_pull_data || 2284 func == bpf_clone_redirect || 2285 func == bpf_l3_csum_replace || 2286 func == bpf_l4_csum_replace || 2287 func == bpf_xdp_adjust_head) 2288 return true; 2289 2290 return false; 2291 } 2292 2293 static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, 2294 unsigned long off, unsigned long len) 2295 { 2296 void *ptr = skb_header_pointer(skb, off, len, dst_buff); 2297 2298 if (unlikely(!ptr)) 2299 return len; 2300 if (ptr != dst_buff) 2301 memcpy(dst_buff, ptr, len); 2302 2303 return 0; 2304 } 2305 2306 BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, 2307 u64, flags, void *, meta, u64, meta_size) 2308 { 2309 u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 2310 2311 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 2312 return -EINVAL; 2313 if (unlikely(skb_size > 
skb->len)) 2314 return -EFAULT; 2315 2316 return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, 2317 bpf_skb_copy); 2318 } 2319 2320 static const struct bpf_func_proto bpf_skb_event_output_proto = { 2321 .func = bpf_skb_event_output, 2322 .gpl_only = true, 2323 .ret_type = RET_INTEGER, 2324 .arg1_type = ARG_PTR_TO_CTX, 2325 .arg2_type = ARG_CONST_MAP_PTR, 2326 .arg3_type = ARG_ANYTHING, 2327 .arg4_type = ARG_PTR_TO_MEM, 2328 .arg5_type = ARG_CONST_SIZE, 2329 }; 2330 2331 static unsigned short bpf_tunnel_key_af(u64 flags) 2332 { 2333 return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET; 2334 } 2335 2336 BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to, 2337 u32, size, u64, flags) 2338 { 2339 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 2340 u8 compat[sizeof(struct bpf_tunnel_key)]; 2341 void *to_orig = to; 2342 int err; 2343 2344 if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { 2345 err = -EINVAL; 2346 goto err_clear; 2347 } 2348 if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { 2349 err = -EPROTO; 2350 goto err_clear; 2351 } 2352 if (unlikely(size != sizeof(struct bpf_tunnel_key))) { 2353 err = -EINVAL; 2354 switch (size) { 2355 case offsetof(struct bpf_tunnel_key, tunnel_label): 2356 case offsetof(struct bpf_tunnel_key, tunnel_ext): 2357 goto set_compat; 2358 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): 2359 /* Fixup deprecated structure layouts here, so we have 2360 * a common path later on. 2361 */ 2362 if (ip_tunnel_info_af(info) != AF_INET) 2363 goto err_clear; 2364 set_compat: 2365 to = (struct bpf_tunnel_key *)compat; 2366 break; 2367 default: 2368 goto err_clear; 2369 } 2370 } 2371 2372 to->tunnel_id = be64_to_cpu(info->key.tun_id); 2373 to->tunnel_tos = info->key.tos; 2374 to->tunnel_ttl = info->key.ttl; 2375 2376 if (flags & BPF_F_TUNINFO_IPV6) { 2377 memcpy(to->remote_ipv6, &info->key.u.ipv6.src, 2378 sizeof(to->remote_ipv6)); 2379 to->tunnel_label = be32_to_cpu(info->key.label); 2380 } else { 2381 to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); 2382 } 2383 2384 if (unlikely(size != sizeof(struct bpf_tunnel_key))) 2385 memcpy(to_orig, to, size); 2386 2387 return 0; 2388 err_clear: 2389 memset(to_orig, 0, size); 2390 return err; 2391 } 2392 2393 static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { 2394 .func = bpf_skb_get_tunnel_key, 2395 .gpl_only = false, 2396 .ret_type = RET_INTEGER, 2397 .arg1_type = ARG_PTR_TO_CTX, 2398 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 2399 .arg3_type = ARG_CONST_SIZE, 2400 .arg4_type = ARG_ANYTHING, 2401 }; 2402 2403 BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size) 2404 { 2405 const struct ip_tunnel_info *info = skb_tunnel_info(skb); 2406 int err; 2407 2408 if (unlikely(!info || 2409 !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { 2410 err = -ENOENT; 2411 goto err_clear; 2412 } 2413 if (unlikely(size < info->options_len)) { 2414 err = -ENOMEM; 2415 goto err_clear; 2416 } 2417 2418 ip_tunnel_info_opts_get(to, info); 2419 if (size > info->options_len) 2420 memset(to + info->options_len, 0, size - info->options_len); 2421 2422 return info->options_len; 2423 err_clear: 2424 memset(to, 0, size); 2425 return err; 2426 } 2427 2428 static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { 2429 .func = bpf_skb_get_tunnel_opt, 2430 .gpl_only = false, 2431 .ret_type = RET_INTEGER, 2432 .arg1_type = ARG_PTR_TO_CTX, 2433 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 2434 .arg3_type = ARG_CONST_SIZE, 2435 }; 2436 2437 
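/* Rough usage sketch for the tunnel key helpers in this file (hypothetical
 * tc program attached to a tunnel device in metadata collection mode;
 * identifiers such as tunnel_ifindex are made up for illustration):
 *
 *	struct bpf_tunnel_key key = {};
 *
 *	// ingress: read the outer tunnel metadata
 *	if (bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0) < 0)
 *		return TC_ACT_SHOT;
 *	if (key.tunnel_id == 42)
 *		...
 *
 *	// egress: populate metadata before redirecting into the device
 *	__builtin_memset(&key, 0, sizeof(key));
 *	key.tunnel_id   = 42;
 *	key.remote_ipv4 = 0xac100164;	// host byte order, the helper converts
 *	key.tunnel_ttl  = 64;
 *	if (bpf_skb_set_tunnel_key(skb, &key, sizeof(key), BPF_F_ZERO_CSUM_TX))
 *		return TC_ACT_SHOT;
 *	return bpf_redirect(tunnel_ifindex, 0);
 */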
static struct metadata_dst __percpu *md_dst; 2438 2439 BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, 2440 const struct bpf_tunnel_key *, from, u32, size, u64, flags) 2441 { 2442 struct metadata_dst *md = this_cpu_ptr(md_dst); 2443 u8 compat[sizeof(struct bpf_tunnel_key)]; 2444 struct ip_tunnel_info *info; 2445 2446 if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX | 2447 BPF_F_DONT_FRAGMENT))) 2448 return -EINVAL; 2449 if (unlikely(size != sizeof(struct bpf_tunnel_key))) { 2450 switch (size) { 2451 case offsetof(struct bpf_tunnel_key, tunnel_label): 2452 case offsetof(struct bpf_tunnel_key, tunnel_ext): 2453 case offsetof(struct bpf_tunnel_key, remote_ipv6[1]): 2454 /* Fixup deprecated structure layouts here, so we have 2455 * a common path later on. 2456 */ 2457 memcpy(compat, from, size); 2458 memset(compat + size, 0, sizeof(compat) - size); 2459 from = (const struct bpf_tunnel_key *) compat; 2460 break; 2461 default: 2462 return -EINVAL; 2463 } 2464 } 2465 if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) || 2466 from->tunnel_ext)) 2467 return -EINVAL; 2468 2469 skb_dst_drop(skb); 2470 dst_hold((struct dst_entry *) md); 2471 skb_dst_set(skb, (struct dst_entry *) md); 2472 2473 info = &md->u.tun_info; 2474 info->mode = IP_TUNNEL_INFO_TX; 2475 2476 info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; 2477 if (flags & BPF_F_DONT_FRAGMENT) 2478 info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; 2479 2480 info->key.tun_id = cpu_to_be64(from->tunnel_id); 2481 info->key.tos = from->tunnel_tos; 2482 info->key.ttl = from->tunnel_ttl; 2483 2484 if (flags & BPF_F_TUNINFO_IPV6) { 2485 info->mode |= IP_TUNNEL_INFO_IPV6; 2486 memcpy(&info->key.u.ipv6.dst, from->remote_ipv6, 2487 sizeof(from->remote_ipv6)); 2488 info->key.label = cpu_to_be32(from->tunnel_label) & 2489 IPV6_FLOWLABEL_MASK; 2490 } else { 2491 info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); 2492 if (flags & BPF_F_ZERO_CSUM_TX) 2493 info->key.tun_flags &= ~TUNNEL_CSUM; 2494 } 2495 2496 return 0; 2497 } 2498 2499 static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { 2500 .func = bpf_skb_set_tunnel_key, 2501 .gpl_only = false, 2502 .ret_type = RET_INTEGER, 2503 .arg1_type = ARG_PTR_TO_CTX, 2504 .arg2_type = ARG_PTR_TO_MEM, 2505 .arg3_type = ARG_CONST_SIZE, 2506 .arg4_type = ARG_ANYTHING, 2507 }; 2508 2509 BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb, 2510 const u8 *, from, u32, size) 2511 { 2512 struct ip_tunnel_info *info = skb_tunnel_info(skb); 2513 const struct metadata_dst *md = this_cpu_ptr(md_dst); 2514 2515 if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1)))) 2516 return -EINVAL; 2517 if (unlikely(size > IP_TUNNEL_OPTS_MAX)) 2518 return -ENOMEM; 2519 2520 ip_tunnel_info_opts_set(info, from, size); 2521 2522 return 0; 2523 } 2524 2525 static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = { 2526 .func = bpf_skb_set_tunnel_opt, 2527 .gpl_only = false, 2528 .ret_type = RET_INTEGER, 2529 .arg1_type = ARG_PTR_TO_CTX, 2530 .arg2_type = ARG_PTR_TO_MEM, 2531 .arg3_type = ARG_CONST_SIZE, 2532 }; 2533 2534 static const struct bpf_func_proto * 2535 bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) 2536 { 2537 if (!md_dst) { 2538 /* Race is not possible, since it's called from verifier 2539 * that is holding verifier mutex. 
2540 */ 2541 md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, 2542 GFP_KERNEL); 2543 if (!md_dst) 2544 return NULL; 2545 } 2546 2547 switch (which) { 2548 case BPF_FUNC_skb_set_tunnel_key: 2549 return &bpf_skb_set_tunnel_key_proto; 2550 case BPF_FUNC_skb_set_tunnel_opt: 2551 return &bpf_skb_set_tunnel_opt_proto; 2552 default: 2553 return NULL; 2554 } 2555 } 2556 2557 BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map, 2558 u32, idx) 2559 { 2560 struct bpf_array *array = container_of(map, struct bpf_array, map); 2561 struct cgroup *cgrp; 2562 struct sock *sk; 2563 2564 sk = skb_to_full_sk(skb); 2565 if (!sk || !sk_fullsock(sk)) 2566 return -ENOENT; 2567 if (unlikely(idx >= array->map.max_entries)) 2568 return -E2BIG; 2569 2570 cgrp = READ_ONCE(array->ptrs[idx]); 2571 if (unlikely(!cgrp)) 2572 return -EAGAIN; 2573 2574 return sk_under_cgroup_hierarchy(sk, cgrp); 2575 } 2576 2577 static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { 2578 .func = bpf_skb_under_cgroup, 2579 .gpl_only = false, 2580 .ret_type = RET_INTEGER, 2581 .arg1_type = ARG_PTR_TO_CTX, 2582 .arg2_type = ARG_CONST_MAP_PTR, 2583 .arg3_type = ARG_ANYTHING, 2584 }; 2585 2586 static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, 2587 unsigned long off, unsigned long len) 2588 { 2589 memcpy(dst_buff, src_buff + off, len); 2590 return 0; 2591 } 2592 2593 BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map, 2594 u64, flags, void *, meta, u64, meta_size) 2595 { 2596 u64 xdp_size = (flags & BPF_F_CTXLEN_MASK) >> 32; 2597 2598 if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) 2599 return -EINVAL; 2600 if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data))) 2601 return -EFAULT; 2602 2603 return bpf_event_output(map, flags, meta, meta_size, xdp->data, 2604 xdp_size, bpf_xdp_copy); 2605 } 2606 2607 static const struct bpf_func_proto bpf_xdp_event_output_proto = { 2608 .func = bpf_xdp_event_output, 2609 .gpl_only = true, 2610 .ret_type = RET_INTEGER, 2611 .arg1_type = ARG_PTR_TO_CTX, 2612 .arg2_type = ARG_CONST_MAP_PTR, 2613 .arg3_type = ARG_ANYTHING, 2614 .arg4_type = ARG_PTR_TO_MEM, 2615 .arg5_type = ARG_CONST_SIZE, 2616 }; 2617 2618 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) 2619 { 2620 return skb->sk ? 
sock_gen_cookie(skb->sk) : 0; 2621 } 2622 2623 static const struct bpf_func_proto bpf_get_socket_cookie_proto = { 2624 .func = bpf_get_socket_cookie, 2625 .gpl_only = false, 2626 .ret_type = RET_INTEGER, 2627 .arg1_type = ARG_PTR_TO_CTX, 2628 }; 2629 2630 BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) 2631 { 2632 struct sock *sk = sk_to_full_sk(skb->sk); 2633 kuid_t kuid; 2634 2635 if (!sk || !sk_fullsock(sk)) 2636 return overflowuid; 2637 kuid = sock_net_uid(sock_net(sk), sk); 2638 return from_kuid_munged(sock_net(sk)->user_ns, kuid); 2639 } 2640 2641 static const struct bpf_func_proto bpf_get_socket_uid_proto = { 2642 .func = bpf_get_socket_uid, 2643 .gpl_only = false, 2644 .ret_type = RET_INTEGER, 2645 .arg1_type = ARG_PTR_TO_CTX, 2646 }; 2647 2648 static const struct bpf_func_proto * 2649 bpf_base_func_proto(enum bpf_func_id func_id) 2650 { 2651 switch (func_id) { 2652 case BPF_FUNC_map_lookup_elem: 2653 return &bpf_map_lookup_elem_proto; 2654 case BPF_FUNC_map_update_elem: 2655 return &bpf_map_update_elem_proto; 2656 case BPF_FUNC_map_delete_elem: 2657 return &bpf_map_delete_elem_proto; 2658 case BPF_FUNC_get_prandom_u32: 2659 return &bpf_get_prandom_u32_proto; 2660 case BPF_FUNC_get_smp_processor_id: 2661 return &bpf_get_raw_smp_processor_id_proto; 2662 case BPF_FUNC_get_numa_node_id: 2663 return &bpf_get_numa_node_id_proto; 2664 case BPF_FUNC_tail_call: 2665 return &bpf_tail_call_proto; 2666 case BPF_FUNC_ktime_get_ns: 2667 return &bpf_ktime_get_ns_proto; 2668 case BPF_FUNC_trace_printk: 2669 if (capable(CAP_SYS_ADMIN)) 2670 return bpf_get_trace_printk_proto(); 2671 default: 2672 return NULL; 2673 } 2674 } 2675 2676 static const struct bpf_func_proto * 2677 sk_filter_func_proto(enum bpf_func_id func_id) 2678 { 2679 switch (func_id) { 2680 case BPF_FUNC_skb_load_bytes: 2681 return &bpf_skb_load_bytes_proto; 2682 case BPF_FUNC_get_socket_cookie: 2683 return &bpf_get_socket_cookie_proto; 2684 case BPF_FUNC_get_socket_uid: 2685 return &bpf_get_socket_uid_proto; 2686 default: 2687 return bpf_base_func_proto(func_id); 2688 } 2689 } 2690 2691 static const struct bpf_func_proto * 2692 tc_cls_act_func_proto(enum bpf_func_id func_id) 2693 { 2694 switch (func_id) { 2695 case BPF_FUNC_skb_store_bytes: 2696 return &bpf_skb_store_bytes_proto; 2697 case BPF_FUNC_skb_load_bytes: 2698 return &bpf_skb_load_bytes_proto; 2699 case BPF_FUNC_skb_pull_data: 2700 return &bpf_skb_pull_data_proto; 2701 case BPF_FUNC_csum_diff: 2702 return &bpf_csum_diff_proto; 2703 case BPF_FUNC_csum_update: 2704 return &bpf_csum_update_proto; 2705 case BPF_FUNC_l3_csum_replace: 2706 return &bpf_l3_csum_replace_proto; 2707 case BPF_FUNC_l4_csum_replace: 2708 return &bpf_l4_csum_replace_proto; 2709 case BPF_FUNC_clone_redirect: 2710 return &bpf_clone_redirect_proto; 2711 case BPF_FUNC_get_cgroup_classid: 2712 return &bpf_get_cgroup_classid_proto; 2713 case BPF_FUNC_skb_vlan_push: 2714 return &bpf_skb_vlan_push_proto; 2715 case BPF_FUNC_skb_vlan_pop: 2716 return &bpf_skb_vlan_pop_proto; 2717 case BPF_FUNC_skb_change_proto: 2718 return &bpf_skb_change_proto_proto; 2719 case BPF_FUNC_skb_change_type: 2720 return &bpf_skb_change_type_proto; 2721 case BPF_FUNC_skb_change_tail: 2722 return &bpf_skb_change_tail_proto; 2723 case BPF_FUNC_skb_get_tunnel_key: 2724 return &bpf_skb_get_tunnel_key_proto; 2725 case BPF_FUNC_skb_set_tunnel_key: 2726 return bpf_get_skb_set_tunnel_proto(func_id); 2727 case BPF_FUNC_skb_get_tunnel_opt: 2728 return &bpf_skb_get_tunnel_opt_proto; 2729 case BPF_FUNC_skb_set_tunnel_opt: 2730 return 
bpf_get_skb_set_tunnel_proto(func_id); 2731 case BPF_FUNC_redirect: 2732 return &bpf_redirect_proto; 2733 case BPF_FUNC_get_route_realm: 2734 return &bpf_get_route_realm_proto; 2735 case BPF_FUNC_get_hash_recalc: 2736 return &bpf_get_hash_recalc_proto; 2737 case BPF_FUNC_set_hash_invalid: 2738 return &bpf_set_hash_invalid_proto; 2739 case BPF_FUNC_perf_event_output: 2740 return &bpf_skb_event_output_proto; 2741 case BPF_FUNC_get_smp_processor_id: 2742 return &bpf_get_smp_processor_id_proto; 2743 case BPF_FUNC_skb_under_cgroup: 2744 return &bpf_skb_under_cgroup_proto; 2745 case BPF_FUNC_get_socket_cookie: 2746 return &bpf_get_socket_cookie_proto; 2747 case BPF_FUNC_get_socket_uid: 2748 return &bpf_get_socket_uid_proto; 2749 default: 2750 return bpf_base_func_proto(func_id); 2751 } 2752 } 2753 2754 static const struct bpf_func_proto * 2755 xdp_func_proto(enum bpf_func_id func_id) 2756 { 2757 switch (func_id) { 2758 case BPF_FUNC_perf_event_output: 2759 return &bpf_xdp_event_output_proto; 2760 case BPF_FUNC_get_smp_processor_id: 2761 return &bpf_get_smp_processor_id_proto; 2762 case BPF_FUNC_xdp_adjust_head: 2763 return &bpf_xdp_adjust_head_proto; 2764 default: 2765 return bpf_base_func_proto(func_id); 2766 } 2767 } 2768 2769 static const struct bpf_func_proto * 2770 cg_skb_func_proto(enum bpf_func_id func_id) 2771 { 2772 return sk_filter_func_proto(func_id); 2773 } 2774 2775 static const struct bpf_func_proto * 2776 lwt_inout_func_proto(enum bpf_func_id func_id) 2777 { 2778 switch (func_id) { 2779 case BPF_FUNC_skb_load_bytes: 2780 return &bpf_skb_load_bytes_proto; 2781 case BPF_FUNC_skb_pull_data: 2782 return &bpf_skb_pull_data_proto; 2783 case BPF_FUNC_csum_diff: 2784 return &bpf_csum_diff_proto; 2785 case BPF_FUNC_get_cgroup_classid: 2786 return &bpf_get_cgroup_classid_proto; 2787 case BPF_FUNC_get_route_realm: 2788 return &bpf_get_route_realm_proto; 2789 case BPF_FUNC_get_hash_recalc: 2790 return &bpf_get_hash_recalc_proto; 2791 case BPF_FUNC_perf_event_output: 2792 return &bpf_skb_event_output_proto; 2793 case BPF_FUNC_get_smp_processor_id: 2794 return &bpf_get_smp_processor_id_proto; 2795 case BPF_FUNC_skb_under_cgroup: 2796 return &bpf_skb_under_cgroup_proto; 2797 default: 2798 return bpf_base_func_proto(func_id); 2799 } 2800 } 2801 2802 static const struct bpf_func_proto * 2803 lwt_xmit_func_proto(enum bpf_func_id func_id) 2804 { 2805 switch (func_id) { 2806 case BPF_FUNC_skb_get_tunnel_key: 2807 return &bpf_skb_get_tunnel_key_proto; 2808 case BPF_FUNC_skb_set_tunnel_key: 2809 return bpf_get_skb_set_tunnel_proto(func_id); 2810 case BPF_FUNC_skb_get_tunnel_opt: 2811 return &bpf_skb_get_tunnel_opt_proto; 2812 case BPF_FUNC_skb_set_tunnel_opt: 2813 return bpf_get_skb_set_tunnel_proto(func_id); 2814 case BPF_FUNC_redirect: 2815 return &bpf_redirect_proto; 2816 case BPF_FUNC_clone_redirect: 2817 return &bpf_clone_redirect_proto; 2818 case BPF_FUNC_skb_change_tail: 2819 return &bpf_skb_change_tail_proto; 2820 case BPF_FUNC_skb_change_head: 2821 return &bpf_skb_change_head_proto; 2822 case BPF_FUNC_skb_store_bytes: 2823 return &bpf_skb_store_bytes_proto; 2824 case BPF_FUNC_csum_update: 2825 return &bpf_csum_update_proto; 2826 case BPF_FUNC_l3_csum_replace: 2827 return &bpf_l3_csum_replace_proto; 2828 case BPF_FUNC_l4_csum_replace: 2829 return &bpf_l4_csum_replace_proto; 2830 case BPF_FUNC_set_hash_invalid: 2831 return &bpf_set_hash_invalid_proto; 2832 default: 2833 return lwt_inout_func_proto(func_id); 2834 } 2835 } 2836 2837 static bool __is_valid_access(int off, int size) 2838 { 2839 if 
(off < 0 || off >= sizeof(struct __sk_buff)) 2840 return false; 2841 2842 /* The verifier guarantees that size > 0. */ 2843 if (off % size != 0) 2844 return false; 2845 2846 switch (off) { 2847 case offsetof(struct __sk_buff, cb[0]) ... 2848 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: 2849 if (off + size > 2850 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) 2851 return false; 2852 break; 2853 default: 2854 if (size != sizeof(__u32)) 2855 return false; 2856 } 2857 2858 return true; 2859 } 2860 2861 static bool sk_filter_is_valid_access(int off, int size, 2862 enum bpf_access_type type, 2863 enum bpf_reg_type *reg_type) 2864 { 2865 switch (off) { 2866 case offsetof(struct __sk_buff, tc_classid): 2867 case offsetof(struct __sk_buff, data): 2868 case offsetof(struct __sk_buff, data_end): 2869 return false; 2870 } 2871 2872 if (type == BPF_WRITE) { 2873 switch (off) { 2874 case offsetof(struct __sk_buff, cb[0]) ... 2875 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: 2876 break; 2877 default: 2878 return false; 2879 } 2880 } 2881 2882 return __is_valid_access(off, size); 2883 } 2884 2885 static bool lwt_is_valid_access(int off, int size, 2886 enum bpf_access_type type, 2887 enum bpf_reg_type *reg_type) 2888 { 2889 switch (off) { 2890 case offsetof(struct __sk_buff, tc_classid): 2891 return false; 2892 } 2893 2894 if (type == BPF_WRITE) { 2895 switch (off) { 2896 case offsetof(struct __sk_buff, mark): 2897 case offsetof(struct __sk_buff, priority): 2898 case offsetof(struct __sk_buff, cb[0]) ... 2899 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: 2900 break; 2901 default: 2902 return false; 2903 } 2904 } 2905 2906 switch (off) { 2907 case offsetof(struct __sk_buff, data): 2908 *reg_type = PTR_TO_PACKET; 2909 break; 2910 case offsetof(struct __sk_buff, data_end): 2911 *reg_type = PTR_TO_PACKET_END; 2912 break; 2913 } 2914 2915 return __is_valid_access(off, size); 2916 } 2917 2918 static bool sock_filter_is_valid_access(int off, int size, 2919 enum bpf_access_type type, 2920 enum bpf_reg_type *reg_type) 2921 { 2922 if (type == BPF_WRITE) { 2923 switch (off) { 2924 case offsetof(struct bpf_sock, bound_dev_if): 2925 break; 2926 default: 2927 return false; 2928 } 2929 } 2930 2931 if (off < 0 || off + size > sizeof(struct bpf_sock)) 2932 return false; 2933 /* The verifier guarantees that size > 0. */ 2934 if (off % size != 0) 2935 return false; 2936 if (size != sizeof(__u32)) 2937 return false; 2938 2939 return true; 2940 } 2941 2942 static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, 2943 const struct bpf_prog *prog) 2944 { 2945 struct bpf_insn *insn = insn_buf; 2946 2947 if (!direct_write) 2948 return 0; 2949 2950 /* if (!skb->cloned) 2951 * goto start; 2952 * 2953 * (Fast-path, otherwise approximation that we might be 2954 * a clone, do the rest in helper.) 
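 *
 * Roughly equivalent C for the instruction sequence emitted below
 * (a sketch for orientation only):
 *
 *	if (cloned bit set in skb) {
 *		if (bpf_skb_pull_data(skb, 0))
 *			return TC_ACT_SHOT;
 *	}
 *	// fall through to the program's first instruction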
2955 */ 2956 *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); 2957 *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); 2958 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); 2959 2960 /* ret = bpf_skb_pull_data(skb, 0); */ 2961 *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); 2962 *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); 2963 *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 2964 BPF_FUNC_skb_pull_data); 2965 /* if (!ret) 2966 * goto restore; 2967 * return TC_ACT_SHOT; 2968 */ 2969 *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); 2970 *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, TC_ACT_SHOT); 2971 *insn++ = BPF_EXIT_INSN(); 2972 2973 /* restore: */ 2974 *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); 2975 /* start: */ 2976 *insn++ = prog->insnsi[0]; 2977 2978 return insn - insn_buf; 2979 } 2980 2981 static bool tc_cls_act_is_valid_access(int off, int size, 2982 enum bpf_access_type type, 2983 enum bpf_reg_type *reg_type) 2984 { 2985 if (type == BPF_WRITE) { 2986 switch (off) { 2987 case offsetof(struct __sk_buff, mark): 2988 case offsetof(struct __sk_buff, tc_index): 2989 case offsetof(struct __sk_buff, priority): 2990 case offsetof(struct __sk_buff, cb[0]) ... 2991 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: 2992 case offsetof(struct __sk_buff, tc_classid): 2993 break; 2994 default: 2995 return false; 2996 } 2997 } 2998 2999 switch (off) { 3000 case offsetof(struct __sk_buff, data): 3001 *reg_type = PTR_TO_PACKET; 3002 break; 3003 case offsetof(struct __sk_buff, data_end): 3004 *reg_type = PTR_TO_PACKET_END; 3005 break; 3006 } 3007 3008 return __is_valid_access(off, size); 3009 } 3010 3011 static bool __is_valid_xdp_access(int off, int size) 3012 { 3013 if (off < 0 || off >= sizeof(struct xdp_md)) 3014 return false; 3015 if (off % size != 0) 3016 return false; 3017 if (size != sizeof(__u32)) 3018 return false; 3019 3020 return true; 3021 } 3022 3023 static bool xdp_is_valid_access(int off, int size, 3024 enum bpf_access_type type, 3025 enum bpf_reg_type *reg_type) 3026 { 3027 if (type == BPF_WRITE) 3028 return false; 3029 3030 switch (off) { 3031 case offsetof(struct xdp_md, data): 3032 *reg_type = PTR_TO_PACKET; 3033 break; 3034 case offsetof(struct xdp_md, data_end): 3035 *reg_type = PTR_TO_PACKET_END; 3036 break; 3037 } 3038 3039 return __is_valid_xdp_access(off, size); 3040 } 3041 3042 void bpf_warn_invalid_xdp_action(u32 act) 3043 { 3044 WARN_ONCE(1, "Illegal XDP return value %u, expect packet loss\n", act); 3045 } 3046 EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); 3047 3048 static u32 bpf_convert_ctx_access(enum bpf_access_type type, 3049 const struct bpf_insn *si, 3050 struct bpf_insn *insn_buf, 3051 struct bpf_prog *prog) 3052 { 3053 struct bpf_insn *insn = insn_buf; 3054 int off; 3055 3056 switch (si->off) { 3057 case offsetof(struct __sk_buff, len): 3058 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); 3059 3060 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3061 offsetof(struct sk_buff, len)); 3062 break; 3063 3064 case offsetof(struct __sk_buff, protocol): 3065 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); 3066 3067 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 3068 offsetof(struct sk_buff, protocol)); 3069 break; 3070 3071 case offsetof(struct __sk_buff, vlan_proto): 3072 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); 3073 3074 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 3075 offsetof(struct sk_buff, vlan_proto)); 3076 break; 3077 3078 case offsetof(struct __sk_buff, 
priority): 3079 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); 3080 3081 if (type == BPF_WRITE) 3082 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, 3083 offsetof(struct sk_buff, priority)); 3084 else 3085 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3086 offsetof(struct sk_buff, priority)); 3087 break; 3088 3089 case offsetof(struct __sk_buff, ingress_ifindex): 3090 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); 3091 3092 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3093 offsetof(struct sk_buff, skb_iif)); 3094 break; 3095 3096 case offsetof(struct __sk_buff, ifindex): 3097 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 3098 3099 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 3100 si->dst_reg, si->src_reg, 3101 offsetof(struct sk_buff, dev)); 3102 *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); 3103 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 3104 offsetof(struct net_device, ifindex)); 3105 break; 3106 3107 case offsetof(struct __sk_buff, hash): 3108 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); 3109 3110 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3111 offsetof(struct sk_buff, hash)); 3112 break; 3113 3114 case offsetof(struct __sk_buff, mark): 3115 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); 3116 3117 if (type == BPF_WRITE) 3118 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, 3119 offsetof(struct sk_buff, mark)); 3120 else 3121 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3122 offsetof(struct sk_buff, mark)); 3123 break; 3124 3125 case offsetof(struct __sk_buff, pkt_type): 3126 return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, 3127 si->src_reg, insn); 3128 3129 case offsetof(struct __sk_buff, queue_mapping): 3130 return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, 3131 si->src_reg, insn); 3132 3133 case offsetof(struct __sk_buff, vlan_present): 3134 return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, 3135 si->dst_reg, si->src_reg, insn); 3136 3137 case offsetof(struct __sk_buff, vlan_tci): 3138 return convert_skb_access(SKF_AD_VLAN_TAG, 3139 si->dst_reg, si->src_reg, insn); 3140 3141 case offsetof(struct __sk_buff, cb[0]) ... 
3142 offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: 3143 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); 3144 BUILD_BUG_ON((offsetof(struct sk_buff, cb) + 3145 offsetof(struct qdisc_skb_cb, data)) % 3146 sizeof(__u64)); 3147 3148 prog->cb_access = 1; 3149 off = si->off; 3150 off -= offsetof(struct __sk_buff, cb[0]); 3151 off += offsetof(struct sk_buff, cb); 3152 off += offsetof(struct qdisc_skb_cb, data); 3153 if (type == BPF_WRITE) 3154 *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, 3155 si->src_reg, off); 3156 else 3157 *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, 3158 si->src_reg, off); 3159 break; 3160 3161 case offsetof(struct __sk_buff, tc_classid): 3162 BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); 3163 3164 off = si->off; 3165 off -= offsetof(struct __sk_buff, tc_classid); 3166 off += offsetof(struct sk_buff, cb); 3167 off += offsetof(struct qdisc_skb_cb, tc_classid); 3168 if (type == BPF_WRITE) 3169 *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, 3170 si->src_reg, off); 3171 else 3172 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, 3173 si->src_reg, off); 3174 break; 3175 3176 case offsetof(struct __sk_buff, data): 3177 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), 3178 si->dst_reg, si->src_reg, 3179 offsetof(struct sk_buff, data)); 3180 break; 3181 3182 case offsetof(struct __sk_buff, data_end): 3183 off = si->off; 3184 off -= offsetof(struct __sk_buff, data_end); 3185 off += offsetof(struct sk_buff, cb); 3186 off += offsetof(struct bpf_skb_data_end, data_end); 3187 *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, 3188 si->src_reg, off); 3189 break; 3190 3191 case offsetof(struct __sk_buff, tc_index): 3192 #ifdef CONFIG_NET_SCHED 3193 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); 3194 3195 if (type == BPF_WRITE) 3196 *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, 3197 offsetof(struct sk_buff, tc_index)); 3198 else 3199 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 3200 offsetof(struct sk_buff, tc_index)); 3201 #else 3202 if (type == BPF_WRITE) 3203 *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); 3204 else 3205 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); 3206 #endif 3207 break; 3208 3209 case offsetof(struct __sk_buff, napi_id): 3210 #if defined(CONFIG_NET_RX_BUSY_POLL) 3211 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4); 3212 3213 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3214 offsetof(struct sk_buff, napi_id)); 3215 *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); 3216 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); 3217 #else 3218 *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); 3219 #endif 3220 break; 3221 } 3222 3223 return insn - insn_buf; 3224 } 3225 3226 static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, 3227 const struct bpf_insn *si, 3228 struct bpf_insn *insn_buf, 3229 struct bpf_prog *prog) 3230 { 3231 struct bpf_insn *insn = insn_buf; 3232 3233 switch (si->off) { 3234 case offsetof(struct bpf_sock, bound_dev_if): 3235 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4); 3236 3237 if (type == BPF_WRITE) 3238 *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, 3239 offsetof(struct sock, sk_bound_dev_if)); 3240 else 3241 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3242 offsetof(struct sock, sk_bound_dev_if)); 3243 break; 3244 3245 case offsetof(struct bpf_sock, family): 3246 BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); 3247 3248 *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, 3249 
offsetof(struct sock, sk_family)); 3250 break; 3251 3252 case offsetof(struct bpf_sock, type): 3253 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3254 offsetof(struct sock, __sk_flags_offset)); 3255 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); 3256 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); 3257 break; 3258 3259 case offsetof(struct bpf_sock, protocol): 3260 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, 3261 offsetof(struct sock, __sk_flags_offset)); 3262 *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); 3263 *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); 3264 break; 3265 } 3266 3267 return insn - insn_buf; 3268 } 3269 3270 static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, 3271 const struct bpf_insn *si, 3272 struct bpf_insn *insn_buf, 3273 struct bpf_prog *prog) 3274 { 3275 struct bpf_insn *insn = insn_buf; 3276 3277 switch (si->off) { 3278 case offsetof(struct __sk_buff, ifindex): 3279 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); 3280 3281 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), 3282 si->dst_reg, si->src_reg, 3283 offsetof(struct sk_buff, dev)); 3284 *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, 3285 offsetof(struct net_device, ifindex)); 3286 break; 3287 default: 3288 return bpf_convert_ctx_access(type, si, insn_buf, prog); 3289 } 3290 3291 return insn - insn_buf; 3292 } 3293 3294 static u32 xdp_convert_ctx_access(enum bpf_access_type type, 3295 const struct bpf_insn *si, 3296 struct bpf_insn *insn_buf, 3297 struct bpf_prog *prog) 3298 { 3299 struct bpf_insn *insn = insn_buf; 3300 3301 switch (si->off) { 3302 case offsetof(struct xdp_md, data): 3303 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data), 3304 si->dst_reg, si->src_reg, 3305 offsetof(struct xdp_buff, data)); 3306 break; 3307 case offsetof(struct xdp_md, data_end): 3308 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end), 3309 si->dst_reg, si->src_reg, 3310 offsetof(struct xdp_buff, data_end)); 3311 break; 3312 } 3313 3314 return insn - insn_buf; 3315 } 3316 3317 const struct bpf_verifier_ops sk_filter_prog_ops = { 3318 .get_func_proto = sk_filter_func_proto, 3319 .is_valid_access = sk_filter_is_valid_access, 3320 .convert_ctx_access = bpf_convert_ctx_access, 3321 }; 3322 3323 const struct bpf_verifier_ops tc_cls_act_prog_ops = { 3324 .get_func_proto = tc_cls_act_func_proto, 3325 .is_valid_access = tc_cls_act_is_valid_access, 3326 .convert_ctx_access = tc_cls_act_convert_ctx_access, 3327 .gen_prologue = tc_cls_act_prologue, 3328 .test_run = bpf_prog_test_run_skb, 3329 }; 3330 3331 const struct bpf_verifier_ops xdp_prog_ops = { 3332 .get_func_proto = xdp_func_proto, 3333 .is_valid_access = xdp_is_valid_access, 3334 .convert_ctx_access = xdp_convert_ctx_access, 3335 .test_run = bpf_prog_test_run_xdp, 3336 }; 3337 3338 const struct bpf_verifier_ops cg_skb_prog_ops = { 3339 .get_func_proto = cg_skb_func_proto, 3340 .is_valid_access = sk_filter_is_valid_access, 3341 .convert_ctx_access = bpf_convert_ctx_access, 3342 .test_run = bpf_prog_test_run_skb, 3343 }; 3344 3345 const struct bpf_verifier_ops lwt_inout_prog_ops = { 3346 .get_func_proto = lwt_inout_func_proto, 3347 .is_valid_access = lwt_is_valid_access, 3348 .convert_ctx_access = bpf_convert_ctx_access, 3349 .test_run = bpf_prog_test_run_skb, 3350 }; 3351 3352 const struct bpf_verifier_ops lwt_xmit_prog_ops = { 3353 .get_func_proto = lwt_xmit_func_proto, 3354 .is_valid_access = lwt_is_valid_access, 
3355 .convert_ctx_access = bpf_convert_ctx_access, 3356 .gen_prologue = tc_cls_act_prologue, 3357 .test_run = bpf_prog_test_run_skb, 3358 }; 3359 3360 const struct bpf_verifier_ops cg_sock_prog_ops = { 3361 .get_func_proto = bpf_base_func_proto, 3362 .is_valid_access = sock_filter_is_valid_access, 3363 .convert_ctx_access = sock_filter_convert_ctx_access, 3364 }; 3365 3366 int sk_detach_filter(struct sock *sk) 3367 { 3368 int ret = -ENOENT; 3369 struct sk_filter *filter; 3370 3371 if (sock_flag(sk, SOCK_FILTER_LOCKED)) 3372 return -EPERM; 3373 3374 filter = rcu_dereference_protected(sk->sk_filter, 3375 lockdep_sock_is_held(sk)); 3376 if (filter) { 3377 RCU_INIT_POINTER(sk->sk_filter, NULL); 3378 sk_filter_uncharge(sk, filter); 3379 ret = 0; 3380 } 3381 3382 return ret; 3383 } 3384 EXPORT_SYMBOL_GPL(sk_detach_filter); 3385 3386 int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, 3387 unsigned int len) 3388 { 3389 struct sock_fprog_kern *fprog; 3390 struct sk_filter *filter; 3391 int ret = 0; 3392 3393 lock_sock(sk); 3394 filter = rcu_dereference_protected(sk->sk_filter, 3395 lockdep_sock_is_held(sk)); 3396 if (!filter) 3397 goto out; 3398 3399 /* We're copying the filter that has been originally attached, 3400 * so no conversion/decode needed anymore. eBPF programs that 3401 * have no original program cannot be dumped through this. 3402 */ 3403 ret = -EACCES; 3404 fprog = filter->prog->orig_prog; 3405 if (!fprog) 3406 goto out; 3407 3408 ret = fprog->len; 3409 if (!len) 3410 /* User space only enquires number of filter blocks. */ 3411 goto out; 3412 3413 ret = -EINVAL; 3414 if (len < fprog->len) 3415 goto out; 3416 3417 ret = -EFAULT; 3418 if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) 3419 goto out; 3420 3421 /* Instead of bytes, the API requests to return the number 3422 * of filter blocks. 3423 */ 3424 ret = fprog->len; 3425 out: 3426 release_sock(sk); 3427 return ret; 3428 } 3429
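
/* Rough user-space sketch of the classic filter lifecycle that ends up in
 * the two functions above (error handling omitted; 'fd' is a hypothetical
 * connected socket):
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_LD  | BPF_W | BPF_LEN, 0),	// A = skb->len
 *		BPF_STMT(BPF_RET | BPF_A, 0),		// accept whole packet
 *	};
 *	struct sock_fprog fprog = { .len = 2, .filter = code };
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 *	// SO_GET_FILTER is serviced by sk_get_filter(): a zero length only
 *	// queries the number of filter blocks, a large enough buffer returns
 *	// the originally attached classic program.
 *	setsockopt(fd, SOL_SOCKET, SO_DETACH_FILTER, &off, sizeof(off));
 */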