// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Preempt disable is needed to protect per-cpu redirect_info between
	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
	 * access to maps strictly require a rcu_read_lock() for protection,
	 * mixing with BH RCU lock doesn't work.
	 */
	preempt_disable();
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);

	switch (ret) {
	case BPF_OK:
	case BPF_LWT_REROUTE:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			skb_reset_mac_header(skb);
			ret = skb_do_redirect(skb);
			if (ret == 0)
				ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	preempt_enable();

	return ret;
}

static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
	int err = -EINVAL;

	if (skb->protocol == htons(ETH_P_IP)) {
		struct iphdr *iph = ip_hdr(skb);

		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, skb_dst(skb)->dev);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		err = ipv6_stub->ipv6_route_input(skb);
	} else {
		err = -EAFNOSUPPORT;
	}

	if (err)
		goto err;
	return dst_input(skb);

err:
	kfree_skb(skb);
	return err;
}

static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
		if (ret == BPF_LWT_REROUTE)
			return bpf_lwt_input_reroute(skb);
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

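/* Descriptive note (added comment): make sure the skb has enough headroom
 * for the outgoing device's link-layer header and expand the head if the
 * xmit program left too little room.
 */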
static int xmit_check_hhlen(struct sk_buff *skb)
{
	int hh_len = skb_dst(skb)->dev->hard_header_len;

	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
	int oif = l3mdev ? l3mdev->ifindex : 0;
	struct dst_entry *dst = NULL;
	int err = -EAFNOSUPPORT;
	struct sock *sk;
	struct net *net;
	bool ipv4;

	if (skb->protocol == htons(ETH_P_IP))
		ipv4 = true;
	else if (skb->protocol == htons(ETH_P_IPV6))
		ipv4 = false;
	else
		goto err;

	sk = sk_to_full_sk(skb->sk);
	if (sk) {
		if (sk->sk_bound_dev_if)
			oif = sk->sk_bound_dev_if;
		net = sock_net(sk);
	} else {
		net = dev_net(skb_dst(skb)->dev);
	}

	if (ipv4) {
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {};
		struct rtable *rt;

		fl4.flowi4_oif = oif;
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_uid = sock_net_uid(net, sk);
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		fl4.flowi4_proto = iph->protocol;
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;

		rt = ip_route_output_key(net, &fl4);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			goto err;
		}
		dst = &rt->dst;
	} else {
		struct ipv6hdr *iph6 = ipv6_hdr(skb);
		struct flowi6 fl6 = {};

		fl6.flowi6_oif = oif;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_uid = sock_net_uid(net, sk);
		fl6.flowlabel = ip6_flowinfo(iph6);
		fl6.flowi6_proto = iph6->nexthdr;
		fl6.daddr = iph6->daddr;
		fl6.saddr = iph6->saddr;

		err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
		if (unlikely(err))
			goto err;
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto err;
		}
	}
	if (unlikely(dst->error)) {
		err = dst->error;
		dst_release(dst);
		goto err;
	}

	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
	 * was done for the previous dst, so we are doing it here again, in
	 * case the new dst needs much more space. The call below is a noop
	 * if there is enough header space in skb.
	 */
	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
	if (unlikely(err))
		goto err;

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(err))
		return err;

	/* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
	return LWTUNNEL_XMIT_DONE;

err:
	kfree_skb(skb);
	return err;
}

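/* Descriptive note (added comment): xmit hook dispatch. BPF_OK continues
 * transmission on the original dst (the L3 protocol must be unchanged and
 * headroom is re-checked), BPF_REDIRECT means the program already sent the
 * packet via skb_do_redirect(), and BPF_LWT_REROUTE requests a fresh route
 * lookup for the (possibly newly encapsulated) packet.
 */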
static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		__be16 proto = skb->protocol;
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header changed, e.g. via bpf_lwt_push_encap,
			 * BPF_LWT_REROUTE below should have been used if the
			 * protocol was also changed.
			 */
			if (skb->protocol != proto) {
				kfree_skb(skb);
				return -EINVAL;
			}
			/* If the header was expanded, headroom might be too
			 * small for the L2 header that will be added later;
			 * expand it as needed.
			 */
			ret = xmit_check_hhlen(skb);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		case BPF_LWT_REROUTE:
			return bpf_lwt_xmit_reroute(skb);
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
					  bpf_prog_policy, NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

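/* Descriptive note (added comment): parse the LWT_BPF_* netlink attributes
 * into a struct bpf_lwt and attach it to a new lwtunnel_state. For
 * illustration only (the object file name and section are placeholders, and
 * the exact iproute2 syntax may vary between versions), a route using this
 * encap is typically configured along the lines of:
 *
 *	ip route add 192.0.2.0/24 encap bpf xmit obj lwt_prog.o section xmit \
 *		headroom 40 dev eth0
 */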
static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
					  extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start_noflag(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
			   int encap_len)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	gso_type |= SKB_GSO_DODGY;
	shinfo->gso_type |= gso_type;
	skb_decrease_gso_size(shinfo, encap_len);
	shinfo->gso_segs = 0;
	return 0;
}

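/* Descriptive note (added comment): after an outer IP header (optionally
 * followed by GRE or UDP) has been pushed in front of a TCP GSO packet,
 * pick the matching SKB_GSO_* tunnel type. Other GSO types are rejected
 * because shrinking gso_size alone is not sufficient for them.
 */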
static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
	int next_hdr_offset;
	void *next_hdr;
	__u8 protocol;

	/* SCTP and UDP_L4 GSO need more nuanced handling than what
	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
	 * So at the moment only TCP GSO packets are let through.
	 */
	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		return -ENOTSUPP;

	if (ipv4) {
		protocol = ip_hdr(skb)->protocol;
		next_hdr_offset = sizeof(struct iphdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	} else {
		protocol = ipv6_hdr(skb)->nexthdr;
		next_hdr_offset = sizeof(struct ipv6hdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	}

	switch (protocol) {
	case IPPROTO_GRE:
		next_hdr_offset += sizeof(struct gre_base_hdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);

	case IPPROTO_UDP:
		next_hdr_offset += sizeof(struct udphdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct udphdr *)next_hdr)->check)
			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);

	case IPPROTO_IP:
	case IPPROTO_IPV6:
		if (ipv4)
			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
		else
			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);

	default:
		return -EPROTONOSUPPORT;
	}
}

int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
	struct iphdr *iph;
	bool ipv4;
	int err;

	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
		return -EINVAL;

	/* validate protocol and length */
	iph = (struct iphdr *)hdr;
	if (iph->version == 4) {
		ipv4 = true;
		if (unlikely(len < iph->ihl * 4))
			return -EINVAL;
	} else if (iph->version == 6) {
		ipv4 = false;
		if (unlikely(len < sizeof(struct ipv6hdr)))
			return -EINVAL;
	} else {
		return -EINVAL;
	}

	if (ingress)
		err = skb_cow_head(skb, len + skb->mac_len);
	else
		err = skb_cow_head(skb,
				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
	if (unlikely(err))
		return err;

	/* push the encap headers and fix pointers */
	skb_reset_inner_headers(skb);
	skb_reset_inner_mac_header(skb);  /* mac header is not yet set */
	skb_set_inner_protocol(skb, skb->protocol);
	skb->encapsulation = 1;
	skb_push(skb, len);
	if (ingress)
		skb_postpush_rcsum(skb, iph, len);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), hdr, len);
	bpf_compute_data_pointers(skb);
	skb_clear_hash(skb);

	if (ipv4) {
		skb->protocol = htons(ETH_P_IP);
		iph = ip_hdr(skb);

		if (!iph->check)
			iph->check = ip_fast_csum((unsigned char *)iph,
						  iph->ihl);
	} else {
		skb->protocol = htons(ETH_P_IPV6);
	}

	if (skb_is_gso(skb))
		return handle_gso_encap(skb, ipv4, len);

	return 0;
}

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)
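
/* For reference only, not part of this file: a minimal sketch of a program
 * usable with the in/out hooks handled above, built with libbpf. The section
 * name "lwt_in" follows the usual libbpf convention for BPF_PROG_TYPE_LWT_IN,
 * and the return codes are the bpf_ret_code values consumed by run_lwt_bpf().
 *
 *	#include <linux/bpf.h>
 *	#include <bpf/bpf_helpers.h>
 *
 *	SEC("lwt_in")
 *	int pass_all(struct __sk_buff *skb)
 *	{
 *		return BPF_OK;	// continue to orig_input(); BPF_DROP,
 *				// BPF_REDIRECT and BPF_LWT_REROUTE are the
 *				// other recognized return values
 *	}
 *
 *	char _license[] SEC("license") = "GPL";
 */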