/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>

struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Preempt disable is needed to protect per-cpu redirect_info between
	 * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
	 * access to maps strictly require a rcu_read_lock() for protection,
	 * mixing with BH RCU lock doesn't work.
	 */
	preempt_disable();
	bpf_compute_data_pointers(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);

	switch (ret) {
	case BPF_OK:
	case BPF_LWT_REROUTE:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
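			/* Fall back to BPF_OK so the packet continues on its
			 * original path instead of being dropped.
			 */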
: "<unknown>"); 67 ret = BPF_OK; 68 } else { 69 skb_reset_mac_header(skb); 70 ret = skb_do_redirect(skb); 71 if (ret == 0) 72 ret = BPF_REDIRECT; 73 } 74 break; 75 76 case BPF_DROP: 77 kfree_skb(skb); 78 ret = -EPERM; 79 break; 80 81 default: 82 pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); 83 kfree_skb(skb); 84 ret = -EINVAL; 85 break; 86 } 87 88 preempt_enable(); 89 90 return ret; 91 } 92 93 static int bpf_lwt_input_reroute(struct sk_buff *skb) 94 { 95 int err = -EINVAL; 96 97 if (skb->protocol == htons(ETH_P_IP)) { 98 struct iphdr *iph = ip_hdr(skb); 99 100 err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 101 iph->tos, skb_dst(skb)->dev); 102 } else if (skb->protocol == htons(ETH_P_IPV6)) { 103 err = ipv6_stub->ipv6_route_input(skb); 104 } else { 105 err = -EAFNOSUPPORT; 106 } 107 108 if (err) 109 goto err; 110 return dst_input(skb); 111 112 err: 113 kfree_skb(skb); 114 return err; 115 } 116 117 static int bpf_input(struct sk_buff *skb) 118 { 119 struct dst_entry *dst = skb_dst(skb); 120 struct bpf_lwt *bpf; 121 int ret; 122 123 bpf = bpf_lwt_lwtunnel(dst->lwtstate); 124 if (bpf->in.prog) { 125 ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); 126 if (ret < 0) 127 return ret; 128 if (ret == BPF_LWT_REROUTE) 129 return bpf_lwt_input_reroute(skb); 130 } 131 132 if (unlikely(!dst->lwtstate->orig_input)) { 133 kfree_skb(skb); 134 return -EINVAL; 135 } 136 137 return dst->lwtstate->orig_input(skb); 138 } 139 140 static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 141 { 142 struct dst_entry *dst = skb_dst(skb); 143 struct bpf_lwt *bpf; 144 int ret; 145 146 bpf = bpf_lwt_lwtunnel(dst->lwtstate); 147 if (bpf->out.prog) { 148 ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); 149 if (ret < 0) 150 return ret; 151 } 152 153 if (unlikely(!dst->lwtstate->orig_output)) { 154 pr_warn_once("orig_output not set on dst for prog %s\n", 155 bpf->out.name); 156 kfree_skb(skb); 157 return -EINVAL; 158 } 159 160 return dst->lwtstate->orig_output(net, sk, skb); 161 } 162 163 static int xmit_check_hhlen(struct sk_buff *skb) 164 { 165 int hh_len = skb_dst(skb)->dev->hard_header_len; 166 167 if (skb_headroom(skb) < hh_len) { 168 int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); 169 170 if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) 171 return -ENOMEM; 172 } 173 174 return 0; 175 } 176 177 static int bpf_lwt_xmit_reroute(struct sk_buff *skb) 178 { 179 struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev); 180 int oif = l3mdev ? 
	struct dst_entry *dst = NULL;
	int err = -EAFNOSUPPORT;
	struct sock *sk;
	struct net *net;
	bool ipv4;

	if (skb->protocol == htons(ETH_P_IP))
		ipv4 = true;
	else if (skb->protocol == htons(ETH_P_IPV6))
		ipv4 = false;
	else
		goto err;

	sk = sk_to_full_sk(skb->sk);
	if (sk) {
		if (sk->sk_bound_dev_if)
			oif = sk->sk_bound_dev_if;
		net = sock_net(sk);
	} else {
		net = dev_net(skb_dst(skb)->dev);
	}

	if (ipv4) {
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {};
		struct rtable *rt;

		fl4.flowi4_oif = oif;
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_uid = sock_net_uid(net, sk);
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		fl4.flowi4_proto = iph->protocol;
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;

		rt = ip_route_output_key(net, &fl4);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			goto err;
		}
		dst = &rt->dst;
	} else {
		struct ipv6hdr *iph6 = ipv6_hdr(skb);
		struct flowi6 fl6 = {};

		fl6.flowi6_oif = oif;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_uid = sock_net_uid(net, sk);
		fl6.flowlabel = ip6_flowinfo(iph6);
		fl6.flowi6_proto = iph6->nexthdr;
		fl6.daddr = iph6->daddr;
		fl6.saddr = iph6->saddr;

		err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
		if (unlikely(err))
			goto err;
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			goto err;
		}
	}
	if (unlikely(dst->error)) {
		err = dst->error;
		dst_release(dst);
		goto err;
	}

	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
	 * was done for the previous dst, so we are doing it here again, in
	 * case the new dst needs much more space. The call below is a noop
	 * if there is enough header space in skb.
	 */
	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
	if (unlikely(err))
		goto err;

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(err))
		return err;

	/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
	return LWTUNNEL_XMIT_DONE;

err:
	kfree_skb(skb);
	return err;
}

static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		__be16 proto = skb->protocol;
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the header changed, e.g. via bpf_lwt_push_encap,
			 * BPF_LWT_REROUTE below should have been used if the
			 * protocol was also changed.
			 */
			if (skb->protocol != proto) {
				kfree_skb(skb);
				return -EINVAL;
			}
			/* If the header was expanded, headroom might be too
			 * small for L2 header to come, expand as needed.
			 */
			ret = xmit_check_hhlen(skb);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		case BPF_LWT_REROUTE:
			return bpf_lwt_xmit_reroute(skb);
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
			       NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]		= { .type = NLA_NESTED, },
	[LWT_BPF_OUT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]		= { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
};

static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
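	/* Release any programs (and their names) attached so far before
	 * freeing the half-built state.
	 */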
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}

static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
			   int encap_len)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	gso_type |= SKB_GSO_DODGY;
	shinfo->gso_type |= gso_type;
	skb_decrease_gso_size(shinfo, encap_len);
	shinfo->gso_segs = 0;
	return 0;
}

static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
	int next_hdr_offset;
	void *next_hdr;
	__u8 protocol;

	/* SCTP and UDP_L4 gso need more nuanced handling than what
	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
	 * So at the moment only TCP GSO packets are let through.
	 */
	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
		return -ENOTSUPP;

	if (ipv4) {
		protocol = ip_hdr(skb)->protocol;
		next_hdr_offset = sizeof(struct iphdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	} else {
		protocol = ipv6_hdr(skb)->nexthdr;
		next_hdr_offset = sizeof(struct ipv6hdr);
		next_hdr = skb_network_header(skb) + next_hdr_offset;
	}

	switch (protocol) {
	case IPPROTO_GRE:
		next_hdr_offset += sizeof(struct gre_base_hdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);

	case IPPROTO_UDP:
		next_hdr_offset += sizeof(struct udphdr);
		if (next_hdr_offset > encap_len)
			return -EINVAL;

		if (((struct udphdr *)next_hdr)->check)
			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
					       encap_len);
		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);

	case IPPROTO_IP:
	case IPPROTO_IPV6:
		if (ipv4)
			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
		else
			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);

	default:
		return -EPROTONOSUPPORT;
	}
}

int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
	struct iphdr *iph;
	bool ipv4;
	int err;

	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
		return -EINVAL;

	/* validate protocol and length */
	iph = (struct iphdr *)hdr;
	if (iph->version == 4) {
		ipv4 = true;
		if (unlikely(len < iph->ihl * 4))
			return -EINVAL;
	} else if (iph->version == 6) {
		ipv4 = false;
		if (unlikely(len < sizeof(struct ipv6hdr)))
			return -EINVAL;
	} else {
		return -EINVAL;
	}

	if (ingress)
		err = skb_cow_head(skb, len + skb->mac_len);
	else
		err = skb_cow_head(skb,
				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
	if (unlikely(err))
		return err;

	/* push the encap headers and fix pointers */
	skb_reset_inner_headers(skb);
	skb_reset_inner_mac_header(skb); /* mac header is not yet set */
	skb_set_inner_protocol(skb, skb->protocol);
	skb->encapsulation = 1;
	skb_push(skb, len);
	if (ingress)
		skb_postpush_rcsum(skb, iph, len);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), hdr, len);
	bpf_compute_data_pointers(skb);
	skb_clear_hash(skb);

	if (ipv4) {
		skb->protocol = htons(ETH_P_IP);
		iph = ip_hdr(skb);

		if (!iph->check)
			iph->check = ip_fast_csum((unsigned char *)iph,
						  iph->ihl);
	} else {
		skb->protocol = htons(ETH_P_IPV6);
	}

	if (skb_is_gso(skb))
		return handle_gso_encap(skb, ipv4, len);

	return 0;
}

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)
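
/* Usage sketch (kept as a comment so this file is unaffected): the hooks
 * above run programs of type BPF_PROG_TYPE_LWT_IN, _OUT and _XMIT attached
 * per route, e.g. with iproute2's "encap bpf" route attribute:
 *
 *	ip route add 192.0.2.0/24 encap bpf xmit obj lwt_prog.o section xmit \
 *		dev eth0
 *
 * Object, section and device names are illustrative. A minimal xmit program,
 * assuming libbpf-style SEC() annotations, only needs to return one of the
 * codes dispatched on in bpf_xmit():
 *
 *	SEC("xmit")
 *	int xmit_prog(struct __sk_buff *skb)
 *	{
 *		return BPF_OK;	// continue on the regular output path
 *	}
 */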