1 /* 2 * vrf.c: device driver to encapsulate a VRF space 3 * 4 * Copyright (c) 2015 Cumulus Networks. All rights reserved. 5 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> 6 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> 7 * 8 * Based on dummy, team and ipvlan drivers 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU General Public License as published by 12 * the Free Software Foundation; either version 2 of the License, or 13 * (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <linux/kernel.h> 18 #include <linux/netdevice.h> 19 #include <linux/etherdevice.h> 20 #include <linux/ip.h> 21 #include <linux/init.h> 22 #include <linux/moduleparam.h> 23 #include <linux/netfilter.h> 24 #include <linux/rtnetlink.h> 25 #include <net/rtnetlink.h> 26 #include <linux/u64_stats_sync.h> 27 #include <linux/hashtable.h> 28 29 #include <linux/inetdevice.h> 30 #include <net/arp.h> 31 #include <net/ip.h> 32 #include <net/ip_fib.h> 33 #include <net/ip6_fib.h> 34 #include <net/ip6_route.h> 35 #include <net/route.h> 36 #include <net/addrconf.h> 37 #include <net/l3mdev.h> 38 39 #define RT_FL_TOS(oldflp4) \ 40 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) 41 42 #define DRV_NAME "vrf" 43 #define DRV_VERSION "1.0" 44 45 #define vrf_master_get_rcu(dev) \ 46 ((struct net_device *)rcu_dereference(dev->rx_handler_data)) 47 48 struct net_vrf { 49 struct rtable *rth; 50 struct rt6_info *rt6; 51 u32 tb_id; 52 }; 53 54 struct pcpu_dstats { 55 u64 tx_pkts; 56 u64 tx_bytes; 57 u64 tx_drps; 58 u64 rx_pkts; 59 u64 rx_bytes; 60 struct u64_stats_sync syncp; 61 }; 62 63 static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie) 64 { 65 return dst; 66 } 67 68 static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) 69 { 70 return ip_local_out(net, sk, skb); 71 } 72 73 static unsigned int vrf_v4_mtu(const struct dst_entry *dst) 74 { 75 /* TO-DO: return max ethernet size? */ 76 return dst->dev->mtu; 77 } 78 79 static void vrf_dst_destroy(struct dst_entry *dst) 80 { 81 /* our dst lives forever - or until the device is closed */ 82 } 83 84 static unsigned int vrf_default_advmss(const struct dst_entry *dst) 85 { 86 return 65535 - 40; 87 } 88 89 static struct dst_ops vrf_dst_ops = { 90 .family = AF_INET, 91 .local_out = vrf_ip_local_out, 92 .check = vrf_ip_check, 93 .mtu = vrf_v4_mtu, 94 .destroy = vrf_dst_destroy, 95 .default_advmss = vrf_default_advmss, 96 }; 97 98 /* neighbor handling is done with actual device; do not want 99 * to flip skb->dev for those ndisc packets. This really fails 100 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is 101 * a start. 102 */ 103 #if IS_ENABLED(CONFIG_IPV6) 104 static bool check_ipv6_frame(const struct sk_buff *skb) 105 { 106 const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data; 107 size_t hlen = sizeof(*ipv6h); 108 bool rc = true; 109 110 if (skb->len < hlen) 111 goto out; 112 113 if (ipv6h->nexthdr == NEXTHDR_ICMP) { 114 const struct icmp6hdr *icmph; 115 116 if (skb->len < hlen + sizeof(*icmph)) 117 goto out; 118 119 icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h)); 120 switch (icmph->icmp6_type) { 121 case NDISC_ROUTER_SOLICITATION: 122 case NDISC_ROUTER_ADVERTISEMENT: 123 case NDISC_NEIGHBOUR_SOLICITATION: 124 case NDISC_NEIGHBOUR_ADVERTISEMENT: 125 case NDISC_REDIRECT: 126 rc = false; 127 break; 128 } 129 } 130 131 out: 132 return rc; 133 } 134 #else 135 static bool check_ipv6_frame(const struct sk_buff *skb) 136 { 137 return false; 138 } 139 #endif 140 141 static bool is_ip_rx_frame(struct sk_buff *skb) 142 { 143 switch (skb->protocol) { 144 case htons(ETH_P_IP): 145 return true; 146 case htons(ETH_P_IPV6): 147 return check_ipv6_frame(skb); 148 } 149 return false; 150 } 151 152 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) 153 { 154 vrf_dev->stats.tx_errors++; 155 kfree_skb(skb); 156 } 157 158 /* note: already called with rcu_read_lock */ 159 static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb) 160 { 161 struct sk_buff *skb = *pskb; 162 163 if (is_ip_rx_frame(skb)) { 164 struct net_device *dev = vrf_master_get_rcu(skb->dev); 165 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 166 167 u64_stats_update_begin(&dstats->syncp); 168 dstats->rx_pkts++; 169 dstats->rx_bytes += skb->len; 170 u64_stats_update_end(&dstats->syncp); 171 172 skb->dev = dev; 173 174 return RX_HANDLER_ANOTHER; 175 } 176 return RX_HANDLER_PASS; 177 } 178 179 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev, 180 struct rtnl_link_stats64 *stats) 181 { 182 int i; 183 184 for_each_possible_cpu(i) { 185 const struct pcpu_dstats *dstats; 186 u64 tbytes, tpkts, tdrops, rbytes, rpkts; 187 unsigned int start; 188 189 dstats = per_cpu_ptr(dev->dstats, i); 190 do { 191 start = u64_stats_fetch_begin_irq(&dstats->syncp); 192 tbytes = dstats->tx_bytes; 193 tpkts = dstats->tx_pkts; 194 tdrops = dstats->tx_drps; 195 rbytes = dstats->rx_bytes; 196 rpkts = dstats->rx_pkts; 197 } while (u64_stats_fetch_retry_irq(&dstats->syncp, start)); 198 stats->tx_bytes += tbytes; 199 stats->tx_packets += tpkts; 200 stats->tx_dropped += tdrops; 201 stats->rx_bytes += rbytes; 202 stats->rx_packets += rpkts; 203 } 204 return stats; 205 } 206 207 #if IS_ENABLED(CONFIG_IPV6) 208 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 209 struct net_device *dev) 210 { 211 const struct ipv6hdr *iph = ipv6_hdr(skb); 212 struct net *net = dev_net(skb->dev); 213 struct flowi6 fl6 = { 214 /* needed to match OIF rule */ 215 .flowi6_oif = dev->ifindex, 216 .flowi6_iif = LOOPBACK_IFINDEX, 217 .daddr = iph->daddr, 218 .saddr = iph->saddr, 219 .flowlabel = ip6_flowinfo(iph), 220 .flowi6_mark = skb->mark, 221 .flowi6_proto = iph->nexthdr, 222 .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, 223 }; 224 int ret = NET_XMIT_DROP; 225 struct dst_entry *dst; 226 struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; 227 228 dst = ip6_route_output(net, NULL, &fl6); 229 if (dst == dst_null) 230 goto err; 231 232 skb_dst_drop(skb); 233 skb_dst_set(skb, dst); 234 235 ret = ip6_local_out(net, skb->sk, skb); 236 if (unlikely(net_xmit_eval(ret))) 237 dev->stats.tx_errors++; 238 else 239 ret = NET_XMIT_SUCCESS; 240 241 return ret; 242 err: 243 vrf_tx_error(dev, skb); 244 return NET_XMIT_DROP; 245 } 246 #else 247 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 248 struct net_device *dev) 249 { 250 vrf_tx_error(dev, skb); 251 return NET_XMIT_DROP; 252 } 253 #endif 254 255 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4, 256 struct net_device *vrf_dev) 257 { 258 struct rtable *rt; 259 int err = 1; 260 261 rt = ip_route_output_flow(dev_net(vrf_dev), fl4, NULL); 262 if (IS_ERR(rt)) 263 goto out; 264 265 /* TO-DO: what about broadcast ? */ 266 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { 267 ip_rt_put(rt); 268 goto out; 269 } 270 271 skb_dst_drop(skb); 272 skb_dst_set(skb, &rt->dst); 273 err = 0; 274 out: 275 return err; 276 } 277 278 static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, 279 struct net_device *vrf_dev) 280 { 281 struct iphdr *ip4h = ip_hdr(skb); 282 int ret = NET_XMIT_DROP; 283 struct flowi4 fl4 = { 284 /* needed to match OIF rule */ 285 .flowi4_oif = vrf_dev->ifindex, 286 .flowi4_iif = LOOPBACK_IFINDEX, 287 .flowi4_tos = RT_TOS(ip4h->tos), 288 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | 289 FLOWI_FLAG_SKIP_NH_OIF, 290 .daddr = ip4h->daddr, 291 }; 292 293 if (vrf_send_v4_prep(skb, &fl4, vrf_dev)) 294 goto err; 295 296 if (!ip4h->saddr) { 297 ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, 298 RT_SCOPE_LINK); 299 } 300 301 ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 302 if (unlikely(net_xmit_eval(ret))) 303 vrf_dev->stats.tx_errors++; 304 else 305 ret = NET_XMIT_SUCCESS; 306 307 out: 308 return ret; 309 err: 310 vrf_tx_error(vrf_dev, skb); 311 goto out; 312 } 313 314 static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) 315 { 316 /* strip the ethernet header added for pass through VRF device */ 317 __skb_pull(skb, skb_network_offset(skb)); 318 319 switch (skb->protocol) { 320 case htons(ETH_P_IP): 321 return vrf_process_v4_outbound(skb, dev); 322 case htons(ETH_P_IPV6): 323 return vrf_process_v6_outbound(skb, dev); 324 default: 325 vrf_tx_error(dev, skb); 326 return NET_XMIT_DROP; 327 } 328 } 329 330 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) 331 { 332 netdev_tx_t ret = is_ip_tx_frame(skb, dev); 333 334 if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { 335 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 336 337 u64_stats_update_begin(&dstats->syncp); 338 dstats->tx_pkts++; 339 dstats->tx_bytes += skb->len; 340 u64_stats_update_end(&dstats->syncp); 341 } else { 342 this_cpu_inc(dev->dstats->tx_drps); 343 } 344 345 return ret; 346 } 347 348 #if IS_ENABLED(CONFIG_IPV6) 349 static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie) 350 { 351 return dst; 352 } 353 354 static struct dst_ops vrf_dst_ops6 = { 355 .family = AF_INET6, 356 .local_out = ip6_local_out, 357 .check = vrf_ip6_check, 358 .mtu = vrf_v4_mtu, 359 .destroy = vrf_dst_destroy, 360 .default_advmss = vrf_default_advmss, 361 }; 362 363 static int init_dst_ops6_kmem_cachep(void) 364 { 365 vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache", 366 sizeof(struct rt6_info), 367 0, 368 SLAB_HWCACHE_ALIGN, 369 NULL); 370 371 if (!vrf_dst_ops6.kmem_cachep) 372 return -ENOMEM; 373 374 return 0; 375 } 376 377 static void free_dst_ops6_kmem_cachep(void) 378 { 379 kmem_cache_destroy(vrf_dst_ops6.kmem_cachep); 380 } 381 382 static int vrf_input6(struct sk_buff *skb) 383 { 384 skb->dev->stats.rx_errors++; 385 kfree_skb(skb); 386 return 0; 387 } 388 389 /* modelled after ip6_finish_output2 */ 390 static int vrf_finish_output6(struct net *net, struct sock *sk, 391 struct sk_buff *skb) 392 { 393 struct dst_entry *dst = skb_dst(skb); 394 struct net_device *dev = dst->dev; 395 struct neighbour *neigh; 396 struct in6_addr *nexthop; 397 int ret; 398 399 skb->protocol = htons(ETH_P_IPV6); 400 skb->dev = dev; 401 402 rcu_read_lock_bh(); 403 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 404 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 405 if (unlikely(!neigh)) 406 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 407 if (!IS_ERR(neigh)) { 408 ret = dst_neigh_output(dst, neigh, skb); 409 rcu_read_unlock_bh(); 410 return ret; 411 } 412 rcu_read_unlock_bh(); 413 414 IP6_INC_STATS(dev_net(dst->dev), 415 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 416 kfree_skb(skb); 417 return -EINVAL; 418 } 419 420 /* modelled after ip6_output */ 421 static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) 422 { 423 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 424 net, sk, skb, NULL, skb_dst(skb)->dev, 425 vrf_finish_output6, 426 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 427 } 428 429 static void vrf_rt6_destroy(struct net_vrf *vrf) 430 { 431 dst_destroy(&vrf->rt6->dst); 432 free_percpu(vrf->rt6->rt6i_pcpu); 433 vrf->rt6 = NULL; 434 } 435 436 static int vrf_rt6_create(struct net_device *dev) 437 { 438 struct net_vrf *vrf = netdev_priv(dev); 439 struct dst_entry *dst; 440 struct rt6_info *rt6; 441 int cpu; 442 int rc = -ENOMEM; 443 444 rt6 = dst_alloc(&vrf_dst_ops6, dev, 0, 445 DST_OBSOLETE_NONE, 446 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 447 if (!rt6) 448 goto out; 449 450 dst = &rt6->dst; 451 452 rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); 453 if (!rt6->rt6i_pcpu) { 454 dst_destroy(dst); 455 goto out; 456 } 457 for_each_possible_cpu(cpu) { 458 struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu); 459 *p = NULL; 460 } 461 462 memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst)); 463 464 INIT_LIST_HEAD(&rt6->rt6i_siblings); 465 INIT_LIST_HEAD(&rt6->rt6i_uncached); 466 467 rt6->dst.input = vrf_input6; 468 rt6->dst.output = vrf_output6; 469 470 rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id); 471 472 atomic_set(&rt6->dst.__refcnt, 2); 473 474 vrf->rt6 = rt6; 475 rc = 0; 476 out: 477 return rc; 478 } 479 #else 480 static int init_dst_ops6_kmem_cachep(void) 481 { 482 return 0; 483 } 484 485 static void free_dst_ops6_kmem_cachep(void) 486 { 487 } 488 489 static void vrf_rt6_destroy(struct net_vrf *vrf) 490 { 491 } 492 493 static int vrf_rt6_create(struct net_device *dev) 494 { 495 return 0; 496 } 497 #endif 498 499 /* modelled after ip_finish_output2 */ 500 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 501 { 502 struct dst_entry *dst = skb_dst(skb); 503 struct rtable *rt = (struct rtable *)dst; 504 struct net_device *dev = dst->dev; 505 unsigned int hh_len = LL_RESERVED_SPACE(dev); 506 struct neighbour *neigh; 507 u32 nexthop; 508 int ret = -EINVAL; 509 510 /* Be paranoid, rather than too clever. */ 511 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 512 struct sk_buff *skb2; 513 514 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 515 if (!skb2) { 516 ret = -ENOMEM; 517 goto err; 518 } 519 if (skb->sk) 520 skb_set_owner_w(skb2, skb->sk); 521 522 consume_skb(skb); 523 skb = skb2; 524 } 525 526 rcu_read_lock_bh(); 527 528 nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); 529 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 530 if (unlikely(!neigh)) 531 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 532 if (!IS_ERR(neigh)) 533 ret = dst_neigh_output(dst, neigh, skb); 534 535 rcu_read_unlock_bh(); 536 err: 537 if (unlikely(ret < 0)) 538 vrf_tx_error(skb->dev, skb); 539 return ret; 540 } 541 542 static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 543 { 544 struct net_device *dev = skb_dst(skb)->dev; 545 546 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); 547 548 skb->dev = dev; 549 skb->protocol = htons(ETH_P_IP); 550 551 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, 552 net, sk, skb, NULL, dev, 553 vrf_finish_output, 554 !(IPCB(skb)->flags & IPSKB_REROUTED)); 555 } 556 557 static void vrf_rtable_destroy(struct net_vrf *vrf) 558 { 559 struct dst_entry *dst = (struct dst_entry *)vrf->rth; 560 561 dst_destroy(dst); 562 vrf->rth = NULL; 563 } 564 565 static struct rtable *vrf_rtable_create(struct net_device *dev) 566 { 567 struct net_vrf *vrf = netdev_priv(dev); 568 struct rtable *rth; 569 570 rth = dst_alloc(&vrf_dst_ops, dev, 2, 571 DST_OBSOLETE_NONE, 572 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 573 if (rth) { 574 rth->dst.output = vrf_output; 575 rth->rt_genid = rt_genid_ipv4(dev_net(dev)); 576 rth->rt_flags = 0; 577 rth->rt_type = RTN_UNICAST; 578 rth->rt_is_input = 0; 579 rth->rt_iif = 0; 580 rth->rt_pmtu = 0; 581 rth->rt_gateway = 0; 582 rth->rt_uses_gateway = 0; 583 rth->rt_table_id = vrf->tb_id; 584 INIT_LIST_HEAD(&rth->rt_uncached); 585 rth->rt_uncached_list = NULL; 586 } 587 588 return rth; 589 } 590 591 /**************************** device handling ********************/ 592 593 /* cycle interface to flush neighbor cache and move routes across tables */ 594 static void cycle_netdev(struct net_device *dev) 595 { 596 unsigned int flags = dev->flags; 597 int ret; 598 599 if (!netif_running(dev)) 600 return; 601 602 ret = dev_change_flags(dev, flags & ~IFF_UP); 603 if (ret >= 0) 604 ret = dev_change_flags(dev, flags); 605 606 if (ret < 0) { 607 netdev_err(dev, 608 "Failed to cycle device %s; route tables might be wrong!\n", 609 dev->name); 610 } 611 } 612 613 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 614 { 615 int ret; 616 617 /* register the packet handler for slave ports */ 618 ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev); 619 if (ret) { 620 netdev_err(port_dev, 621 "Device %s failed to register rx_handler\n", 622 port_dev->name); 623 goto out_fail; 624 } 625 626 ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL); 627 if (ret < 0) 628 goto out_unregister; 629 630 port_dev->priv_flags |= IFF_L3MDEV_SLAVE; 631 cycle_netdev(port_dev); 632 633 return 0; 634 635 out_unregister: 636 netdev_rx_handler_unregister(port_dev); 637 out_fail: 638 return ret; 639 } 640 641 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 642 { 643 if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) 644 return -EINVAL; 645 646 return do_vrf_add_slave(dev, port_dev); 647 } 648 649 /* inverse of do_vrf_add_slave */ 650 static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 651 { 652 netdev_upper_dev_unlink(port_dev, dev); 653 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; 654 655 netdev_rx_handler_unregister(port_dev); 656 657 cycle_netdev(port_dev); 658 659 return 0; 660 } 661 662 static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 663 { 664 return do_vrf_del_slave(dev, port_dev); 665 } 666 667 static void vrf_dev_uninit(struct net_device *dev) 668 { 669 struct net_vrf *vrf = netdev_priv(dev); 670 struct net_device *port_dev; 671 struct list_head *iter; 672 673 vrf_rtable_destroy(vrf); 674 vrf_rt6_destroy(vrf); 675 676 netdev_for_each_lower_dev(dev, port_dev, iter) 677 vrf_del_slave(dev, port_dev); 678 679 free_percpu(dev->dstats); 680 dev->dstats = NULL; 681 } 682 683 static int vrf_dev_init(struct net_device *dev) 684 { 685 struct net_vrf *vrf = netdev_priv(dev); 686 687 dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); 688 if (!dev->dstats) 689 goto out_nomem; 690 691 /* create the default dst which points back to us */ 692 vrf->rth = vrf_rtable_create(dev); 693 if (!vrf->rth) 694 goto out_stats; 695 696 if (vrf_rt6_create(dev) != 0) 697 goto out_rth; 698 699 dev->flags = IFF_MASTER | IFF_NOARP; 700 701 return 0; 702 703 out_rth: 704 vrf_rtable_destroy(vrf); 705 out_stats: 706 free_percpu(dev->dstats); 707 dev->dstats = NULL; 708 out_nomem: 709 return -ENOMEM; 710 } 711 712 static const struct net_device_ops vrf_netdev_ops = { 713 .ndo_init = vrf_dev_init, 714 .ndo_uninit = vrf_dev_uninit, 715 .ndo_start_xmit = vrf_xmit, 716 .ndo_get_stats64 = vrf_get_stats64, 717 .ndo_add_slave = vrf_add_slave, 718 .ndo_del_slave = vrf_del_slave, 719 }; 720 721 static u32 vrf_fib_table(const struct net_device *dev) 722 { 723 struct net_vrf *vrf = netdev_priv(dev); 724 725 return vrf->tb_id; 726 } 727 728 static struct rtable *vrf_get_rtable(const struct net_device *dev, 729 const struct flowi4 *fl4) 730 { 731 struct rtable *rth = NULL; 732 733 if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { 734 struct net_vrf *vrf = netdev_priv(dev); 735 736 rth = vrf->rth; 737 atomic_inc(&rth->dst.__refcnt); 738 } 739 740 return rth; 741 } 742 743 /* called under rcu_read_lock */ 744 static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) 745 { 746 struct fib_result res = { .tclassid = 0 }; 747 struct net *net = dev_net(dev); 748 u32 orig_tos = fl4->flowi4_tos; 749 u8 flags = fl4->flowi4_flags; 750 u8 scope = fl4->flowi4_scope; 751 u8 tos = RT_FL_TOS(fl4); 752 int rc; 753 754 if (unlikely(!fl4->daddr)) 755 return 0; 756 757 fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; 758 fl4->flowi4_iif = LOOPBACK_IFINDEX; 759 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 760 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 761 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 762 763 rc = fib_lookup(net, fl4, &res, 0); 764 if (!rc) { 765 if (res.type == RTN_LOCAL) 766 fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr; 767 else 768 fib_select_path(net, &res, fl4, -1); 769 } 770 771 fl4->flowi4_flags = flags; 772 fl4->flowi4_tos = orig_tos; 773 fl4->flowi4_scope = scope; 774 775 return rc; 776 } 777 778 #if IS_ENABLED(CONFIG_IPV6) 779 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, 780 const struct flowi6 *fl6) 781 { 782 struct rt6_info *rt = NULL; 783 784 if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { 785 struct net_vrf *vrf = netdev_priv(dev); 786 787 rt = vrf->rt6; 788 atomic_inc(&rt->dst.__refcnt); 789 } 790 791 return (struct dst_entry *)rt; 792 } 793 #endif 794 795 static const struct l3mdev_ops vrf_l3mdev_ops = { 796 .l3mdev_fib_table = vrf_fib_table, 797 .l3mdev_get_rtable = vrf_get_rtable, 798 .l3mdev_get_saddr = vrf_get_saddr, 799 #if IS_ENABLED(CONFIG_IPV6) 800 .l3mdev_get_rt6_dst = vrf_get_rt6_dst, 801 #endif 802 }; 803 804 static void vrf_get_drvinfo(struct net_device *dev, 805 struct ethtool_drvinfo *info) 806 { 807 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 808 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 809 } 810 811 static const struct ethtool_ops vrf_ethtool_ops = { 812 .get_drvinfo = vrf_get_drvinfo, 813 }; 814 815 static void vrf_setup(struct net_device *dev) 816 { 817 ether_setup(dev); 818 819 /* Initialize the device structure. */ 820 dev->netdev_ops = &vrf_netdev_ops; 821 dev->l3mdev_ops = &vrf_l3mdev_ops; 822 dev->ethtool_ops = &vrf_ethtool_ops; 823 dev->destructor = free_netdev; 824 825 /* Fill in device structure with ethernet-generic values. */ 826 eth_hw_addr_random(dev); 827 828 /* don't acquire vrf device's netif_tx_lock when transmitting */ 829 dev->features |= NETIF_F_LLTX; 830 831 /* don't allow vrf devices to change network namespaces. */ 832 dev->features |= NETIF_F_NETNS_LOCAL; 833 } 834 835 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[]) 836 { 837 if (tb[IFLA_ADDRESS]) { 838 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 839 return -EINVAL; 840 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 841 return -EADDRNOTAVAIL; 842 } 843 return 0; 844 } 845 846 static void vrf_dellink(struct net_device *dev, struct list_head *head) 847 { 848 unregister_netdevice_queue(dev, head); 849 } 850 851 static int vrf_newlink(struct net *src_net, struct net_device *dev, 852 struct nlattr *tb[], struct nlattr *data[]) 853 { 854 struct net_vrf *vrf = netdev_priv(dev); 855 856 if (!data || !data[IFLA_VRF_TABLE]) 857 return -EINVAL; 858 859 vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); 860 861 dev->priv_flags |= IFF_L3MDEV_MASTER; 862 863 return register_netdevice(dev); 864 } 865 866 static size_t vrf_nl_getsize(const struct net_device *dev) 867 { 868 return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ 869 } 870 871 static int vrf_fillinfo(struct sk_buff *skb, 872 const struct net_device *dev) 873 { 874 struct net_vrf *vrf = netdev_priv(dev); 875 876 return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); 877 } 878 879 static size_t vrf_get_slave_size(const struct net_device *bond_dev, 880 const struct net_device *slave_dev) 881 { 882 return nla_total_size(sizeof(u32)); /* IFLA_VRF_PORT_TABLE */ 883 } 884 885 static int vrf_fill_slave_info(struct sk_buff *skb, 886 const struct net_device *vrf_dev, 887 const struct net_device *slave_dev) 888 { 889 struct net_vrf *vrf = netdev_priv(vrf_dev); 890 891 if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id)) 892 return -EMSGSIZE; 893 894 return 0; 895 } 896 897 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { 898 [IFLA_VRF_TABLE] = { .type = NLA_U32 }, 899 }; 900 901 static struct rtnl_link_ops vrf_link_ops __read_mostly = { 902 .kind = DRV_NAME, 903 .priv_size = sizeof(struct net_vrf), 904 905 .get_size = vrf_nl_getsize, 906 .policy = vrf_nl_policy, 907 .validate = vrf_validate, 908 .fill_info = vrf_fillinfo, 909 910 .get_slave_size = vrf_get_slave_size, 911 .fill_slave_info = vrf_fill_slave_info, 912 913 .newlink = vrf_newlink, 914 .dellink = vrf_dellink, 915 .setup = vrf_setup, 916 .maxtype = IFLA_VRF_MAX, 917 }; 918 919 static int vrf_device_event(struct notifier_block *unused, 920 unsigned long event, void *ptr) 921 { 922 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 923 924 /* only care about unregister events to drop slave references */ 925 if (event == NETDEV_UNREGISTER) { 926 struct net_device *vrf_dev; 927 928 if (!netif_is_l3_slave(dev)) 929 goto out; 930 931 vrf_dev = netdev_master_upper_dev_get(dev); 932 vrf_del_slave(vrf_dev, dev); 933 } 934 out: 935 return NOTIFY_DONE; 936 } 937 938 static struct notifier_block vrf_notifier_block __read_mostly = { 939 .notifier_call = vrf_device_event, 940 }; 941 942 static int __init vrf_init_module(void) 943 { 944 int rc; 945 946 vrf_dst_ops.kmem_cachep = 947 kmem_cache_create("vrf_ip_dst_cache", 948 sizeof(struct rtable), 0, 949 SLAB_HWCACHE_ALIGN, 950 NULL); 951 952 if (!vrf_dst_ops.kmem_cachep) 953 return -ENOMEM; 954 955 rc = init_dst_ops6_kmem_cachep(); 956 if (rc != 0) 957 goto error2; 958 959 register_netdevice_notifier(&vrf_notifier_block); 960 961 rc = rtnl_link_register(&vrf_link_ops); 962 if (rc < 0) 963 goto error; 964 965 return 0; 966 967 error: 968 unregister_netdevice_notifier(&vrf_notifier_block); 969 free_dst_ops6_kmem_cachep(); 970 error2: 971 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 972 return rc; 973 } 974 975 static void __exit vrf_cleanup_module(void) 976 { 977 rtnl_link_unregister(&vrf_link_ops); 978 unregister_netdevice_notifier(&vrf_notifier_block); 979 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 980 free_dst_ops6_kmem_cachep(); 981 } 982 983 module_init(vrf_init_module); 984 module_exit(vrf_cleanup_module); 985 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); 986 MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); 987 MODULE_LICENSE("GPL"); 988 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 989 MODULE_VERSION(DRV_VERSION); 990