1 /* 2 * vrf.c: device driver to encapsulate a VRF space 3 * 4 * Copyright (c) 2015 Cumulus Networks. All rights reserved. 5 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com> 6 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com> 7 * 8 * Based on dummy, team and ipvlan drivers 9 * 10 * This program is free software; you can redistribute it and/or modify 11 * it under the terms of the GNU General Public License as published by 12 * the Free Software Foundation; either version 2 of the License, or 13 * (at your option) any later version. 14 */ 15 16 #include <linux/module.h> 17 #include <linux/kernel.h> 18 #include <linux/netdevice.h> 19 #include <linux/etherdevice.h> 20 #include <linux/ip.h> 21 #include <linux/init.h> 22 #include <linux/moduleparam.h> 23 #include <linux/netfilter.h> 24 #include <linux/rtnetlink.h> 25 #include <net/rtnetlink.h> 26 #include <linux/u64_stats_sync.h> 27 #include <linux/hashtable.h> 28 29 #include <linux/inetdevice.h> 30 #include <net/arp.h> 31 #include <net/ip.h> 32 #include <net/ip_fib.h> 33 #include <net/ip6_fib.h> 34 #include <net/ip6_route.h> 35 #include <net/rtnetlink.h> 36 #include <net/route.h> 37 #include <net/addrconf.h> 38 #include <net/l3mdev.h> 39 40 #define RT_FL_TOS(oldflp4) \ 41 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) 42 43 #define DRV_NAME "vrf" 44 #define DRV_VERSION "1.0" 45 46 #define vrf_master_get_rcu(dev) \ 47 ((struct net_device *)rcu_dereference(dev->rx_handler_data)) 48 49 struct slave { 50 struct list_head list; 51 struct net_device *dev; 52 }; 53 54 struct slave_queue { 55 struct list_head all_slaves; 56 }; 57 58 struct net_vrf { 59 struct slave_queue queue; 60 struct rtable *rth; 61 struct rt6_info *rt6; 62 u32 tb_id; 63 }; 64 65 struct pcpu_dstats { 66 u64 tx_pkts; 67 u64 tx_bytes; 68 u64 tx_drps; 69 u64 rx_pkts; 70 u64 rx_bytes; 71 struct u64_stats_sync syncp; 72 }; 73 74 static struct dst_entry *vrf_ip_check(struct dst_entry *dst, u32 cookie) 75 { 76 return dst; 77 } 78 79 static int vrf_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) 80 { 81 return ip_local_out(net, sk, skb); 82 } 83 84 static unsigned int vrf_v4_mtu(const struct dst_entry *dst) 85 { 86 /* TO-DO: return max ethernet size? */ 87 return dst->dev->mtu; 88 } 89 90 static void vrf_dst_destroy(struct dst_entry *dst) 91 { 92 /* our dst lives forever - or until the device is closed */ 93 } 94 95 static unsigned int vrf_default_advmss(const struct dst_entry *dst) 96 { 97 return 65535 - 40; 98 } 99 100 static struct dst_ops vrf_dst_ops = { 101 .family = AF_INET, 102 .local_out = vrf_ip_local_out, 103 .check = vrf_ip_check, 104 .mtu = vrf_v4_mtu, 105 .destroy = vrf_dst_destroy, 106 .default_advmss = vrf_default_advmss, 107 }; 108 109 /* neighbor handling is done with actual device; do not want 110 * to flip skb->dev for those ndisc packets. This really fails 111 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is 112 * a start. 113 */ 114 #if IS_ENABLED(CONFIG_IPV6) 115 static bool check_ipv6_frame(const struct sk_buff *skb) 116 { 117 const struct ipv6hdr *ipv6h = (struct ipv6hdr *)skb->data; 118 size_t hlen = sizeof(*ipv6h); 119 bool rc = true; 120 121 if (skb->len < hlen) 122 goto out; 123 124 if (ipv6h->nexthdr == NEXTHDR_ICMP) { 125 const struct icmp6hdr *icmph; 126 127 if (skb->len < hlen + sizeof(*icmph)) 128 goto out; 129 130 icmph = (struct icmp6hdr *)(skb->data + sizeof(*ipv6h)); 131 switch (icmph->icmp6_type) { 132 case NDISC_ROUTER_SOLICITATION: 133 case NDISC_ROUTER_ADVERTISEMENT: 134 case NDISC_NEIGHBOUR_SOLICITATION: 135 case NDISC_NEIGHBOUR_ADVERTISEMENT: 136 case NDISC_REDIRECT: 137 rc = false; 138 break; 139 } 140 } 141 142 out: 143 return rc; 144 } 145 #else 146 static bool check_ipv6_frame(const struct sk_buff *skb) 147 { 148 return false; 149 } 150 #endif 151 152 static bool is_ip_rx_frame(struct sk_buff *skb) 153 { 154 switch (skb->protocol) { 155 case htons(ETH_P_IP): 156 return true; 157 case htons(ETH_P_IPV6): 158 return check_ipv6_frame(skb); 159 } 160 return false; 161 } 162 163 static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb) 164 { 165 vrf_dev->stats.tx_errors++; 166 kfree_skb(skb); 167 } 168 169 /* note: already called with rcu_read_lock */ 170 static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb) 171 { 172 struct sk_buff *skb = *pskb; 173 174 if (is_ip_rx_frame(skb)) { 175 struct net_device *dev = vrf_master_get_rcu(skb->dev); 176 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 177 178 u64_stats_update_begin(&dstats->syncp); 179 dstats->rx_pkts++; 180 dstats->rx_bytes += skb->len; 181 u64_stats_update_end(&dstats->syncp); 182 183 skb->dev = dev; 184 185 return RX_HANDLER_ANOTHER; 186 } 187 return RX_HANDLER_PASS; 188 } 189 190 static struct rtnl_link_stats64 *vrf_get_stats64(struct net_device *dev, 191 struct rtnl_link_stats64 *stats) 192 { 193 int i; 194 195 for_each_possible_cpu(i) { 196 const struct pcpu_dstats *dstats; 197 u64 tbytes, tpkts, tdrops, rbytes, rpkts; 198 unsigned int start; 199 200 dstats = per_cpu_ptr(dev->dstats, i); 201 do { 202 start = u64_stats_fetch_begin_irq(&dstats->syncp); 203 tbytes = dstats->tx_bytes; 204 tpkts = dstats->tx_pkts; 205 tdrops = dstats->tx_drps; 206 rbytes = dstats->rx_bytes; 207 rpkts = dstats->rx_pkts; 208 } while (u64_stats_fetch_retry_irq(&dstats->syncp, start)); 209 stats->tx_bytes += tbytes; 210 stats->tx_packets += tpkts; 211 stats->tx_dropped += tdrops; 212 stats->rx_bytes += rbytes; 213 stats->rx_packets += rpkts; 214 } 215 return stats; 216 } 217 218 #if IS_ENABLED(CONFIG_IPV6) 219 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 220 struct net_device *dev) 221 { 222 const struct ipv6hdr *iph = ipv6_hdr(skb); 223 struct net *net = dev_net(skb->dev); 224 struct flowi6 fl6 = { 225 /* needed to match OIF rule */ 226 .flowi6_oif = dev->ifindex, 227 .flowi6_iif = LOOPBACK_IFINDEX, 228 .daddr = iph->daddr, 229 .saddr = iph->saddr, 230 .flowlabel = ip6_flowinfo(iph), 231 .flowi6_mark = skb->mark, 232 .flowi6_proto = iph->nexthdr, 233 .flowi6_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF, 234 }; 235 int ret = NET_XMIT_DROP; 236 struct dst_entry *dst; 237 struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst; 238 239 dst = ip6_route_output(net, NULL, &fl6); 240 if (dst == dst_null) 241 goto err; 242 243 skb_dst_drop(skb); 244 skb_dst_set(skb, dst); 245 246 ret = ip6_local_out(net, skb->sk, skb); 247 if (unlikely(net_xmit_eval(ret))) 248 dev->stats.tx_errors++; 249 else 250 ret = NET_XMIT_SUCCESS; 251 252 return ret; 253 err: 254 vrf_tx_error(dev, skb); 255 return NET_XMIT_DROP; 256 } 257 #else 258 static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb, 259 struct net_device *dev) 260 { 261 vrf_tx_error(dev, skb); 262 return NET_XMIT_DROP; 263 } 264 #endif 265 266 static int vrf_send_v4_prep(struct sk_buff *skb, struct flowi4 *fl4, 267 struct net_device *vrf_dev) 268 { 269 struct rtable *rt; 270 int err = 1; 271 272 rt = ip_route_output_flow(dev_net(vrf_dev), fl4, NULL); 273 if (IS_ERR(rt)) 274 goto out; 275 276 /* TO-DO: what about broadcast ? */ 277 if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { 278 ip_rt_put(rt); 279 goto out; 280 } 281 282 skb_dst_drop(skb); 283 skb_dst_set(skb, &rt->dst); 284 err = 0; 285 out: 286 return err; 287 } 288 289 static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb, 290 struct net_device *vrf_dev) 291 { 292 struct iphdr *ip4h = ip_hdr(skb); 293 int ret = NET_XMIT_DROP; 294 struct flowi4 fl4 = { 295 /* needed to match OIF rule */ 296 .flowi4_oif = vrf_dev->ifindex, 297 .flowi4_iif = LOOPBACK_IFINDEX, 298 .flowi4_tos = RT_TOS(ip4h->tos), 299 .flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_L3MDEV_SRC | 300 FLOWI_FLAG_SKIP_NH_OIF, 301 .daddr = ip4h->daddr, 302 }; 303 304 if (vrf_send_v4_prep(skb, &fl4, vrf_dev)) 305 goto err; 306 307 if (!ip4h->saddr) { 308 ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0, 309 RT_SCOPE_LINK); 310 } 311 312 ret = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); 313 if (unlikely(net_xmit_eval(ret))) 314 vrf_dev->stats.tx_errors++; 315 else 316 ret = NET_XMIT_SUCCESS; 317 318 out: 319 return ret; 320 err: 321 vrf_tx_error(vrf_dev, skb); 322 goto out; 323 } 324 325 static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev) 326 { 327 /* strip the ethernet header added for pass through VRF device */ 328 __skb_pull(skb, skb_network_offset(skb)); 329 330 switch (skb->protocol) { 331 case htons(ETH_P_IP): 332 return vrf_process_v4_outbound(skb, dev); 333 case htons(ETH_P_IPV6): 334 return vrf_process_v6_outbound(skb, dev); 335 default: 336 vrf_tx_error(dev, skb); 337 return NET_XMIT_DROP; 338 } 339 } 340 341 static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) 342 { 343 netdev_tx_t ret = is_ip_tx_frame(skb, dev); 344 345 if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { 346 struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); 347 348 u64_stats_update_begin(&dstats->syncp); 349 dstats->tx_pkts++; 350 dstats->tx_bytes += skb->len; 351 u64_stats_update_end(&dstats->syncp); 352 } else { 353 this_cpu_inc(dev->dstats->tx_drps); 354 } 355 356 return ret; 357 } 358 359 #if IS_ENABLED(CONFIG_IPV6) 360 static struct dst_entry *vrf_ip6_check(struct dst_entry *dst, u32 cookie) 361 { 362 return dst; 363 } 364 365 static struct dst_ops vrf_dst_ops6 = { 366 .family = AF_INET6, 367 .local_out = ip6_local_out, 368 .check = vrf_ip6_check, 369 .mtu = vrf_v4_mtu, 370 .destroy = vrf_dst_destroy, 371 .default_advmss = vrf_default_advmss, 372 }; 373 374 static int init_dst_ops6_kmem_cachep(void) 375 { 376 vrf_dst_ops6.kmem_cachep = kmem_cache_create("vrf_ip6_dst_cache", 377 sizeof(struct rt6_info), 378 0, 379 SLAB_HWCACHE_ALIGN, 380 NULL); 381 382 if (!vrf_dst_ops6.kmem_cachep) 383 return -ENOMEM; 384 385 return 0; 386 } 387 388 static void free_dst_ops6_kmem_cachep(void) 389 { 390 kmem_cache_destroy(vrf_dst_ops6.kmem_cachep); 391 } 392 393 static int vrf_input6(struct sk_buff *skb) 394 { 395 skb->dev->stats.rx_errors++; 396 kfree_skb(skb); 397 return 0; 398 } 399 400 /* modelled after ip6_finish_output2 */ 401 static int vrf_finish_output6(struct net *net, struct sock *sk, 402 struct sk_buff *skb) 403 { 404 struct dst_entry *dst = skb_dst(skb); 405 struct net_device *dev = dst->dev; 406 struct neighbour *neigh; 407 struct in6_addr *nexthop; 408 int ret; 409 410 skb->protocol = htons(ETH_P_IPV6); 411 skb->dev = dev; 412 413 rcu_read_lock_bh(); 414 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 415 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 416 if (unlikely(!neigh)) 417 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 418 if (!IS_ERR(neigh)) { 419 ret = dst_neigh_output(dst, neigh, skb); 420 rcu_read_unlock_bh(); 421 return ret; 422 } 423 rcu_read_unlock_bh(); 424 425 IP6_INC_STATS(dev_net(dst->dev), 426 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 427 kfree_skb(skb); 428 return -EINVAL; 429 } 430 431 /* modelled after ip6_output */ 432 static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb) 433 { 434 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 435 net, sk, skb, NULL, skb_dst(skb)->dev, 436 vrf_finish_output6, 437 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 438 } 439 440 static void vrf_rt6_destroy(struct net_vrf *vrf) 441 { 442 dst_destroy(&vrf->rt6->dst); 443 free_percpu(vrf->rt6->rt6i_pcpu); 444 vrf->rt6 = NULL; 445 } 446 447 static int vrf_rt6_create(struct net_device *dev) 448 { 449 struct net_vrf *vrf = netdev_priv(dev); 450 struct dst_entry *dst; 451 struct rt6_info *rt6; 452 int cpu; 453 int rc = -ENOMEM; 454 455 rt6 = dst_alloc(&vrf_dst_ops6, dev, 0, 456 DST_OBSOLETE_NONE, 457 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 458 if (!rt6) 459 goto out; 460 461 dst = &rt6->dst; 462 463 rt6->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_KERNEL); 464 if (!rt6->rt6i_pcpu) { 465 dst_destroy(dst); 466 goto out; 467 } 468 for_each_possible_cpu(cpu) { 469 struct rt6_info **p = per_cpu_ptr(rt6->rt6i_pcpu, cpu); 470 *p = NULL; 471 } 472 473 memset(dst + 1, 0, sizeof(*rt6) - sizeof(*dst)); 474 475 INIT_LIST_HEAD(&rt6->rt6i_siblings); 476 INIT_LIST_HEAD(&rt6->rt6i_uncached); 477 478 rt6->dst.input = vrf_input6; 479 rt6->dst.output = vrf_output6; 480 481 rt6->rt6i_table = fib6_get_table(dev_net(dev), vrf->tb_id); 482 483 atomic_set(&rt6->dst.__refcnt, 2); 484 485 vrf->rt6 = rt6; 486 rc = 0; 487 out: 488 return rc; 489 } 490 #else 491 static int init_dst_ops6_kmem_cachep(void) 492 { 493 return 0; 494 } 495 496 static void free_dst_ops6_kmem_cachep(void) 497 { 498 } 499 500 static void vrf_rt6_destroy(struct net_vrf *vrf) 501 { 502 } 503 504 static int vrf_rt6_create(struct net_device *dev) 505 { 506 return 0; 507 } 508 #endif 509 510 /* modelled after ip_finish_output2 */ 511 static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 512 { 513 struct dst_entry *dst = skb_dst(skb); 514 struct rtable *rt = (struct rtable *)dst; 515 struct net_device *dev = dst->dev; 516 unsigned int hh_len = LL_RESERVED_SPACE(dev); 517 struct neighbour *neigh; 518 u32 nexthop; 519 int ret = -EINVAL; 520 521 /* Be paranoid, rather than too clever. */ 522 if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { 523 struct sk_buff *skb2; 524 525 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); 526 if (!skb2) { 527 ret = -ENOMEM; 528 goto err; 529 } 530 if (skb->sk) 531 skb_set_owner_w(skb2, skb->sk); 532 533 consume_skb(skb); 534 skb = skb2; 535 } 536 537 rcu_read_lock_bh(); 538 539 nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); 540 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 541 if (unlikely(!neigh)) 542 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 543 if (!IS_ERR(neigh)) 544 ret = dst_neigh_output(dst, neigh, skb); 545 546 rcu_read_unlock_bh(); 547 err: 548 if (unlikely(ret < 0)) 549 vrf_tx_error(skb->dev, skb); 550 return ret; 551 } 552 553 static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb) 554 { 555 struct net_device *dev = skb_dst(skb)->dev; 556 557 IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); 558 559 skb->dev = dev; 560 skb->protocol = htons(ETH_P_IP); 561 562 return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, 563 net, sk, skb, NULL, dev, 564 vrf_finish_output, 565 !(IPCB(skb)->flags & IPSKB_REROUTED)); 566 } 567 568 static void vrf_rtable_destroy(struct net_vrf *vrf) 569 { 570 struct dst_entry *dst = (struct dst_entry *)vrf->rth; 571 572 dst_destroy(dst); 573 vrf->rth = NULL; 574 } 575 576 static struct rtable *vrf_rtable_create(struct net_device *dev) 577 { 578 struct net_vrf *vrf = netdev_priv(dev); 579 struct rtable *rth; 580 581 rth = dst_alloc(&vrf_dst_ops, dev, 2, 582 DST_OBSOLETE_NONE, 583 (DST_HOST | DST_NOPOLICY | DST_NOXFRM)); 584 if (rth) { 585 rth->dst.output = vrf_output; 586 rth->rt_genid = rt_genid_ipv4(dev_net(dev)); 587 rth->rt_flags = 0; 588 rth->rt_type = RTN_UNICAST; 589 rth->rt_is_input = 0; 590 rth->rt_iif = 0; 591 rth->rt_pmtu = 0; 592 rth->rt_gateway = 0; 593 rth->rt_uses_gateway = 0; 594 rth->rt_table_id = vrf->tb_id; 595 INIT_LIST_HEAD(&rth->rt_uncached); 596 rth->rt_uncached_list = NULL; 597 } 598 599 return rth; 600 } 601 602 /**************************** device handling ********************/ 603 604 /* cycle interface to flush neighbor cache and move routes across tables */ 605 static void cycle_netdev(struct net_device *dev) 606 { 607 unsigned int flags = dev->flags; 608 int ret; 609 610 if (!netif_running(dev)) 611 return; 612 613 ret = dev_change_flags(dev, flags & ~IFF_UP); 614 if (ret >= 0) 615 ret = dev_change_flags(dev, flags); 616 617 if (ret < 0) { 618 netdev_err(dev, 619 "Failed to cycle device %s; route tables might be wrong!\n", 620 dev->name); 621 } 622 } 623 624 static struct slave *__vrf_find_slave_dev(struct slave_queue *queue, 625 struct net_device *dev) 626 { 627 struct list_head *head = &queue->all_slaves; 628 struct slave *slave; 629 630 list_for_each_entry(slave, head, list) { 631 if (slave->dev == dev) 632 return slave; 633 } 634 635 return NULL; 636 } 637 638 /* inverse of __vrf_insert_slave */ 639 static void __vrf_remove_slave(struct slave_queue *queue, struct slave *slave) 640 { 641 list_del(&slave->list); 642 } 643 644 static void __vrf_insert_slave(struct slave_queue *queue, struct slave *slave) 645 { 646 list_add(&slave->list, &queue->all_slaves); 647 } 648 649 static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 650 { 651 struct slave *slave = kzalloc(sizeof(*slave), GFP_KERNEL); 652 struct net_vrf *vrf = netdev_priv(dev); 653 struct slave_queue *queue = &vrf->queue; 654 int ret = -ENOMEM; 655 656 if (!slave) 657 goto out_fail; 658 659 slave->dev = port_dev; 660 661 /* register the packet handler for slave ports */ 662 ret = netdev_rx_handler_register(port_dev, vrf_handle_frame, dev); 663 if (ret) { 664 netdev_err(port_dev, 665 "Device %s failed to register rx_handler\n", 666 port_dev->name); 667 goto out_fail; 668 } 669 670 ret = netdev_master_upper_dev_link(port_dev, dev); 671 if (ret < 0) 672 goto out_unregister; 673 674 port_dev->priv_flags |= IFF_L3MDEV_SLAVE; 675 __vrf_insert_slave(queue, slave); 676 cycle_netdev(port_dev); 677 678 return 0; 679 680 out_unregister: 681 netdev_rx_handler_unregister(port_dev); 682 out_fail: 683 kfree(slave); 684 return ret; 685 } 686 687 static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev) 688 { 689 if (netif_is_l3_master(port_dev) || netif_is_l3_slave(port_dev)) 690 return -EINVAL; 691 692 return do_vrf_add_slave(dev, port_dev); 693 } 694 695 /* inverse of do_vrf_add_slave */ 696 static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 697 { 698 struct net_vrf *vrf = netdev_priv(dev); 699 struct slave_queue *queue = &vrf->queue; 700 struct slave *slave; 701 702 netdev_upper_dev_unlink(port_dev, dev); 703 port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE; 704 705 netdev_rx_handler_unregister(port_dev); 706 707 cycle_netdev(port_dev); 708 709 slave = __vrf_find_slave_dev(queue, port_dev); 710 if (slave) 711 __vrf_remove_slave(queue, slave); 712 713 kfree(slave); 714 715 return 0; 716 } 717 718 static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev) 719 { 720 return do_vrf_del_slave(dev, port_dev); 721 } 722 723 static void vrf_dev_uninit(struct net_device *dev) 724 { 725 struct net_vrf *vrf = netdev_priv(dev); 726 struct slave_queue *queue = &vrf->queue; 727 struct list_head *head = &queue->all_slaves; 728 struct slave *slave, *next; 729 730 vrf_rtable_destroy(vrf); 731 vrf_rt6_destroy(vrf); 732 733 list_for_each_entry_safe(slave, next, head, list) 734 vrf_del_slave(dev, slave->dev); 735 736 free_percpu(dev->dstats); 737 dev->dstats = NULL; 738 } 739 740 static int vrf_dev_init(struct net_device *dev) 741 { 742 struct net_vrf *vrf = netdev_priv(dev); 743 744 INIT_LIST_HEAD(&vrf->queue.all_slaves); 745 746 dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats); 747 if (!dev->dstats) 748 goto out_nomem; 749 750 /* create the default dst which points back to us */ 751 vrf->rth = vrf_rtable_create(dev); 752 if (!vrf->rth) 753 goto out_stats; 754 755 if (vrf_rt6_create(dev) != 0) 756 goto out_rth; 757 758 dev->flags = IFF_MASTER | IFF_NOARP; 759 760 return 0; 761 762 out_rth: 763 vrf_rtable_destroy(vrf); 764 out_stats: 765 free_percpu(dev->dstats); 766 dev->dstats = NULL; 767 out_nomem: 768 return -ENOMEM; 769 } 770 771 static const struct net_device_ops vrf_netdev_ops = { 772 .ndo_init = vrf_dev_init, 773 .ndo_uninit = vrf_dev_uninit, 774 .ndo_start_xmit = vrf_xmit, 775 .ndo_get_stats64 = vrf_get_stats64, 776 .ndo_add_slave = vrf_add_slave, 777 .ndo_del_slave = vrf_del_slave, 778 }; 779 780 static u32 vrf_fib_table(const struct net_device *dev) 781 { 782 struct net_vrf *vrf = netdev_priv(dev); 783 784 return vrf->tb_id; 785 } 786 787 static struct rtable *vrf_get_rtable(const struct net_device *dev, 788 const struct flowi4 *fl4) 789 { 790 struct rtable *rth = NULL; 791 792 if (!(fl4->flowi4_flags & FLOWI_FLAG_L3MDEV_SRC)) { 793 struct net_vrf *vrf = netdev_priv(dev); 794 795 rth = vrf->rth; 796 atomic_inc(&rth->dst.__refcnt); 797 } 798 799 return rth; 800 } 801 802 /* called under rcu_read_lock */ 803 static void vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4) 804 { 805 struct fib_result res = { .tclassid = 0 }; 806 struct net *net = dev_net(dev); 807 u32 orig_tos = fl4->flowi4_tos; 808 u8 flags = fl4->flowi4_flags; 809 u8 scope = fl4->flowi4_scope; 810 u8 tos = RT_FL_TOS(fl4); 811 812 if (unlikely(!fl4->daddr)) 813 return; 814 815 fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF; 816 fl4->flowi4_iif = LOOPBACK_IFINDEX; 817 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 818 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 819 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 820 821 if (!fib_lookup(net, fl4, &res, 0)) { 822 if (res.type == RTN_LOCAL) 823 fl4->saddr = res.fi->fib_prefsrc ? : fl4->daddr; 824 else 825 fib_select_path(net, &res, fl4, -1); 826 } 827 828 fl4->flowi4_flags = flags; 829 fl4->flowi4_tos = orig_tos; 830 fl4->flowi4_scope = scope; 831 } 832 833 #if IS_ENABLED(CONFIG_IPV6) 834 static struct dst_entry *vrf_get_rt6_dst(const struct net_device *dev, 835 const struct flowi6 *fl6) 836 { 837 struct rt6_info *rt = NULL; 838 839 if (!(fl6->flowi6_flags & FLOWI_FLAG_L3MDEV_SRC)) { 840 struct net_vrf *vrf = netdev_priv(dev); 841 842 rt = vrf->rt6; 843 atomic_inc(&rt->dst.__refcnt); 844 } 845 846 return (struct dst_entry *)rt; 847 } 848 #endif 849 850 static const struct l3mdev_ops vrf_l3mdev_ops = { 851 .l3mdev_fib_table = vrf_fib_table, 852 .l3mdev_get_rtable = vrf_get_rtable, 853 .l3mdev_get_saddr = vrf_get_saddr, 854 #if IS_ENABLED(CONFIG_IPV6) 855 .l3mdev_get_rt6_dst = vrf_get_rt6_dst, 856 #endif 857 }; 858 859 static void vrf_get_drvinfo(struct net_device *dev, 860 struct ethtool_drvinfo *info) 861 { 862 strlcpy(info->driver, DRV_NAME, sizeof(info->driver)); 863 strlcpy(info->version, DRV_VERSION, sizeof(info->version)); 864 } 865 866 static const struct ethtool_ops vrf_ethtool_ops = { 867 .get_drvinfo = vrf_get_drvinfo, 868 }; 869 870 static void vrf_setup(struct net_device *dev) 871 { 872 ether_setup(dev); 873 874 /* Initialize the device structure. */ 875 dev->netdev_ops = &vrf_netdev_ops; 876 dev->l3mdev_ops = &vrf_l3mdev_ops; 877 dev->ethtool_ops = &vrf_ethtool_ops; 878 dev->destructor = free_netdev; 879 880 /* Fill in device structure with ethernet-generic values. */ 881 eth_hw_addr_random(dev); 882 883 /* don't acquire vrf device's netif_tx_lock when transmitting */ 884 dev->features |= NETIF_F_LLTX; 885 886 /* don't allow vrf devices to change network namespaces. */ 887 dev->features |= NETIF_F_NETNS_LOCAL; 888 } 889 890 static int vrf_validate(struct nlattr *tb[], struct nlattr *data[]) 891 { 892 if (tb[IFLA_ADDRESS]) { 893 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) 894 return -EINVAL; 895 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) 896 return -EADDRNOTAVAIL; 897 } 898 return 0; 899 } 900 901 static void vrf_dellink(struct net_device *dev, struct list_head *head) 902 { 903 unregister_netdevice_queue(dev, head); 904 } 905 906 static int vrf_newlink(struct net *src_net, struct net_device *dev, 907 struct nlattr *tb[], struct nlattr *data[]) 908 { 909 struct net_vrf *vrf = netdev_priv(dev); 910 911 if (!data || !data[IFLA_VRF_TABLE]) 912 return -EINVAL; 913 914 vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]); 915 916 dev->priv_flags |= IFF_L3MDEV_MASTER; 917 918 return register_netdevice(dev); 919 } 920 921 static size_t vrf_nl_getsize(const struct net_device *dev) 922 { 923 return nla_total_size(sizeof(u32)); /* IFLA_VRF_TABLE */ 924 } 925 926 static int vrf_fillinfo(struct sk_buff *skb, 927 const struct net_device *dev) 928 { 929 struct net_vrf *vrf = netdev_priv(dev); 930 931 return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id); 932 } 933 934 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = { 935 [IFLA_VRF_TABLE] = { .type = NLA_U32 }, 936 }; 937 938 static struct rtnl_link_ops vrf_link_ops __read_mostly = { 939 .kind = DRV_NAME, 940 .priv_size = sizeof(struct net_vrf), 941 942 .get_size = vrf_nl_getsize, 943 .policy = vrf_nl_policy, 944 .validate = vrf_validate, 945 .fill_info = vrf_fillinfo, 946 947 .newlink = vrf_newlink, 948 .dellink = vrf_dellink, 949 .setup = vrf_setup, 950 .maxtype = IFLA_VRF_MAX, 951 }; 952 953 static int vrf_device_event(struct notifier_block *unused, 954 unsigned long event, void *ptr) 955 { 956 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 957 958 /* only care about unregister events to drop slave references */ 959 if (event == NETDEV_UNREGISTER) { 960 struct net_device *vrf_dev; 961 962 if (!netif_is_l3_slave(dev)) 963 goto out; 964 965 vrf_dev = netdev_master_upper_dev_get(dev); 966 vrf_del_slave(vrf_dev, dev); 967 } 968 out: 969 return NOTIFY_DONE; 970 } 971 972 static struct notifier_block vrf_notifier_block __read_mostly = { 973 .notifier_call = vrf_device_event, 974 }; 975 976 static int __init vrf_init_module(void) 977 { 978 int rc; 979 980 vrf_dst_ops.kmem_cachep = 981 kmem_cache_create("vrf_ip_dst_cache", 982 sizeof(struct rtable), 0, 983 SLAB_HWCACHE_ALIGN, 984 NULL); 985 986 if (!vrf_dst_ops.kmem_cachep) 987 return -ENOMEM; 988 989 rc = init_dst_ops6_kmem_cachep(); 990 if (rc != 0) 991 goto error2; 992 993 register_netdevice_notifier(&vrf_notifier_block); 994 995 rc = rtnl_link_register(&vrf_link_ops); 996 if (rc < 0) 997 goto error; 998 999 return 0; 1000 1001 error: 1002 unregister_netdevice_notifier(&vrf_notifier_block); 1003 free_dst_ops6_kmem_cachep(); 1004 error2: 1005 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 1006 return rc; 1007 } 1008 1009 static void __exit vrf_cleanup_module(void) 1010 { 1011 rtnl_link_unregister(&vrf_link_ops); 1012 unregister_netdevice_notifier(&vrf_notifier_block); 1013 kmem_cache_destroy(vrf_dst_ops.kmem_cachep); 1014 free_dst_ops6_kmem_cachep(); 1015 } 1016 1017 module_init(vrf_init_module); 1018 module_exit(vrf_cleanup_module); 1019 MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern"); 1020 MODULE_DESCRIPTION("Device driver to instantiate VRF domains"); 1021 MODULE_LICENSE("GPL"); 1022 MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1023 MODULE_VERSION(DRV_VERSION); 1024