/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 */

#include "ipvlan.h"

static u32 ipvlan_jhash_secret __read_mostly;

void ipvlan_init_secret(void)
{
	net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret));
}

static void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
			    unsigned int len, bool success, bool mcast)
{
	if (!ipvlan)
		return;

	if (likely(success)) {
		struct ipvl_pcpu_stats *pcptr;

		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);
		u64_stats_update_begin(&pcptr->syncp);
		pcptr->rx_pkts++;
		pcptr->rx_bytes += len;
		if (mcast)
			pcptr->rx_mcast++;
		u64_stats_update_end(&pcptr->syncp);
	} else {
		this_cpu_inc(ipvlan->pcpu_stats->rx_errs);
	}
}

static u8 ipvlan_get_v6_hash(const void *iaddr)
{
	const struct in6_addr *ip6_addr = iaddr;

	return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) &
	       IPVLAN_HASH_MASK;
}

static u8 ipvlan_get_v4_hash(const void *iaddr)
{
	const struct in_addr *ip4_addr = iaddr;

	return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) &
	       IPVLAN_HASH_MASK;
}

struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
					const void *iaddr, bool is_v6)
{
	struct ipvl_addr *addr;
	u8 hash;

	hash = is_v6 ? ipvlan_get_v6_hash(iaddr) :
	       ipvlan_get_v4_hash(iaddr);
	hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) {
		if (is_v6 && addr->atype == IPVL_IPV6 &&
		    ipv6_addr_equal(&addr->ip6addr, iaddr))
			return addr;
		else if (!is_v6 && addr->atype == IPVL_IPV4 &&
			 addr->ip4addr.s_addr ==
				((struct in_addr *)iaddr)->s_addr)
			return addr;
	}
	return NULL;
}

void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr)
{
	struct ipvl_port *port = ipvlan->port;
	u8 hash;

	hash = (addr->atype == IPVL_IPV6) ?
	       ipvlan_get_v6_hash(&addr->ip6addr) :
	       ipvlan_get_v4_hash(&addr->ip4addr);
	if (hlist_unhashed(&addr->hlnode))
		hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]);
}
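
/* Hash-table discipline: additions and deletions run from the address
 * management (control) paths, which are expected to hold RTNL, while
 * ipvlan_ht_addr_lookup() above runs under RCU on the packet path.
 * Deletion with sync == true waits for a grace period, so a concurrent
 * lookup can never walk into a freed ipvl_addr.
 */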
void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync)
{
	hlist_del_init_rcu(&addr->hlnode);
	if (sync)
		synchronize_rcu();
}

struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
				   const void *iaddr, bool is_v6)
{
	struct ipvl_addr *addr;

	list_for_each_entry(addr, &ipvlan->addrs, anode) {
		if ((is_v6 && addr->atype == IPVL_IPV6 &&
		     ipv6_addr_equal(&addr->ip6addr, iaddr)) ||
		    (!is_v6 && addr->atype == IPVL_IPV4 &&
		     addr->ip4addr.s_addr == ((struct in_addr *)iaddr)->s_addr))
			return addr;
	}
	return NULL;
}

bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6)
{
	struct ipvl_dev *ipvlan;

	ASSERT_RTNL();

	list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
		if (ipvlan_find_addr(ipvlan, iaddr, is_v6))
			return true;
	}
	return false;
}

static void *ipvlan_get_L3_hdr(struct sk_buff *skb, int *type)
{
	void *lyr3h = NULL;

	switch (skb->protocol) {
	case htons(ETH_P_ARP): {
		struct arphdr *arph;

		if (unlikely(!pskb_may_pull(skb, sizeof(*arph))))
			return NULL;

		arph = arp_hdr(skb);
		*type = IPVL_ARP;
		lyr3h = arph;
		break;
	}
	case htons(ETH_P_IP): {
		u32 pktlen;
		struct iphdr *ip4h;

		if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h))))
			return NULL;

		ip4h = ip_hdr(skb);
		pktlen = ntohs(ip4h->tot_len);
		if (ip4h->ihl < 5 || ip4h->version != 4)
			return NULL;
		if (skb->len < pktlen || pktlen < (ip4h->ihl * 4))
			return NULL;

		*type = IPVL_IPV4;
		lyr3h = ip4h;
		break;
	}
	case htons(ETH_P_IPV6): {
		struct ipv6hdr *ip6h;

		if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h))))
			return NULL;

		ip6h = ipv6_hdr(skb);
		if (ip6h->version != 6)
			return NULL;

		*type = IPVL_IPV6;
		lyr3h = ip6h;
		/* Only Neighbour Solicitation pkts need different treatment */
		if (ipv6_addr_any(&ip6h->saddr) &&
		    ip6h->nexthdr == NEXTHDR_ICMP) {
			*type = IPVL_ICMPV6;
			lyr3h = ip6h + 1;
		}
		break;
	}
	default:
		return NULL;
	}

	return lyr3h;
}

unsigned int ipvlan_mac_hash(const unsigned char *addr)
{
	u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2),
			       ipvlan_jhash_secret);

	return hash & IPVLAN_MAC_FILTER_MASK;
}
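
/* Multicast Rx is deferred to this work-queue handler: frames are queued
 * on port->backlog by ipvlan_multicast_enqueue() and drained here. Each
 * frame is cloned once per slave whose mac_filters bit matches
 * ipvlan_mac_hash(h_dest). Clones of frames that originated on a slave
 * (hlocal) are delivered with dev_forward_skb(), external frames go up
 * via netif_rx(), and if a slave-originated frame also needs to leave the
 * box (dlocal), the original is finally transmitted out of the master.
 */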
void ipvlan_process_multicast(struct work_struct *work)
{
	struct ipvl_port *port = container_of(work, struct ipvl_port, wq);
	struct ethhdr *ethh;
	struct ipvl_dev *ipvlan;
	struct sk_buff *skb, *nskb;
	struct sk_buff_head list;
	unsigned int len;
	unsigned int mac_hash;
	int ret;
	u8 pkt_type;
	bool hlocal, dlocal;

	__skb_queue_head_init(&list);

	spin_lock_bh(&port->backlog.lock);
	skb_queue_splice_tail_init(&port->backlog, &list);
	spin_unlock_bh(&port->backlog.lock);

	while ((skb = __skb_dequeue(&list)) != NULL) {
		ethh = eth_hdr(skb);
		hlocal = ether_addr_equal(ethh->h_source, port->dev->dev_addr);
		mac_hash = ipvlan_mac_hash(ethh->h_dest);

		if (ether_addr_equal(ethh->h_dest, port->dev->broadcast))
			pkt_type = PACKET_BROADCAST;
		else
			pkt_type = PACKET_MULTICAST;

		dlocal = false;
		rcu_read_lock();
		list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
			if (hlocal && (ipvlan->dev == skb->dev)) {
				dlocal = true;
				continue;
			}
			if (!test_bit(mac_hash, ipvlan->mac_filters))
				continue;

			ret = NET_RX_DROP;
			len = skb->len + ETH_HLEN;
			nskb = skb_clone(skb, GFP_ATOMIC);
			if (!nskb)
				goto acct;

			nskb->pkt_type = pkt_type;
			nskb->dev = ipvlan->dev;
			if (hlocal)
				ret = dev_forward_skb(ipvlan->dev, nskb);
			else
				ret = netif_rx(nskb);
acct:
			ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
		}
		rcu_read_unlock();

		if (dlocal) {
			/* If the packet originated here, send it out. */
			skb->dev = port->dev;
			skb->pkt_type = pkt_type;
			dev_queue_xmit(skb);
		} else {
			kfree_skb(skb);
		}
	}
}

static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb,
			    bool local)
{
	struct ipvl_dev *ipvlan = addr->master;
	struct net_device *dev = ipvlan->dev;
	unsigned int len;
	rx_handler_result_t ret = RX_HANDLER_CONSUMED;
	bool success = false;

	len = skb->len + ETH_HLEN;
	if (unlikely(!(dev->flags & IFF_UP))) {
		kfree_skb(skb);
		goto out;
	}

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb)
		goto out;

	skb->dev = dev;
	skb->pkt_type = PACKET_HOST;

	if (local) {
		if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
			success = true;
	} else {
		ret = RX_HANDLER_ANOTHER;
		success = true;
	}

out:
	ipvlan_count_rx(ipvlan, len, success, false);
	return ret;
}
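
/* Map the L3 header extracted by ipvlan_get_L3_hdr() to a configured
 * ipvl_addr. For ARP the addresses live in the variable-length payload
 * right after struct arphdr: sender-hw (addr_len bytes), sender-IP (4),
 * target-hw (addr_len), target-IP (4). Hence the offsets below:
 * addr_len to reach the sender IP, and 2 * addr_len + 4 to reach the
 * target IP.
 */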
static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port,
					    void *lyr3h, int addr_type,
					    bool use_dest)
{
	struct ipvl_addr *addr = NULL;

	if (addr_type == IPVL_IPV6) {
		struct ipv6hdr *ip6h;
		struct in6_addr *i6addr;

		ip6h = (struct ipv6hdr *)lyr3h;
		i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr;
		addr = ipvlan_ht_addr_lookup(port, i6addr, true);
	} else if (addr_type == IPVL_ICMPV6) {
		struct nd_msg *ndmh;
		struct in6_addr *i6addr;

		/* Make sure that the Neighbor Solicitation ICMPv6 packets
		 * are handled to avoid DAD issues.
		 */
		ndmh = (struct nd_msg *)lyr3h;
		if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) {
			i6addr = &ndmh->target;
			addr = ipvlan_ht_addr_lookup(port, i6addr, true);
		}
	} else if (addr_type == IPVL_IPV4) {
		struct iphdr *ip4h;
		__be32 *i4addr;

		ip4h = (struct iphdr *)lyr3h;
		i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr;
		addr = ipvlan_ht_addr_lookup(port, i4addr, false);
	} else if (addr_type == IPVL_ARP) {
		struct arphdr *arph;
		unsigned char *arp_ptr;
		__be32 dip;

		arph = (struct arphdr *)lyr3h;
		arp_ptr = (unsigned char *)(arph + 1);
		if (use_dest)
			arp_ptr += (2 * port->dev->addr_len) + 4;
		else
			arp_ptr += port->dev->addr_len;

		memcpy(&dip, arp_ptr, 4);
		addr = ipvlan_ht_addr_lookup(port, &dip, false);
	}

	return addr;
}

static int ipvlan_process_v4_outbound(struct sk_buff *skb)
{
	const struct iphdr *ip4h = ip_hdr(skb);
	struct net_device *dev = skb->dev;
	struct rtable *rt;
	int err, ret = NET_XMIT_DROP;
	struct flowi4 fl4 = {
		.flowi4_oif = dev_get_iflink(dev),
		.flowi4_tos = RT_TOS(ip4h->tos),
		.flowi4_flags = FLOWI_FLAG_ANYSRC,
		.daddr = ip4h->daddr,
		.saddr = ip4h->saddr,
	};

	rt = ip_route_output_flow(dev_net(dev), &fl4, NULL);
	if (IS_ERR(rt))
		goto err;

	if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
		ip_rt_put(rt);
		goto err;
	}
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);
	err = ip_local_out(skb);
	if (unlikely(net_xmit_eval(err)))
		dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;
	goto out;
err:
	dev->stats.tx_errors++;
	kfree_skb(skb);
out:
	return ret;
}

static int ipvlan_process_v6_outbound(struct sk_buff *skb)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct net_device *dev = skb->dev;
	struct dst_entry *dst;
	int err, ret = NET_XMIT_DROP;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = ip6h->daddr,
		.saddr = ip6h->saddr,
		.flowi6_flags = FLOWI_FLAG_ANYSRC,
		.flowlabel = ip6_flowinfo(ip6h),
		.flowi6_mark = skb->mark,
		.flowi6_proto = ip6h->nexthdr,
	};

	dst = ip6_route_output(dev_net(dev), NULL, &fl6);
	if (dst->error) {
		ret = dst->error;
		dst_release(dst);
		goto err;
	}
	skb_dst_drop(skb);
	skb_dst_set(skb, dst);
	err = ip6_local_out(skb);
	if (unlikely(net_xmit_eval(err)))
		dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;
	goto out;
err:
	dev->stats.tx_errors++;
	kfree_skb(skb);
out:
	return ret;
}
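
/* L3-mode Tx: by the time the two helpers above run, skb->dev has been
 * switched to the master device, so the route lookup happens in the
 * master's name-space (flowi4_oif is the master's iflink) and the packet
 * is handed to ip_local_out()/ip6_local_out(). The routing decision that
 * matters is therefore the master's, never the slave's.
 */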
436 */ 437 if (skb_mac_header_was_set(skb)) { 438 skb_pull(skb, sizeof(*ethh)); 439 skb->mac_header = (typeof(skb->mac_header))~0U; 440 skb_reset_network_header(skb); 441 } 442 443 if (skb->protocol == htons(ETH_P_IPV6)) 444 ret = ipvlan_process_v6_outbound(skb); 445 else if (skb->protocol == htons(ETH_P_IP)) 446 ret = ipvlan_process_v4_outbound(skb); 447 else { 448 pr_warn_ratelimited("Dropped outbound packet type=%x\n", 449 ntohs(skb->protocol)); 450 kfree_skb(skb); 451 } 452 out: 453 return ret; 454 } 455 456 static void ipvlan_multicast_enqueue(struct ipvl_port *port, 457 struct sk_buff *skb) 458 { 459 if (skb->protocol == htons(ETH_P_PAUSE)) { 460 kfree_skb(skb); 461 return; 462 } 463 464 spin_lock(&port->backlog.lock); 465 if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) { 466 __skb_queue_tail(&port->backlog, skb); 467 spin_unlock(&port->backlog.lock); 468 schedule_work(&port->wq); 469 } else { 470 spin_unlock(&port->backlog.lock); 471 atomic_long_inc(&skb->dev->rx_dropped); 472 kfree_skb(skb); 473 } 474 } 475 476 static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) 477 { 478 const struct ipvl_dev *ipvlan = netdev_priv(dev); 479 void *lyr3h; 480 struct ipvl_addr *addr; 481 int addr_type; 482 483 lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); 484 if (!lyr3h) 485 goto out; 486 487 addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); 488 if (addr) 489 return ipvlan_rcv_frame(addr, skb, true); 490 491 out: 492 skb->dev = ipvlan->phy_dev; 493 return ipvlan_process_outbound(skb, ipvlan); 494 } 495 496 static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) 497 { 498 const struct ipvl_dev *ipvlan = netdev_priv(dev); 499 struct ethhdr *eth = eth_hdr(skb); 500 struct ipvl_addr *addr; 501 void *lyr3h; 502 int addr_type; 503 504 if (ether_addr_equal(eth->h_dest, eth->h_source)) { 505 lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); 506 if (lyr3h) { 507 addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); 508 if (addr) 509 return ipvlan_rcv_frame(addr, skb, true); 510 } 511 skb = skb_share_check(skb, GFP_ATOMIC); 512 if (!skb) 513 return NET_XMIT_DROP; 514 515 /* Packet definitely does not belong to any of the 516 * virtual devices, but the dest is local. So forward 517 * the skb for the main-dev. At the RX side we just return 518 * RX_PASS for it to be processed further on the stack. 
519 */ 520 return dev_forward_skb(ipvlan->phy_dev, skb); 521 522 } else if (is_multicast_ether_addr(eth->h_dest)) { 523 ipvlan_multicast_enqueue(ipvlan->port, skb); 524 return NET_XMIT_SUCCESS; 525 } 526 527 skb->dev = ipvlan->phy_dev; 528 return dev_queue_xmit(skb); 529 } 530 531 int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) 532 { 533 struct ipvl_dev *ipvlan = netdev_priv(dev); 534 struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev); 535 536 if (!port) 537 goto out; 538 539 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 540 goto out; 541 542 switch(port->mode) { 543 case IPVLAN_MODE_L2: 544 return ipvlan_xmit_mode_l2(skb, dev); 545 case IPVLAN_MODE_L3: 546 return ipvlan_xmit_mode_l3(skb, dev); 547 } 548 549 /* Should not reach here */ 550 WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n", 551 port->mode); 552 out: 553 kfree_skb(skb); 554 return NET_XMIT_DROP; 555 } 556 557 static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port) 558 { 559 struct ethhdr *eth = eth_hdr(skb); 560 struct ipvl_addr *addr; 561 void *lyr3h; 562 int addr_type; 563 564 if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) { 565 lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); 566 if (!lyr3h) 567 return true; 568 569 addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false); 570 if (addr) 571 return false; 572 } 573 574 return true; 575 } 576 577 static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, 578 struct ipvl_port *port) 579 { 580 void *lyr3h; 581 int addr_type; 582 struct ipvl_addr *addr; 583 struct sk_buff *skb = *pskb; 584 rx_handler_result_t ret = RX_HANDLER_PASS; 585 586 lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); 587 if (!lyr3h) 588 goto out; 589 590 addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); 591 if (addr) 592 ret = ipvlan_rcv_frame(addr, skb, false); 593 594 out: 595 return ret; 596 } 597 598 static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, 599 struct ipvl_port *port) 600 { 601 struct sk_buff *skb = *pskb; 602 struct ethhdr *eth = eth_hdr(skb); 603 rx_handler_result_t ret = RX_HANDLER_PASS; 604 void *lyr3h; 605 int addr_type; 606 607 if (is_multicast_ether_addr(eth->h_dest)) { 608 if (ipvlan_external_frame(skb, port)) { 609 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); 610 611 /* External frames are queued for device local 612 * distribution, but a copy is given to master 613 * straight away to avoid sending duplicates later 614 * when work-queue processes this frame. This is 615 * achieved by returning RX_HANDLER_PASS. 616 */ 617 if (nskb) 618 ipvlan_multicast_enqueue(port, nskb); 619 } 620 } else { 621 struct ipvl_addr *addr; 622 623 lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); 624 if (!lyr3h) 625 return ret; 626 627 addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); 628 if (addr) 629 ret = ipvlan_rcv_frame(addr, skb, false); 630 } 631 632 return ret; 633 } 634 635 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) 636 { 637 struct sk_buff *skb = *pskb; 638 struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev); 639 640 if (!port) 641 return RX_HANDLER_PASS; 642 643 switch (port->mode) { 644 case IPVLAN_MODE_L2: 645 return ipvlan_handle_mode_l2(pskb, port); 646 case IPVLAN_MODE_L3: 647 return ipvlan_handle_mode_l3(pskb, port); 648 } 649 650 /* Should not reach here */ 651 WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n", 652 port->mode); 653 kfree_skb(skb); 654 return NET_RX_DROP; 655 } 656