// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
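/* Illustration (editorial, not from the original source): with a tunnel
 * configured as "local 10.0.0.1 remote 10.0.0.2 key 42", a packet carrying
 * key 42 from 10.0.0.2 to 10.0.0.1 selects that tunnel; the same packet
 * without a key can select only a keyless tunnel, and if none is configured
 * it falls through to the per-netns fallback device looked up at the end of
 * ip_tunnel_lookup() below.
 */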
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}
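/* Unlike ip_tunnel_lookup(), which scores candidates for the receive fast
 * path, ip_tunnel_find() below matches the configured endpoints, link and
 * device type exactly; it is used on the control path (ioctl/netlink) to
 * detect an existing tunnel with the same parameters.
 */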
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= t_hlen;

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}
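/* ip_tunnel_rcv() finishes decapsulation for a packet already matched to
 * @tunnel: it validates the TUNNEL_CSUM and TUNNEL_SEQ expectations against
 * the parsed header in @tpi, decapsulates the ECN bits, updates stats,
 * scrubs the skb when crossing netns, attaches @tun_dst for collect_md mode
 * and hands the skb to GRO. It consumes the skb even on error (returns 0).
 */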
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
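/* Header accounting: t->hlen is the total tunnel overhead above the outer
 * IPv4 header, i.e. the tunnel protocol header (t->tun_hlen) plus any
 * secondary encapsulation (t->encap_hlen). Illustrative example, not from
 * the source: GRE with a 4-byte key has tun_hlen = 8, and wrapping it in
 * FOU adds a UDP header, so encap_hlen = 8 and hlen = 16; the pmtu and
 * headroom math below then adds sizeof(struct iphdr) on top.
 */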
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;

	if (df)
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
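/* ip_md_tunnel_xmit() is the transmit path for collect_md (external
 * metadata) tunnels: instead of per-device parameters, the destination,
 * key, tos and ttl come from the ip_tunnel_info attached to the skb, e.g.
 * by an lwtunnel route or an OVS/eBPF program that set tunnel metadata.
 */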
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
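/* ip_tunnel_xmit() is the classic transmit path driven by the per-device
 * parameters in @tnl_params. When no remote is configured (daddr == 0,
 * "NBMA mode"), the outer destination is derived per packet: from tunnel
 * metadata if present, from the inner IPv4 route's next hop, or from an
 * IPv4-compatible IPv6 destination.
 */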
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
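/* ip_tunnel_ctl() implements the legacy SIOCGETTUNNEL/SIOCADDTUNNEL/
 * SIOCCHGTUNNEL/SIOCDELTUNNEL interface. Illustrative userspace sketch
 * (hypothetical fd and device name, not part of this file):
 *
 *	struct ip_tunnel_parm p = { .iph = { .version = 4, .ihl = 5,
 *					     .protocol = IPPROTO_IPIP } };
 *	struct ifreq ifr = { .ifr_name = "tunl0",
 *			     .ifr_ifru.ifru_data = (void *)&p };
 *	ioctl(fd, SIOCGETTUNNEL, &ifr);	// fd: any AF_INET socket
 */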
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
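/* The two getters below back ndo_get_iflink and rtnl_link_ops->get_link_net:
 * userspace (e.g. "ip link") uses them to report the underlay device and
 * the netns the tunnel transmits in, which may differ from the netns the
 * tunnel device itself lives in.
 */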
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to move to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
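/* Typical caller (sketch, modelled on the ipip driver; the names belong to
 * that driver, not this file):
 *
 *	static int __net_init ipip_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipip_net_id,
 *					  &ipip_link_ops, "tunl0");
 *	}
 */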
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
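/* ip_tunnel_init() and ip_tunnel_uninit() below are meant to be wired up as
 * the device's ndo_init/ndo_uninit callbacks by the individual tunnel
 * drivers; init allocates the per-cpu stats, dst cache and GRO cells that
 * ip_tunnel_dev_free() tears down via priv_destructor.
 */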
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version = 4;
	iph->ihl = 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the minimum initialization required here; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");