// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/* Hash a (key, remote address) pair into an IP_TNL_HASH_BITS-wide bucket
 * index for the per-netns tunnel hash table.
 */
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

/* Match the incoming packet's key flags/value against a tunnel's
 * configured input key: a keyed tunnel matches only packets carrying the
 * same key, a keyless tunnel matches only packets without a key.
 */
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets, if not matched to a configured keyless tunnel,
 * will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for input.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact (saddr, daddr) match.  An exact link match wins
	 * outright; otherwise remember the entry as a candidate.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: daddr matches, tunnel has a wildcard (zero) saddr. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 use the bucket hashed with a zero remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: tunnel bound to our local address with wildcard daddr,
	 * or a multicast destination that equals the tunnel's daddr.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: fully wildcarded tunnels (no saddr, no daddr), matched
	 * on key only unless the caller passed TUNNEL_NO_KEY.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	/* Prefer a collect_md (external-mode) tunnel over the fallback. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	/* Last resort: the per-netns fallback device, if up.  READ_ONCE
	 * pairs with the WRITE_ONCE(NULL) in ip_tunnel_uninit().
	 */
	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

/* Pick the hash bucket a tunnel with parameters @parms belongs in.
 * Multicast destinations hash like a wildcard remote; VTI tunnels
 * without TUNNEL_KEY hash with a zero key.
 */
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

/* Publish a tunnel in the per-netns hash table (RCU readers may see it
 * immediately); also record it as the netns' collect_md tunnel if so
 * configured.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

/* Remove a tunnel from the hash table (RCU-safe unlink). */
static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

/* Find a configured tunnel exactly matching @parms (addresses, link,
 * device type and key) — used by the management paths (ioctl/netlink),
 * unlike ip_tunnel_lookup() which matches incoming packets.
 */
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

/* Allocate and register a tunnel net_device for @parms.  If no name was
 * supplied, derive a "<kind>%d" template from the rtnl_link_ops kind.
 * Returns the device or an ERR_PTR; caller holds RTNL.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strscpy(name, parms->name, IFNAMSIZ);
	} else {
		/* Need room for "%d" plus the NUL terminator. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

/* Resolve the underlay device for a tunnel (via a route lookup to its
 * destination, or its configured link) and derive a reasonable MTU and
 * needed_headroom from it.  Returns the MTU to use, clamped to at least
 * IPV4_MIN_MTU.
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), dev_net(dev),
				    tunnel->parms.link, tunnel->fwmark, 0, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		/* Parameters may have changed; cached routes are stale. */
		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	/* Account for the tunnel overhead; Ethernet-type tunnels also
	 * carry an inner MAC header.
	 */
	mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

/* Create, configure and hash a new tunnel from @parms (ioctl path).
 * Returns the new tunnel or an ERR_PTR.
 */
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - t_hlen;
	if (dev->type == ARPHRD_ETHER)
		dev->max_mtu -= dev->hard_header_len;

	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

/* Record the outer UDP source/destination ports of a received packet in
 * the metadata tunnel info, if the outer protocol is UDP.
 */
void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
{
	const struct iphdr *iph = ip_hdr(skb);
	const struct udphdr *udph;

	if (iph->protocol != IPPROTO_UDP)
		return;

	udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
	info->encap.sport = udph->source;
	info->encap.dport = udph->dest;
}
EXPORT_SYMBOL(ip_tunnel_md_udp_encap);

/* Common receive path for IPv4 tunnels: validate csum/seq flags against
 * the tunnel config, decapsulate ECN, update stats and hand the packet
 * to GRO.  Consumes @skb (and @tun_dst on the drop path); always
 * returns 0 — errors are accounted in the device stats.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		DEV_STATS_INC(tunnel->dev, multicast);
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's checksum flag must agree with the tunnel config:
	 * mismatch either way is a CRC error.
	 */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		DEV_STATS_INC(tunnel->dev, rx_crc_errors);
		DEV_STATS_INC(tunnel->dev, rx_errors);
		goto drop;
	}

	/* Enforce in-order delivery when sequence numbers are configured. */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		/* err > 1 means the frame must be dropped (CE on non-ECT). */
		if (err > 1) {
			DEV_STATS_INC(tunnel->dev, rx_frame_errors);
			DEV_STATS_INC(tunnel->dev, rx_errors);
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

/* Register an encap ops table in slot @num; fails with -1 if the slot
 * is already taken (cmpxchg against NULL).
 */
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

/* Unregister the encap ops in slot @num (only if @ops still owns it),
 * then wait for in-flight readers via synchronize_net().
 */
int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

/* Copy encapsulation settings into the tunnel and recompute its header
 * lengths.  Returns 0 or a negative error from ip_encap_hlen().
 */
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

/* Path-MTU handling on transmit: update the inner dst's PMTU and emit
 * ICMP(v6) "too big" toward the sender when a non-GSO packet exceeds
 * the tunnel MTU.  Returns 0 to continue transmission or -E2BIG when
 * the packet was answered with an ICMP error.  @md selects metadata
 * (collect_md) mode, where @tunnel_hlen/@dst come from the caller.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;
	pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;

	if (df) {
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
		mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
	} else {
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
	}

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		/* Stick the new MTU on the inner IPv6 route when the
		 * tunnel endpoint is fixed or the route is host-specific.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

/* Transmit path for metadata-mode (collect_md) tunnels: the outer
 * header parameters come from the skb's tunnel metadata rather than
 * the device configuration.  Consumes @skb.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	/* tos == 1 means "inherit" the inner packet's DSCP. */
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    dev_net(dev), 0, skb->mark, skb_get_hash(skb),
			    key->flow_flags);

	if (!tunnel_hlen)
		tunnel_hlen = ip_encap_hlen(&tun_info->encap);

	if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	/* Routing back out of the tunnel device would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	/* ttl == 0 means "inherit" from the inner packet (or the route). */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	/* needed_headroom may be read concurrently on other tx paths. */
	if (headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, headroom);

	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	goto kfree;
tx_dropped:
	DEV_STATS_INC(dev, tx_dropped);
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

/* Common transmit path for configured (non-metadata) IPv4 tunnels.
 * Resolves the destination — including the NBMA case where the outer
 * daddr is derived from the inner packet or its neighbour entry —
 * routes the outer packet, applies TOS/TTL inheritance and PMTU
 * handling, then emits via iptunnel_xmit().  Consumes @skb.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	__be16 payload_protocol;
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);
	payload_protocol = skb_protocol(skb, true);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			DEV_STATS_INC(dev, tx_fifo_errors);
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (payload_protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses carry an
			 * embedded IPv4 destination we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	/* Low bit set on the configured TOS means "inherit" from inner. */
	if (tos & 0x1) {
		tos &= ~0x1;
		if (payload_protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (payload_protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos),
			    dev_net(dev), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb), 0);

	if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
		goto tx_error;

	/* Cached route: the metadata cache for md mode, the tunnel's own
	 * cache only when the destination is fixed ("connected").
	 */
	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			DEV_STATS_INC(dev, tx_carrier_errors);
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* Routing back out of the tunnel device would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, collisions);
		goto tx_error;
	}

	df = tnl_params->frag_off;
	if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Rate-limited link-failure reporting after ICMP errors were
	 * received for this tunnel (err_count set elsewhere).
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	/* ttl == 0 means "inherit" from the inner packet (or the route). */
	if (ttl == 0) {
		if (payload_protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (payload_protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	/* needed_headroom may be read concurrently on other tx paths. */
	if (max_headroom > READ_ONCE(dev->needed_headroom))
		WRITE_ONCE(dev->needed_headroom, max_headroom);

	if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
		ip_rt_put(rt);
		DEV_STATS_INC(dev, tx_dropped);
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	DEV_STATS_INC(dev, tx_errors);
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

/* Apply new parameters @p to an existing tunnel: re-hash it (addresses
 * or key may have changed the bucket), refresh device addresses, and
 * rebind/re-MTU when the underlay link or fwmark changed.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	/* Unhash before mutating the fields ip_bucket() depends on. */
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		__dev_addr_set(dev, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

/* Legacy ioctl backend (SIOC{GET,ADD,CHG,DEL}TUNNEL) shared by the IPv4
 * tunnel drivers.  Returns 0 or a negative errno; @p is updated in
 * place for SIOCGETTUNNEL.
 */
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look up the tunnel matching @p. */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Non-VTI tunnels: clear keys that aren't flagged present. */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* The new parms must not change the
				 * device's p2p/broadcast nature.
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			/* The fallback device itself cannot be deleted. */
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

/* ndo_siocdevprivate wrapper: copy ip_tunnel_parm from/to userspace
 * around the driver's ndo_tunnel_ctl handler.
 */
int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
			     void __user *data, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);

/* Change the tunnel device MTU, bounded by the tunnel overhead.  With
 * @strict an out-of-range value fails with -EINVAL; otherwise it is
 * clamped to the maximum.
 */
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - t_hlen;

	if (dev->type == ARPHRD_ETHER)
		max_mtu -= dev->hard_header_len;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

/* ndo_change_mtu helper: strict bounds checking. */
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

/* priv_destructor: release per-tunnel resources allocated in
 * ip_tunnel_init().
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

/* rtnl dellink: unhash and queue the device for unregistration — except
 * the fallback device, which is only torn down with its netns.
 */
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

/* rtnl get_link_net: the netns the tunnel transmits in. */
struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

/* ndo_get_iflink: the configured underlay ifindex. */
int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

/* Per-netns init for a tunnel type: set up the hash table and, unless
 * fallback tunnels are disabled for this netns, create the fallback
 * device named @devname.
 */
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		/* No fallback device here; inherit the type recorded in
		 * the initial netns.
		 */
		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strscpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

/* Queue every tunnel belonging to @net for unregistration: devices
 * created via @ops in this netns, plus hashed tunnels whose device
 * lives in a different netns.
 */
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

/* Per-netns exit for a tunnel type: batch-unregister all tunnels of
 * every netns on @net_list under a single RTNL section.
 */
void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

/* rtnl newlink backend: validate uniqueness, register the device, set
 * its MTU (honouring IFLA_MTU within the tunnel's limits) and hash the
 * tunnel.  Returns 0 or a negative errno.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		/* Only one collect_md tunnel per netns and type. */
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

		if (dev->type == ARPHRD_ETHER)
			max -= dev->hard_header_len;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

/* rtnl changelink backend: apply @p to an existing tunnel after the
 * same uniqueness and p2p/broadcast-consistency checks as the ioctl
 * path.  The fallback device cannot be changed.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

/* ndo_init helper: allocate per-CPU stats, the dst cache and GRO cells,
 * and initialise the tunnel's IP header template.  Resources are freed
 * by ip_tunnel_dev_free() via dev->priv_destructor.
 */
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version	= 4;
	iph->ihl	= 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

/* ndo_uninit: unhash the tunnel; clearing fb_tunnel_dev (WRITE_ONCE,
 * paired with the READ_ONCE in ip_tunnel_lookup()) stops lookups from
 * returning a dying fallback device.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	ip_tunnel_del(itn, netdev_priv(dev));
	if (itn->fb_tunnel_dev == dev)
		WRITE_ONCE(itn->fb_tunnel_dev, NULL);

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");