1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2013 Nicira, Inc. 4 */ 5 6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 7 8 #include <linux/capability.h> 9 #include <linux/module.h> 10 #include <linux/types.h> 11 #include <linux/kernel.h> 12 #include <linux/slab.h> 13 #include <linux/uaccess.h> 14 #include <linux/skbuff.h> 15 #include <linux/netdevice.h> 16 #include <linux/in.h> 17 #include <linux/tcp.h> 18 #include <linux/udp.h> 19 #include <linux/if_arp.h> 20 #include <linux/init.h> 21 #include <linux/in6.h> 22 #include <linux/inetdevice.h> 23 #include <linux/igmp.h> 24 #include <linux/netfilter_ipv4.h> 25 #include <linux/etherdevice.h> 26 #include <linux/if_ether.h> 27 #include <linux/if_vlan.h> 28 #include <linux/rculist.h> 29 #include <linux/err.h> 30 31 #include <net/sock.h> 32 #include <net/ip.h> 33 #include <net/icmp.h> 34 #include <net/protocol.h> 35 #include <net/ip_tunnels.h> 36 #include <net/arp.h> 37 #include <net/checksum.h> 38 #include <net/dsfield.h> 39 #include <net/inet_ecn.h> 40 #include <net/xfrm.h> 41 #include <net/net_namespace.h> 42 #include <net/netns/generic.h> 43 #include <net/rtnetlink.h> 44 #include <net/udp.h> 45 #include <net/dst_metadata.h> 46 47 #if IS_ENABLED(CONFIG_IPV6) 48 #include <net/ipv6.h> 49 #include <net/ip6_fib.h> 50 #include <net/ip6_route.h> 51 #endif 52 53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) 54 { 55 return hash_32((__force u32)key ^ (__force u32)remote, 56 IP_TNL_HASH_BITS); 57 } 58 59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, 60 __be16 flags, __be32 key) 61 { 62 if (p->i_flags & TUNNEL_KEY) { 63 if (flags & TUNNEL_KEY) 64 return key == p->i_key; 65 else 66 /* key expected, none present */ 67 return false; 68 } else 69 return !(flags & TUNNEL_KEY); 70 } 71 72 /* Fallback tunnel: no source, no destination, no key, no options 73 74 Tunnel hash table: 75 We require exact key match i.e. if a key is present in packet 76 it will match only tunnel with the same key; if it is not present, 77 it will match only keyless tunnel. 78 79 All keysless packets, if not matched configured keyless tunnels 80 will match fallback tunnel. 81 Given src, dst and key, find appropriate for input tunnel. 82 */ 83 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, 84 int link, __be16 flags, 85 __be32 remote, __be32 local, 86 __be32 key) 87 { 88 unsigned int hash; 89 struct ip_tunnel *t, *cand = NULL; 90 struct hlist_head *head; 91 92 hash = ip_tunnel_hash(key, remote); 93 head = &itn->tunnels[hash]; 94 95 hlist_for_each_entry_rcu(t, head, hash_node) { 96 if (local != t->parms.iph.saddr || 97 remote != t->parms.iph.daddr || 98 !(t->dev->flags & IFF_UP)) 99 continue; 100 101 if (!ip_tunnel_key_match(&t->parms, flags, key)) 102 continue; 103 104 if (t->parms.link == link) 105 return t; 106 else 107 cand = t; 108 } 109 110 hlist_for_each_entry_rcu(t, head, hash_node) { 111 if (remote != t->parms.iph.daddr || 112 t->parms.iph.saddr != 0 || 113 !(t->dev->flags & IFF_UP)) 114 continue; 115 116 if (!ip_tunnel_key_match(&t->parms, flags, key)) 117 continue; 118 119 if (t->parms.link == link) 120 return t; 121 else if (!cand) 122 cand = t; 123 } 124 125 hash = ip_tunnel_hash(key, 0); 126 head = &itn->tunnels[hash]; 127 128 hlist_for_each_entry_rcu(t, head, hash_node) { 129 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && 130 (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) 131 continue; 132 133 if (!(t->dev->flags & IFF_UP)) 134 continue; 135 136 if (!ip_tunnel_key_match(&t->parms, flags, key)) 137 continue; 138 139 if (t->parms.link == link) 140 return t; 141 else if (!cand) 142 cand = t; 143 } 144 145 if (flags & TUNNEL_NO_KEY) 146 goto skip_key_lookup; 147 148 hlist_for_each_entry_rcu(t, head, hash_node) { 149 if (t->parms.i_key != key || 150 t->parms.iph.saddr != 0 || 151 t->parms.iph.daddr != 0 || 152 !(t->dev->flags & IFF_UP)) 153 continue; 154 155 if (t->parms.link == link) 156 return t; 157 else if (!cand) 158 cand = t; 159 } 160 161 skip_key_lookup: 162 if (cand) 163 return cand; 164 165 t = rcu_dereference(itn->collect_md_tun); 166 if (t && t->dev->flags & IFF_UP) 167 return t; 168 169 if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) 170 return netdev_priv(itn->fb_tunnel_dev); 171 172 return NULL; 173 } 174 EXPORT_SYMBOL_GPL(ip_tunnel_lookup); 175 176 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, 177 struct ip_tunnel_parm *parms) 178 { 179 unsigned int h; 180 __be32 remote; 181 __be32 i_key = parms->i_key; 182 183 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) 184 remote = parms->iph.daddr; 185 else 186 remote = 0; 187 188 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) 189 i_key = 0; 190 191 h = ip_tunnel_hash(i_key, remote); 192 return &itn->tunnels[h]; 193 } 194 195 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) 196 { 197 struct hlist_head *head = ip_bucket(itn, &t->parms); 198 199 if (t->collect_md) 200 rcu_assign_pointer(itn->collect_md_tun, t); 201 hlist_add_head_rcu(&t->hash_node, head); 202 } 203 204 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) 205 { 206 if (t->collect_md) 207 rcu_assign_pointer(itn->collect_md_tun, NULL); 208 hlist_del_init_rcu(&t->hash_node); 209 } 210 211 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, 212 struct ip_tunnel_parm *parms, 213 int type) 214 { 215 __be32 remote = parms->iph.daddr; 216 __be32 local = parms->iph.saddr; 217 __be32 key = parms->i_key; 218 __be16 flags = parms->i_flags; 219 int link = parms->link; 220 struct ip_tunnel *t = NULL; 221 struct hlist_head *head = ip_bucket(itn, parms); 222 223 hlist_for_each_entry_rcu(t, head, hash_node) { 224 if (local == t->parms.iph.saddr && 225 remote == t->parms.iph.daddr && 226 link == t->parms.link && 227 type == t->dev->type && 228 ip_tunnel_key_match(&t->parms, flags, key)) 229 break; 230 } 231 return t; 232 } 233 234 static struct net_device *__ip_tunnel_create(struct net *net, 235 const struct rtnl_link_ops *ops, 236 struct ip_tunnel_parm *parms) 237 { 238 int err; 239 struct ip_tunnel *tunnel; 240 struct net_device *dev; 241 char name[IFNAMSIZ]; 242 243 err = -E2BIG; 244 if (parms->name[0]) { 245 if (!dev_valid_name(parms->name)) 246 goto failed; 247 strlcpy(name, parms->name, IFNAMSIZ); 248 } else { 249 if (strlen(ops->kind) > (IFNAMSIZ - 3)) 250 goto failed; 251 strcpy(name, ops->kind); 252 strcat(name, "%d"); 253 } 254 255 ASSERT_RTNL(); 256 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); 257 if (!dev) { 258 err = -ENOMEM; 259 goto failed; 260 } 261 dev_net_set(dev, net); 262 263 dev->rtnl_link_ops = ops; 264 265 tunnel = netdev_priv(dev); 266 tunnel->parms = *parms; 267 tunnel->net = net; 268 269 err = register_netdevice(dev); 270 if (err) 271 goto failed_free; 272 273 return dev; 274 275 failed_free: 276 free_netdev(dev); 277 failed: 278 return ERR_PTR(err); 279 } 280 281 static int ip_tunnel_bind_dev(struct net_device *dev) 282 { 283 struct net_device *tdev = NULL; 284 struct ip_tunnel *tunnel = netdev_priv(dev); 285 const struct iphdr *iph; 286 int hlen = LL_MAX_HEADER; 287 int mtu = ETH_DATA_LEN; 288 int t_hlen = tunnel->hlen + sizeof(struct iphdr); 289 290 iph = &tunnel->parms.iph; 291 292 /* Guess output device to choose reasonable mtu and needed_headroom */ 293 if (iph->daddr) { 294 struct flowi4 fl4; 295 struct rtable *rt; 296 297 ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, 298 iph->saddr, tunnel->parms.o_key, 299 RT_TOS(iph->tos), tunnel->parms.link, 300 tunnel->fwmark, 0); 301 rt = ip_route_output_key(tunnel->net, &fl4); 302 303 if (!IS_ERR(rt)) { 304 tdev = rt->dst.dev; 305 ip_rt_put(rt); 306 } 307 if (dev->type != ARPHRD_ETHER) 308 dev->flags |= IFF_POINTOPOINT; 309 310 dst_cache_reset(&tunnel->dst_cache); 311 } 312 313 if (!tdev && tunnel->parms.link) 314 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); 315 316 if (tdev) { 317 hlen = tdev->hard_header_len + tdev->needed_headroom; 318 mtu = min(tdev->mtu, IP_MAX_MTU); 319 } 320 321 dev->needed_headroom = t_hlen + hlen; 322 mtu -= (dev->hard_header_len + t_hlen); 323 324 if (mtu < IPV4_MIN_MTU) 325 mtu = IPV4_MIN_MTU; 326 327 return mtu; 328 } 329 330 static struct ip_tunnel *ip_tunnel_create(struct net *net, 331 struct ip_tunnel_net *itn, 332 struct ip_tunnel_parm *parms) 333 { 334 struct ip_tunnel *nt; 335 struct net_device *dev; 336 int t_hlen; 337 int mtu; 338 int err; 339 340 dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms); 341 if (IS_ERR(dev)) 342 return ERR_CAST(dev); 343 344 mtu = ip_tunnel_bind_dev(dev); 345 err = dev_set_mtu(dev, mtu); 346 if (err) 347 goto err_dev_set_mtu; 348 349 nt = netdev_priv(dev); 350 t_hlen = nt->hlen + sizeof(struct iphdr); 351 dev->min_mtu = ETH_MIN_MTU; 352 dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen; 353 ip_tunnel_add(itn, nt); 354 return nt; 355 356 err_dev_set_mtu: 357 unregister_netdevice(dev); 358 return ERR_PTR(err); 359 } 360 361 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, 362 const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, 363 bool log_ecn_error) 364 { 365 struct pcpu_sw_netstats *tstats; 366 const struct iphdr *iph = ip_hdr(skb); 367 int err; 368 369 #ifdef CONFIG_NET_IPGRE_BROADCAST 370 if (ipv4_is_multicast(iph->daddr)) { 371 tunnel->dev->stats.multicast++; 372 skb->pkt_type = PACKET_BROADCAST; 373 } 374 #endif 375 376 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || 377 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { 378 tunnel->dev->stats.rx_crc_errors++; 379 tunnel->dev->stats.rx_errors++; 380 goto drop; 381 } 382 383 if (tunnel->parms.i_flags&TUNNEL_SEQ) { 384 if (!(tpi->flags&TUNNEL_SEQ) || 385 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { 386 tunnel->dev->stats.rx_fifo_errors++; 387 tunnel->dev->stats.rx_errors++; 388 goto drop; 389 } 390 tunnel->i_seqno = ntohl(tpi->seq) + 1; 391 } 392 393 skb_reset_network_header(skb); 394 395 err = IP_ECN_decapsulate(iph, skb); 396 if (unlikely(err)) { 397 if (log_ecn_error) 398 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 399 &iph->saddr, iph->tos); 400 if (err > 1) { 401 ++tunnel->dev->stats.rx_frame_errors; 402 ++tunnel->dev->stats.rx_errors; 403 goto drop; 404 } 405 } 406 407 tstats = this_cpu_ptr(tunnel->dev->tstats); 408 u64_stats_update_begin(&tstats->syncp); 409 tstats->rx_packets++; 410 tstats->rx_bytes += skb->len; 411 u64_stats_update_end(&tstats->syncp); 412 413 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); 414 415 if (tunnel->dev->type == ARPHRD_ETHER) { 416 skb->protocol = eth_type_trans(skb, tunnel->dev); 417 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 418 } else { 419 skb->dev = tunnel->dev; 420 } 421 422 if (tun_dst) 423 skb_dst_set(skb, (struct dst_entry *)tun_dst); 424 425 gro_cells_receive(&tunnel->gro_cells, skb); 426 return 0; 427 428 drop: 429 if (tun_dst) 430 dst_release((struct dst_entry *)tun_dst); 431 kfree_skb(skb); 432 return 0; 433 } 434 EXPORT_SYMBOL_GPL(ip_tunnel_rcv); 435 436 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, 437 unsigned int num) 438 { 439 if (num >= MAX_IPTUN_ENCAP_OPS) 440 return -ERANGE; 441 442 return !cmpxchg((const struct ip_tunnel_encap_ops **) 443 &iptun_encaps[num], 444 NULL, ops) ? 0 : -1; 445 } 446 EXPORT_SYMBOL(ip_tunnel_encap_add_ops); 447 448 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops, 449 unsigned int num) 450 { 451 int ret; 452 453 if (num >= MAX_IPTUN_ENCAP_OPS) 454 return -ERANGE; 455 456 ret = (cmpxchg((const struct ip_tunnel_encap_ops **) 457 &iptun_encaps[num], 458 ops, NULL) == ops) ? 0 : -1; 459 460 synchronize_net(); 461 462 return ret; 463 } 464 EXPORT_SYMBOL(ip_tunnel_encap_del_ops); 465 466 int ip_tunnel_encap_setup(struct ip_tunnel *t, 467 struct ip_tunnel_encap *ipencap) 468 { 469 int hlen; 470 471 memset(&t->encap, 0, sizeof(t->encap)); 472 473 hlen = ip_encap_hlen(ipencap); 474 if (hlen < 0) 475 return hlen; 476 477 t->encap.type = ipencap->type; 478 t->encap.sport = ipencap->sport; 479 t->encap.dport = ipencap->dport; 480 t->encap.flags = ipencap->flags; 481 482 t->encap_hlen = hlen; 483 t->hlen = t->encap_hlen + t->tun_hlen; 484 485 return 0; 486 } 487 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); 488 489 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, 490 struct rtable *rt, __be16 df, 491 const struct iphdr *inner_iph, 492 int tunnel_hlen, __be32 dst, bool md) 493 { 494 struct ip_tunnel *tunnel = netdev_priv(dev); 495 int pkt_size; 496 int mtu; 497 498 tunnel_hlen = md ? tunnel_hlen : tunnel->hlen; 499 pkt_size = skb->len - tunnel_hlen - dev->hard_header_len; 500 501 if (df) 502 mtu = dst_mtu(&rt->dst) - dev->hard_header_len 503 - sizeof(struct iphdr) - tunnel_hlen; 504 else 505 mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; 506 507 if (skb_valid_dst(skb)) 508 skb_dst_update_pmtu_no_confirm(skb, mtu); 509 510 if (skb->protocol == htons(ETH_P_IP)) { 511 if (!skb_is_gso(skb) && 512 (inner_iph->frag_off & htons(IP_DF)) && 513 mtu < pkt_size) { 514 memset(IPCB(skb), 0, sizeof(*IPCB(skb))); 515 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 516 return -E2BIG; 517 } 518 } 519 #if IS_ENABLED(CONFIG_IPV6) 520 else if (skb->protocol == htons(ETH_P_IPV6)) { 521 struct rt6_info *rt6; 522 __be32 daddr; 523 524 rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) : 525 NULL; 526 daddr = md ? dst : tunnel->parms.iph.daddr; 527 528 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && 529 mtu >= IPV6_MIN_MTU) { 530 if ((daddr && !ipv4_is_multicast(daddr)) || 531 rt6->rt6i_dst.plen == 128) { 532 rt6->rt6i_flags |= RTF_MODIFIED; 533 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); 534 } 535 } 536 537 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && 538 mtu < pkt_size) { 539 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 540 return -E2BIG; 541 } 542 } 543 #endif 544 return 0; 545 } 546 547 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 548 u8 proto, int tunnel_hlen) 549 { 550 struct ip_tunnel *tunnel = netdev_priv(dev); 551 u32 headroom = sizeof(struct iphdr); 552 struct ip_tunnel_info *tun_info; 553 const struct ip_tunnel_key *key; 554 const struct iphdr *inner_iph; 555 struct rtable *rt = NULL; 556 struct flowi4 fl4; 557 __be16 df = 0; 558 u8 tos, ttl; 559 bool use_cache; 560 561 tun_info = skb_tunnel_info(skb); 562 if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || 563 ip_tunnel_info_af(tun_info) != AF_INET)) 564 goto tx_error; 565 key = &tun_info->key; 566 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 567 inner_iph = (const struct iphdr *)skb_inner_network_header(skb); 568 tos = key->tos; 569 if (tos == 1) { 570 if (skb->protocol == htons(ETH_P_IP)) 571 tos = inner_iph->tos; 572 else if (skb->protocol == htons(ETH_P_IPV6)) 573 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); 574 } 575 ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 576 tunnel_id_to_key32(key->tun_id), RT_TOS(tos), 577 0, skb->mark, skb_get_hash(skb)); 578 if (tunnel->encap.type != TUNNEL_ENCAP_NONE) 579 goto tx_error; 580 581 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); 582 if (use_cache) 583 rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr); 584 if (!rt) { 585 rt = ip_route_output_key(tunnel->net, &fl4); 586 if (IS_ERR(rt)) { 587 dev->stats.tx_carrier_errors++; 588 goto tx_error; 589 } 590 if (use_cache) 591 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, 592 fl4.saddr); 593 } 594 if (rt->dst.dev == dev) { 595 ip_rt_put(rt); 596 dev->stats.collisions++; 597 goto tx_error; 598 } 599 600 if (key->tun_flags & TUNNEL_DONT_FRAGMENT) 601 df = htons(IP_DF); 602 if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen, 603 key->u.ipv4.dst, true)) { 604 ip_rt_put(rt); 605 goto tx_error; 606 } 607 608 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); 609 ttl = key->ttl; 610 if (ttl == 0) { 611 if (skb->protocol == htons(ETH_P_IP)) 612 ttl = inner_iph->ttl; 613 else if (skb->protocol == htons(ETH_P_IPV6)) 614 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; 615 else 616 ttl = ip4_dst_hoplimit(&rt->dst); 617 } 618 619 if (!df && skb->protocol == htons(ETH_P_IP)) 620 df = inner_iph->frag_off & htons(IP_DF); 621 622 headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; 623 if (headroom > dev->needed_headroom) 624 dev->needed_headroom = headroom; 625 626 if (skb_cow_head(skb, dev->needed_headroom)) { 627 ip_rt_put(rt); 628 goto tx_dropped; 629 } 630 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, 631 df, !net_eq(tunnel->net, dev_net(dev))); 632 return; 633 tx_error: 634 dev->stats.tx_errors++; 635 goto kfree; 636 tx_dropped: 637 dev->stats.tx_dropped++; 638 kfree: 639 kfree_skb(skb); 640 } 641 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit); 642 643 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, 644 const struct iphdr *tnl_params, u8 protocol) 645 { 646 struct ip_tunnel *tunnel = netdev_priv(dev); 647 struct ip_tunnel_info *tun_info = NULL; 648 const struct iphdr *inner_iph; 649 unsigned int max_headroom; /* The extra header space needed */ 650 struct rtable *rt = NULL; /* Route to the other host */ 651 bool use_cache = false; 652 struct flowi4 fl4; 653 bool md = false; 654 bool connected; 655 u8 tos, ttl; 656 __be32 dst; 657 __be16 df; 658 659 inner_iph = (const struct iphdr *)skb_inner_network_header(skb); 660 connected = (tunnel->parms.iph.daddr != 0); 661 662 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 663 664 dst = tnl_params->daddr; 665 if (dst == 0) { 666 /* NBMA tunnel */ 667 668 if (!skb_dst(skb)) { 669 dev->stats.tx_fifo_errors++; 670 goto tx_error; 671 } 672 673 tun_info = skb_tunnel_info(skb); 674 if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) && 675 ip_tunnel_info_af(tun_info) == AF_INET && 676 tun_info->key.u.ipv4.dst) { 677 dst = tun_info->key.u.ipv4.dst; 678 md = true; 679 connected = true; 680 } 681 else if (skb->protocol == htons(ETH_P_IP)) { 682 rt = skb_rtable(skb); 683 dst = rt_nexthop(rt, inner_iph->daddr); 684 } 685 #if IS_ENABLED(CONFIG_IPV6) 686 else if (skb->protocol == htons(ETH_P_IPV6)) { 687 const struct in6_addr *addr6; 688 struct neighbour *neigh; 689 bool do_tx_error_icmp; 690 int addr_type; 691 692 neigh = dst_neigh_lookup(skb_dst(skb), 693 &ipv6_hdr(skb)->daddr); 694 if (!neigh) 695 goto tx_error; 696 697 addr6 = (const struct in6_addr *)&neigh->primary_key; 698 addr_type = ipv6_addr_type(addr6); 699 700 if (addr_type == IPV6_ADDR_ANY) { 701 addr6 = &ipv6_hdr(skb)->daddr; 702 addr_type = ipv6_addr_type(addr6); 703 } 704 705 if ((addr_type & IPV6_ADDR_COMPATv4) == 0) 706 do_tx_error_icmp = true; 707 else { 708 do_tx_error_icmp = false; 709 dst = addr6->s6_addr32[3]; 710 } 711 neigh_release(neigh); 712 if (do_tx_error_icmp) 713 goto tx_error_icmp; 714 } 715 #endif 716 else 717 goto tx_error; 718 719 if (!md) 720 connected = false; 721 } 722 723 tos = tnl_params->tos; 724 if (tos & 0x1) { 725 tos &= ~0x1; 726 if (skb->protocol == htons(ETH_P_IP)) { 727 tos = inner_iph->tos; 728 connected = false; 729 } else if (skb->protocol == htons(ETH_P_IPV6)) { 730 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); 731 connected = false; 732 } 733 } 734 735 ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, 736 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, 737 tunnel->fwmark, skb_get_hash(skb)); 738 739 if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) 740 goto tx_error; 741 742 if (connected && md) { 743 use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); 744 if (use_cache) 745 rt = dst_cache_get_ip4(&tun_info->dst_cache, 746 &fl4.saddr); 747 } else { 748 rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, 749 &fl4.saddr) : NULL; 750 } 751 752 if (!rt) { 753 rt = ip_route_output_key(tunnel->net, &fl4); 754 755 if (IS_ERR(rt)) { 756 dev->stats.tx_carrier_errors++; 757 goto tx_error; 758 } 759 if (use_cache) 760 dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, 761 fl4.saddr); 762 else if (!md && connected) 763 dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, 764 fl4.saddr); 765 } 766 767 if (rt->dst.dev == dev) { 768 ip_rt_put(rt); 769 dev->stats.collisions++; 770 goto tx_error; 771 } 772 773 if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph, 774 0, 0, false)) { 775 ip_rt_put(rt); 776 goto tx_error; 777 } 778 779 if (tunnel->err_count > 0) { 780 if (time_before(jiffies, 781 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { 782 tunnel->err_count--; 783 784 dst_link_failure(skb); 785 } else 786 tunnel->err_count = 0; 787 } 788 789 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); 790 ttl = tnl_params->ttl; 791 if (ttl == 0) { 792 if (skb->protocol == htons(ETH_P_IP)) 793 ttl = inner_iph->ttl; 794 #if IS_ENABLED(CONFIG_IPV6) 795 else if (skb->protocol == htons(ETH_P_IPV6)) 796 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; 797 #endif 798 else 799 ttl = ip4_dst_hoplimit(&rt->dst); 800 } 801 802 df = tnl_params->frag_off; 803 if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df) 804 df |= (inner_iph->frag_off&htons(IP_DF)); 805 806 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) 807 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap); 808 if (max_headroom > dev->needed_headroom) 809 dev->needed_headroom = max_headroom; 810 811 if (skb_cow_head(skb, dev->needed_headroom)) { 812 ip_rt_put(rt); 813 dev->stats.tx_dropped++; 814 kfree_skb(skb); 815 return; 816 } 817 818 iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, 819 df, !net_eq(tunnel->net, dev_net(dev))); 820 return; 821 822 #if IS_ENABLED(CONFIG_IPV6) 823 tx_error_icmp: 824 dst_link_failure(skb); 825 #endif 826 tx_error: 827 dev->stats.tx_errors++; 828 kfree_skb(skb); 829 } 830 EXPORT_SYMBOL_GPL(ip_tunnel_xmit); 831 832 static void ip_tunnel_update(struct ip_tunnel_net *itn, 833 struct ip_tunnel *t, 834 struct net_device *dev, 835 struct ip_tunnel_parm *p, 836 bool set_mtu, 837 __u32 fwmark) 838 { 839 ip_tunnel_del(itn, t); 840 t->parms.iph.saddr = p->iph.saddr; 841 t->parms.iph.daddr = p->iph.daddr; 842 t->parms.i_key = p->i_key; 843 t->parms.o_key = p->o_key; 844 if (dev->type != ARPHRD_ETHER) { 845 memcpy(dev->dev_addr, &p->iph.saddr, 4); 846 memcpy(dev->broadcast, &p->iph.daddr, 4); 847 } 848 ip_tunnel_add(itn, t); 849 850 t->parms.iph.ttl = p->iph.ttl; 851 t->parms.iph.tos = p->iph.tos; 852 t->parms.iph.frag_off = p->iph.frag_off; 853 854 if (t->parms.link != p->link || t->fwmark != fwmark) { 855 int mtu; 856 857 t->parms.link = p->link; 858 t->fwmark = fwmark; 859 mtu = ip_tunnel_bind_dev(dev); 860 if (set_mtu) 861 dev->mtu = mtu; 862 } 863 dst_cache_reset(&t->dst_cache); 864 netdev_state_change(dev); 865 } 866 867 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) 868 { 869 int err = 0; 870 struct ip_tunnel *t = netdev_priv(dev); 871 struct net *net = t->net; 872 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id); 873 874 switch (cmd) { 875 case SIOCGETTUNNEL: 876 if (dev == itn->fb_tunnel_dev) { 877 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); 878 if (!t) 879 t = netdev_priv(dev); 880 } 881 memcpy(p, &t->parms, sizeof(*p)); 882 break; 883 884 case SIOCADDTUNNEL: 885 case SIOCCHGTUNNEL: 886 err = -EPERM; 887 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 888 goto done; 889 if (p->iph.ttl) 890 p->iph.frag_off |= htons(IP_DF); 891 if (!(p->i_flags & VTI_ISVTI)) { 892 if (!(p->i_flags & TUNNEL_KEY)) 893 p->i_key = 0; 894 if (!(p->o_flags & TUNNEL_KEY)) 895 p->o_key = 0; 896 } 897 898 t = ip_tunnel_find(itn, p, itn->type); 899 900 if (cmd == SIOCADDTUNNEL) { 901 if (!t) { 902 t = ip_tunnel_create(net, itn, p); 903 err = PTR_ERR_OR_ZERO(t); 904 break; 905 } 906 907 err = -EEXIST; 908 break; 909 } 910 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { 911 if (t) { 912 if (t->dev != dev) { 913 err = -EEXIST; 914 break; 915 } 916 } else { 917 unsigned int nflags = 0; 918 919 if (ipv4_is_multicast(p->iph.daddr)) 920 nflags = IFF_BROADCAST; 921 else if (p->iph.daddr) 922 nflags = IFF_POINTOPOINT; 923 924 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { 925 err = -EINVAL; 926 break; 927 } 928 929 t = netdev_priv(dev); 930 } 931 } 932 933 if (t) { 934 err = 0; 935 ip_tunnel_update(itn, t, dev, p, true, 0); 936 } else { 937 err = -ENOENT; 938 } 939 break; 940 941 case SIOCDELTUNNEL: 942 err = -EPERM; 943 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 944 goto done; 945 946 if (dev == itn->fb_tunnel_dev) { 947 err = -ENOENT; 948 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type); 949 if (!t) 950 goto done; 951 err = -EPERM; 952 if (t == netdev_priv(itn->fb_tunnel_dev)) 953 goto done; 954 dev = t->dev; 955 } 956 unregister_netdevice(dev); 957 err = 0; 958 break; 959 960 default: 961 err = -EINVAL; 962 } 963 964 done: 965 return err; 966 } 967 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); 968 969 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) 970 { 971 struct ip_tunnel *tunnel = netdev_priv(dev); 972 int t_hlen = tunnel->hlen + sizeof(struct iphdr); 973 int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen; 974 975 if (new_mtu < ETH_MIN_MTU) 976 return -EINVAL; 977 978 if (new_mtu > max_mtu) { 979 if (strict) 980 return -EINVAL; 981 982 new_mtu = max_mtu; 983 } 984 985 dev->mtu = new_mtu; 986 return 0; 987 } 988 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu); 989 990 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) 991 { 992 return __ip_tunnel_change_mtu(dev, new_mtu, true); 993 } 994 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu); 995 996 static void ip_tunnel_dev_free(struct net_device *dev) 997 { 998 struct ip_tunnel *tunnel = netdev_priv(dev); 999 1000 gro_cells_destroy(&tunnel->gro_cells); 1001 dst_cache_destroy(&tunnel->dst_cache); 1002 free_percpu(dev->tstats); 1003 } 1004 1005 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) 1006 { 1007 struct ip_tunnel *tunnel = netdev_priv(dev); 1008 struct ip_tunnel_net *itn; 1009 1010 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); 1011 1012 if (itn->fb_tunnel_dev != dev) { 1013 ip_tunnel_del(itn, netdev_priv(dev)); 1014 unregister_netdevice_queue(dev, head); 1015 } 1016 } 1017 EXPORT_SYMBOL_GPL(ip_tunnel_dellink); 1018 1019 struct net *ip_tunnel_get_link_net(const struct net_device *dev) 1020 { 1021 struct ip_tunnel *tunnel = netdev_priv(dev); 1022 1023 return tunnel->net; 1024 } 1025 EXPORT_SYMBOL(ip_tunnel_get_link_net); 1026 1027 int ip_tunnel_get_iflink(const struct net_device *dev) 1028 { 1029 struct ip_tunnel *tunnel = netdev_priv(dev); 1030 1031 return tunnel->parms.link; 1032 } 1033 EXPORT_SYMBOL(ip_tunnel_get_iflink); 1034 1035 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, 1036 struct rtnl_link_ops *ops, char *devname) 1037 { 1038 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); 1039 struct ip_tunnel_parm parms; 1040 unsigned int i; 1041 1042 itn->rtnl_link_ops = ops; 1043 for (i = 0; i < IP_TNL_HASH_SIZE; i++) 1044 INIT_HLIST_HEAD(&itn->tunnels[i]); 1045 1046 if (!ops || !net_has_fallback_tunnels(net)) { 1047 struct ip_tunnel_net *it_init_net; 1048 1049 it_init_net = net_generic(&init_net, ip_tnl_net_id); 1050 itn->type = it_init_net->type; 1051 itn->fb_tunnel_dev = NULL; 1052 return 0; 1053 } 1054 1055 memset(&parms, 0, sizeof(parms)); 1056 if (devname) 1057 strlcpy(parms.name, devname, IFNAMSIZ); 1058 1059 rtnl_lock(); 1060 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); 1061 /* FB netdevice is special: we have one, and only one per netns. 1062 * Allowing to move it to another netns is clearly unsafe. 1063 */ 1064 if (!IS_ERR(itn->fb_tunnel_dev)) { 1065 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; 1066 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); 1067 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); 1068 itn->type = itn->fb_tunnel_dev->type; 1069 } 1070 rtnl_unlock(); 1071 1072 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); 1073 } 1074 EXPORT_SYMBOL_GPL(ip_tunnel_init_net); 1075 1076 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, 1077 struct list_head *head, 1078 struct rtnl_link_ops *ops) 1079 { 1080 struct net_device *dev, *aux; 1081 int h; 1082 1083 for_each_netdev_safe(net, dev, aux) 1084 if (dev->rtnl_link_ops == ops) 1085 unregister_netdevice_queue(dev, head); 1086 1087 for (h = 0; h < IP_TNL_HASH_SIZE; h++) { 1088 struct ip_tunnel *t; 1089 struct hlist_node *n; 1090 struct hlist_head *thead = &itn->tunnels[h]; 1091 1092 hlist_for_each_entry_safe(t, n, thead, hash_node) 1093 /* If dev is in the same netns, it has already 1094 * been added to the list by the previous loop. 1095 */ 1096 if (!net_eq(dev_net(t->dev), net)) 1097 unregister_netdevice_queue(t->dev, head); 1098 } 1099 } 1100 1101 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, 1102 struct rtnl_link_ops *ops) 1103 { 1104 struct ip_tunnel_net *itn; 1105 struct net *net; 1106 LIST_HEAD(list); 1107 1108 rtnl_lock(); 1109 list_for_each_entry(net, net_list, exit_list) { 1110 itn = net_generic(net, id); 1111 ip_tunnel_destroy(net, itn, &list, ops); 1112 } 1113 unregister_netdevice_many(&list); 1114 rtnl_unlock(); 1115 } 1116 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); 1117 1118 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], 1119 struct ip_tunnel_parm *p, __u32 fwmark) 1120 { 1121 struct ip_tunnel *nt; 1122 struct net *net = dev_net(dev); 1123 struct ip_tunnel_net *itn; 1124 int mtu; 1125 int err; 1126 1127 nt = netdev_priv(dev); 1128 itn = net_generic(net, nt->ip_tnl_net_id); 1129 1130 if (nt->collect_md) { 1131 if (rtnl_dereference(itn->collect_md_tun)) 1132 return -EEXIST; 1133 } else { 1134 if (ip_tunnel_find(itn, p, dev->type)) 1135 return -EEXIST; 1136 } 1137 1138 nt->net = net; 1139 nt->parms = *p; 1140 nt->fwmark = fwmark; 1141 err = register_netdevice(dev); 1142 if (err) 1143 goto err_register_netdevice; 1144 1145 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) 1146 eth_hw_addr_random(dev); 1147 1148 mtu = ip_tunnel_bind_dev(dev); 1149 if (tb[IFLA_MTU]) { 1150 unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen; 1151 1152 mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, 1153 (unsigned int)(max - sizeof(struct iphdr))); 1154 } 1155 1156 err = dev_set_mtu(dev, mtu); 1157 if (err) 1158 goto err_dev_set_mtu; 1159 1160 ip_tunnel_add(itn, nt); 1161 return 0; 1162 1163 err_dev_set_mtu: 1164 unregister_netdevice(dev); 1165 err_register_netdevice: 1166 return err; 1167 } 1168 EXPORT_SYMBOL_GPL(ip_tunnel_newlink); 1169 1170 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], 1171 struct ip_tunnel_parm *p, __u32 fwmark) 1172 { 1173 struct ip_tunnel *t; 1174 struct ip_tunnel *tunnel = netdev_priv(dev); 1175 struct net *net = tunnel->net; 1176 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id); 1177 1178 if (dev == itn->fb_tunnel_dev) 1179 return -EINVAL; 1180 1181 t = ip_tunnel_find(itn, p, dev->type); 1182 1183 if (t) { 1184 if (t->dev != dev) 1185 return -EEXIST; 1186 } else { 1187 t = tunnel; 1188 1189 if (dev->type != ARPHRD_ETHER) { 1190 unsigned int nflags = 0; 1191 1192 if (ipv4_is_multicast(p->iph.daddr)) 1193 nflags = IFF_BROADCAST; 1194 else if (p->iph.daddr) 1195 nflags = IFF_POINTOPOINT; 1196 1197 if ((dev->flags ^ nflags) & 1198 (IFF_POINTOPOINT | IFF_BROADCAST)) 1199 return -EINVAL; 1200 } 1201 } 1202 1203 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); 1204 return 0; 1205 } 1206 EXPORT_SYMBOL_GPL(ip_tunnel_changelink); 1207 1208 int ip_tunnel_init(struct net_device *dev) 1209 { 1210 struct ip_tunnel *tunnel = netdev_priv(dev); 1211 struct iphdr *iph = &tunnel->parms.iph; 1212 int err; 1213 1214 dev->needs_free_netdev = true; 1215 dev->priv_destructor = ip_tunnel_dev_free; 1216 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); 1217 if (!dev->tstats) 1218 return -ENOMEM; 1219 1220 err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); 1221 if (err) { 1222 free_percpu(dev->tstats); 1223 return err; 1224 } 1225 1226 err = gro_cells_init(&tunnel->gro_cells, dev); 1227 if (err) { 1228 dst_cache_destroy(&tunnel->dst_cache); 1229 free_percpu(dev->tstats); 1230 return err; 1231 } 1232 1233 tunnel->dev = dev; 1234 tunnel->net = dev_net(dev); 1235 strcpy(tunnel->parms.name, dev->name); 1236 iph->version = 4; 1237 iph->ihl = 5; 1238 1239 if (tunnel->collect_md) { 1240 dev->features |= NETIF_F_NETNS_LOCAL; 1241 netif_keep_dst(dev); 1242 } 1243 return 0; 1244 } 1245 EXPORT_SYMBOL_GPL(ip_tunnel_init); 1246 1247 void ip_tunnel_uninit(struct net_device *dev) 1248 { 1249 struct ip_tunnel *tunnel = netdev_priv(dev); 1250 struct net *net = tunnel->net; 1251 struct ip_tunnel_net *itn; 1252 1253 itn = net_generic(net, tunnel->ip_tnl_net_id); 1254 /* fb_tunnel_dev will be unregisted in net-exit call. */ 1255 if (itn->fb_tunnel_dev != dev) 1256 ip_tunnel_del(itn, netdev_priv(dev)); 1257 1258 dst_cache_reset(&tunnel->dst_cache); 1259 } 1260 EXPORT_SYMBOL_GPL(ip_tunnel_uninit); 1261 1262 /* Do least required initialization, rest of init is done in tunnel_init call */ 1263 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) 1264 { 1265 struct ip_tunnel *tunnel = netdev_priv(dev); 1266 tunnel->ip_tnl_net_id = net_id; 1267 } 1268 EXPORT_SYMBOL_GPL(ip_tunnel_setup); 1269 1270 MODULE_LICENSE("GPL"); 1271