// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel_hlen;
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (!df && skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
			    0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version = 4;
	iph->ihl = 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");
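
/*
 * ------------------------------------------------------------------
 * Usage sketches for the helpers above. These model a hypothetical
 * "foo" tunnel driver in the style of ipip.c and ip_gre.c; none of
 * the foo_* names below exist in the tree, and each sketch omits the
 * validation a real driver performs.
 * ------------------------------------------------------------------
 */
static unsigned int foo_net_id __read_mostly;
static bool log_ecn_error = true;

/* Sketch: how a decapsulating protocol handler resolves the receiving
 * tunnel and hands the packet to ip_tunnel_rcv(), as ip_gre.c does.
 * Note the address swap in the lookup: the outer packet's saddr is the
 * tunnel's remote endpoint and its daddr the local one. A real handler
 * pulls the outer headers and fills tpi before calling in.
 */
static int foo_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn = net_generic(net, foo_net_id);
	const struct iphdr *iph = ip_hdr(skb);
	struct ip_tunnel *tunnel;

	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);
	if (!tunnel)
		return PACKET_REJECT;

	ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
	return PACKET_RCVD;
}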
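
/* Sketch: a minimal ndo_start_xmit that delegates to ip_tunnel_xmit(),
 * following ipip.c. It assumes an IPv4-in-IPv4 payload; a real driver
 * checks skb->protocol and the tunnel parms first.
 */
static netdev_tx_t foo_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tiph = &tunnel->parms.iph;

	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
		goto tx_error;

	ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP);
	return NETDEV_TX_OK;

tx_error:
	kfree_skb(skb);
	dev->stats.tx_errors++;
	return NETDEV_TX_OK;
}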
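
/* Sketch: claiming one of the iptun_encaps[] slots managed by
 * ip_tunnel_encap_add_ops()/ip_tunnel_encap_del_ops() above, as fou.c
 * does. The callbacks are hypothetical stubs; a real implementation
 * reports its extra header length and builds that header on transmit.
 * The slot must be a uapi TUNNEL_ENCAP_* value owned by the module
 * (TUNNEL_ENCAP_FOU is used here purely for illustration).
 */
static size_t foo_encap_hlen(struct ip_tunnel_encap *e)
{
	return sizeof(struct udphdr);
}

static int foo_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
			    u8 *protocol, struct flowi4 *fl4)
{
	return 0;	/* would push the outer UDP header here */
}

static const struct ip_tunnel_encap_ops foo_encap_ops = {
	.encap_hlen	= foo_encap_hlen,
	.build_header	= foo_build_header,
};

static int foo_encap_register(void)
{
	/* Returns -1 if another module already claimed the slot. */
	return ip_tunnel_encap_add_ops(&foo_encap_ops, TUNNEL_ENCAP_FOU);
}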
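
/* Sketch: the classic wrapper drivers of this vintage put behind
 * ndo_do_ioctl, copying the parms blob in and out around
 * ip_tunnel_ioctl(), in the style of ipip.c.
 */
static int foo_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;

	return 0;
}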
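
/* Sketch: wiring ip_tunnel_setup(), ip_tunnel_newlink() and the other
 * exported helpers into rtnl_link_ops, as ipip.c does. A real .setup
 * also fills dev->netdev_ops and device flags, and a real .newlink
 * parses the IFLA_IPTUN_* attributes; both are elided here.
 */
static void foo_tunnel_setup(struct net_device *dev)
{
	ip_tunnel_setup(dev, foo_net_id);
}

static int foo_newlink(struct net *src_net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[],
		       struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	__u32 fwmark = 0;

	memset(&p, 0, sizeof(p));
	p.iph.protocol = IPPROTO_IPIP;
	/* A real driver fills p and fwmark from data[] here. */
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static struct rtnl_link_ops foo_link_ops __read_mostly = {
	.kind		= "foo",
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= foo_tunnel_setup,
	.newlink	= foo_newlink,
	.dellink	= ip_tunnel_dellink,
	.get_link_net	= ip_tunnel_get_link_net,
};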
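
/* Sketch: per-netns lifetime management around ip_tunnel_init_net()
 * and ip_tunnel_delete_nets(), following ipip.c. The fallback "foo0"
 * device is created per netns on init and torn down in the exit_batch
 * path. A module would register this from its init path with
 * register_pernet_device(&foo_net_ops).
 */
static int __net_init foo_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, foo_net_id, &foo_link_ops, "foo0");
}

static void __net_exit foo_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, foo_net_id, &foo_link_ops);
}

static struct pernet_operations foo_net_ops = {
	.init		= foo_init_net,
	.exit_batch	= foo_exit_batch_net,
	.id		= &foo_net_id,
	.size		= sizeof(struct ip_tunnel_net),
};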