// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
		       IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if no key is present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets that do not match a configured keyless tunnel
 * will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for the input packet.
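 *
 * The lookup below makes up to four passes, from most to least specific:
 * (1) exact saddr/daddr match, (2) daddr-only match, (3) local-address or
 * multicast match, (4) key-only match in the unaddressed bucket. Within
 * each pass a tunnel bound to the matching link is returned immediately;
 * otherwise the first hit is remembered as a candidate and used only if
 * no better match turns up. Failing all of that, the collect_md tunnel
 * and then the fallback device are tried.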
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	err = -E2BIG;
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
		strlcpy(name, parms->name, IFNAMSIZ);
	} else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3))
			goto failed;
		strcpy(name, ops->kind);
		strcat(name, "%d");
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark, 0);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = min(tdev->mtu, IP_MAX_MTU);
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
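		/* Multicast tunnel destination: account it and hand the
		 * packet up to the stack as broadcast.
		 */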
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	    ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			   struct rtable *rt, __be16 df,
			   const struct iphdr *inner_iph,
			   int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen - dev->hard_header_len;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel_hlen;
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

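	/* Record the tunnel path MTU on the inner route, without
	 * confirming the neighbour entry.
	 */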
	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
		    mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
		    mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	if (!df && skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);

	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;	/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		} else if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						   &fl4.saddr) : NULL;
	}

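	/* Cache miss (or caching not usable): take the slow path through
	 * the routing table and refill whichever dst cache applies.
	 */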
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph,
			    0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
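		/* Creating or reconfiguring a tunnel is privileged:
		 * require CAP_NET_ADMIN in the netns owning the device.
		 */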
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

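	/* tunnel->net is the netns the tunnel was created in (its link
	 * netns), which may differ from the netns the device lives in.
	 */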
	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
		       struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
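			 * A tunnel may, however, have been moved to another
			 * netns while keeping this one as its link netns;
			 * such devices are only reachable through the hash
			 * table, so queue them here as well.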
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

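	/* Per-cpu stats, dst cache and GRO cells are ready: wire the
	 * tunnel to its device and preset the constant IPv4 header fields.
	 */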
	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version = 4;
	iph->ihl = 5;

	if (tunnel->collect_md)
		netif_keep_dst(dev);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");