/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl),
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to
   the upper header. It is a very good solution, but it introduces
   two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea turned out to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least, in my
     neighbourhood) return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. This is difficult or even impossible,
   especially taking fragmentation into account. To be short, ttl is
   not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop which
   does not force DF even when the encapsulating packets have DF set.
   But it is not our problem! Nobody could accuse us, we did all
   that we could. Even if it is your gated that injected the fatal
   route into the network, even if it was you who configured the
   fatal static route: you are innocent. :-)

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6; ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
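/*
 * For orientation, the GRE wire format handled below (RFC 1701, with
 * the RFC 2784 restriction to version 0; routing headers are not
 * supported):
 *
 *	+---------------+---------------+
 *	|  flags (16)   | protocol (16) |   always present
 *	+---------------+---------------+
 *	| checksum (16) | reserved (16) |   iff GRE_CSUM
 *	+---------------+---------------+
 *	|            key (32)           |   iff GRE_KEY
 *	+-------------------------------+
 *	|     sequence number (32)      |   iff GRE_SEQ
 *	+-------------------------------+
 *
 * The code in this file parses these fields by offset (see ipgre_rcv()
 * and ipgre_tunnel_xmit()) rather than through a struct, since the
 * optional words are only present when the matching flag is set.
 */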
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
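/*
 * HASH() xor-folds the two low-order nibbles of the raw 32-bit value
 * into a 4-bit bucket index, so it only ever produces 0..15 and
 * HASH_SIZE must stay 16. A worked example (illustrative):
 *
 *	addr       = 0x12345678
 *	addr >> 4  = 0x01234567
 *	xor, & 0xF = (0x8 ^ 0x7) & 0xF = 0xF
 *
 * The argument is a __be32 reinterpreted with __force, so which octet
 * supplies the low nibbles depends on host byte order; for a hash
 * function that does not matter.
 */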
/* often-modified stats are per-cpu; others are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;
	u64	rx_bytes;
	u64	tx_packets;
	u64	tx_bytes;
	struct u64_stats_sync	syncp;
};

static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
						   struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes += rx_bytes;
		tot->tx_bytes += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
	tot->rx_errors = dev->stats.rx_errors;

	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
}

/* Does the key in the tunnel parameters match the packet? */
static bool ipgre_key_match(const struct ip_tunnel_parm *p,
			    __be16 flags, __be32 key)
{
	if (p->i_flags & GRE_KEY) {
		if (flags & GRE_KEY)
			return key == p->i_key;
		else
			return false;	/* key expected, none present */
	} else
		return !(flags & GRE_KEY);
}
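/*
 * Lookup strategy used below: the buckets are scanned from most
 * specific to least specific - (remote,local), (remote,*), (*,local),
 * (*,*). Within a bucket, the score gets bit 0 set on an ifindex
 * mismatch and bit 1 on a device-type mismatch; a score of 0 is an
 * exact match and returns immediately, otherwise the lowest-scoring
 * candidate wins. The fallback device, if up, is the last resort.
 */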
/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be16 flags, __be32 key,
					     __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
					       struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	/* prio indexes the (*,*)/(*,local)/(remote,*)/(remote,local)
	 * tables described above; multicast destinations are treated
	 * as wildcard remotes.
	 */
	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
						    struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
					     struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. This makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee, so why
   the hell do these idiots break the standards established by
   themselves???
 */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
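/*
 * For reference: INET_ECN_encapsulate() keeps the DSCP bits of the
 * outer tos passed in and replaces only the ECN field, copying it from
 * the inner packet, except that an inner CE codepoint is emitted as
 * ECT(0) on the outer header; the inner header itself is not modified
 * here. See include/net/inet_ecn.h for the authoritative definition.
 */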
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16 flags;
	__sum16 csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;
	int    err;

	/* 16 = 4-byte base header plus worst-case csum+key+seq options */
	if (!pskb_may_pull(skb, 16))
		goto drop;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto drop;
			}
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		gro_cells_receive(&tunnel->gro_cells, skb);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}
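/*
 * On transmit, the optional fields are written back-to-front from the
 * end of the GRE option area (see the o_flags block near the end of
 * ipgre_tunnel_xmit() below), which produces the RFC 1701 wire order:
 * checksum word first, then key, then sequence number. The checksum,
 * when enabled, is computed over the GRE header plus payload after
 * the other fields have been filled in.
 */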
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr *old_iph = ip_hdr(skb);
	const struct iphdr *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    skb_checksum_help(skb))
		goto tx_error;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, old_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = fl4.daddr;
	iph->saddr = fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
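/*
 * Worked example (illustrative): over an Ethernet underlay with mtu
 * 1500, addend starts at 20 (outer IPv4) + 4 (base GRE) = 24, so a
 * plain GRE tunnel gets mtu 1500 - 24 = 1476; with GRE_CSUM, GRE_KEY
 * and GRE_SEQ all enabled, addend grows to 36 and the tunnel mtu
 * becomes 1464. The result is clamped to the 68-byte IPv4 minimum so
 * looping tunnels degrade instead of wedging.
 */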
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;
	p[1] = htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
}
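/*
 * The "+ 4" / "- 4" above is the 4-byte base GRE header; together with
 * the 20-byte outer IPv4 header the default overhead is 24 bytes, so
 * the default tunnel mtu is ETH_DATA_LEN (1500) - 24 = 1476. Optional
 * csum/key/seq words are accounted for by ipgre_tunnel_bind_dev()
 * once the tunnel parameters are known.
 */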
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int err;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler	= ipgre_rcv,
	.err_handler	= ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}
static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* Path MTU discovery (DF on the outer header) defaults to on */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
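/*
 * These attributes map directly onto iproute2's netlink front end,
 * e.g. (illustrative invocation, addresses made up; exact syntax
 * depends on the iproute2 version):
 *
 *	ip link add gre1 type gre local 192.0.2.1 remote 198.51.100.2 \
 *		ttl 64 key 42
 *
 * which sets IFLA_GRE_LOCAL/REMOTE/TTL, and both IFLA_GRE_IKEY and
 * IFLA_GRE_OKEY (iproute2's "key" shorthand applies to both
 * directions), with GRE_KEY set in i_flags/o_flags accordingly.
 */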
static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the modules code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");