/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task.  The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires.  It is a good
   solution, but it would require maintaining a new variable in ALL
   skbs, even when no tunneling is used.

   Current solution: xmit_recursion breaks dead loops.  This is a percpu
   counter; since CPU migration is forbidden once we enter the first
   ndo_xmit(), no further locking is needed.  We force an exit if this
   counter reaches RECURSION_LIMIT.
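   Illustrative sketch (not part of this file): the recursion guard
   mentioned above lives in the core transmit path and works roughly
   like this; names are paraphrased from net/core/dev.c:

	static DEFINE_PER_CPU(int, xmit_recursion);

	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
		goto recursion_alert;
	__this_cpu_inc(xmit_recursion);
	rc = ops->ndo_start_xmit(skb, dev);
	__this_cpu_dec(xmit_recursion);

   Preemption is already disabled on this path, so the per-cpu counter
   cannot race with a migration; hitting the limit means the packet has
   been re-entering local transmit, and it is dropped.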
   2. Networking dead loops would not kill routers, but they would really
   kill the network.  The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header.  It is a very good solution, but it introduces two
   problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work.  I planned to relay ICMP from the tunnel,
     so that this problem would be solved and the traceroute output
     would be even more informative.  This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least in my
     neighbourhood) return only 8 bytes of payload.  It is the end.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we have to search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node.  It is difficult or even impossible,
   especially taking fragmentation into account.  To be short, ttl is
   not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-)  Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where the looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulated packets have
   DF set.  But it is not our problem!  Nobody could accuse us:
   we did all that we could do.  Even if it was your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code.  It would be good to glue them
   together, but it is not obvious how to make them modular:
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular.  We could extract the common parts (hash table, ioctl etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE	16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
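/*
 * Worked example (illustrative): HASH() folds the 32-bit address onto
 * its own upper nibbles and keeps the low 4 bits, so for
 * addr = 0x01020304:
 *
 *	0x01020304 ^ (0x01020304 >> 4) = 0x01020304 ^ 0x00102030
 *	                               = 0x01122334
 *	0x01122334 & 0xF               = 0x4
 *
 * i.e. bucket 4 of the HASH_SIZE (16) buckets.  For the keyed tables
 * the remote-address hash and the key hash are XORed together
 * (h0 ^ h1) in ipgre_tunnel_lookup() below.
 */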
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

/* often modified stats are per cpu, others are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;
	u64	rx_bytes;
	u64	tx_packets;
	u64	tx_bytes;
	struct u64_stats_sync	syncp;
};

static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
						   struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes += rx_bytes;
		tot->tx_bytes += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_errors = dev->stats.rx_errors;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
}

/* Given src, dst and key, find the appropriate tunnel for the input packet. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
					       struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
						    struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
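/*
 * Descriptive note on the two helpers above: writers always hold RTNL,
 * so rtnl_dereference() is sufficient to walk a chain while modifying
 * it, while rcu_assign_pointer() publishes each change with the memory
 * barrier that lockless readers under rcu_read_lock() (see
 * ipgre_tunnel_lookup()) need in order to always observe a consistent
 * ->next pointer.
 */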
static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
					     struct ip_tunnel_parm *parms,
					     int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
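/*
 * Descriptive note: each tunnel takes a reference on its own device
 * (the dev_hold() calls in ipgre_tunnel_locate(), ipgre_newlink() and
 * ipgre_fb_tunnel_init()); ipgre_tunnel_uninit() above drops it, so a
 * netdev cannot be freed while it is still reachable through the hash
 * tables.
 */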
static void ipgre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	   8 bytes of packet payload.  It means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put the GRE key in the third word
	   of the GRE header.  That makes it impossible to maintain even
	   soft state for keyed GRE tunnels with checksums enabled.
	   Tell them "thank you".

	   Well, I wonder, rfc1812 was written by a Cisco employee;
	   why the hell do these idiots break the standards established
	   by themselves???
	 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16 *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes were returned, keyed messages will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	rcu_read_lock();
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL)
		goto out;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		goto out;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		goto out;
	}
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

static inline void ipgre_ecn_decapsulate(const struct iphdr *iph,
					 struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
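/*
 * For reference (added comment): the receive path below parses a
 * version 0 GRE header, RFC 2784 with the RFC 2890 key/sequence
 * extensions:
 *
 *	 0                   1                   2                   3
 *	+-+-+-+-+-----------------+-----+-------------------------------+
 *	|C|R|K|S|    Reserved0    | Ver |         Protocol Type         |
 *	+-+-+-+-+-----------------+-----+-------------------------------+
 *	|      Checksum (optional)      |      Reserved1 (optional)     |
 *	+-------------------------------+-------------------------------+
 *	|                         Key (optional)                        |
 *	+---------------------------------------------------------------+
 *	|                   Sequence Number (optional)                  |
 *	+---------------------------------------------------------------+
 *
 * This is why "offset" starts at 4 (flags + protocol type) and grows
 * by 4 for each optional field announced in the flag bits, and why
 * packets with R (GRE_ROUTING) or a nonzero Ver are rejected.
 */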
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8 *h;
	__be16 flags;
	__sum16 csum = 0;
	__be32 key = 0;
	u32 seqno = 0;
	struct ip_tunnel *tunnel;
	int offset = 4;
	__be16 gre_proto;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change the protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr *old_iph = ip_hdr(skb);
	const struct iphdr *tiph;
	struct flowi4 fl4;
	u8 tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int gre_hlen;
	__be32 dst;
	int mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, old_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}
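	/*
	 * Descriptive note: for an NBMA tunnel (no fixed remote, daddr
	 * is 0) the outer destination was just recovered from the inner
	 * packet: from the IPv4 next hop, or from an IPv4-compatible
	 * IPv6 neighbour address.  Such a tunnel can be created with,
	 * e.g. (addresses purely illustrative):
	 *
	 *	ip tunnel add nbma0 mode gre local 192.0.2.1 ttl 64
	 *
	 * each inner route must then carry the real GRE endpoint as its
	 * gateway.
	 */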
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);
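	/*
	 * Descriptive note on the DF/MTU handling above: with a
	 * preconfigured TTL the ioctl path forces IP_DF on the tunnel
	 * (see ipgre_tunnel_ioctl()), so "df" is normally set and the
	 * usable payload is dst_mtu() minus the link header and GRE
	 * overhead.  Oversized DF packets are answered with
	 * FRAG_NEEDED/PKT_TOOBIG instead of being fragmented, which is
	 * exactly the loop-damping behaviour described in the long
	 * comment at the top of this file.
	 */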
	/*
	 *	Push down and install the GRE/IP header.
	 */

	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->frag_off = df;
	iph->protocol = IPPROTO_GRE;
	iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr = fl4.daddr;
	iph->saddr = fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		/* The optional fields are written back to front, starting
		 * from the last word of the GRE header: sequence number,
		 * then key, then checksum, matching the on-wire order
		 * csum/key/seq.
		 */
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1),
							  skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
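/*
 * Worked example (illustrative): with checksums, a key and sequence
 * numbers all enabled, addend = 20 (outer IP) + 4 (GRE base) + 4 + 4 +
 * 4 = 36.  Over a plain 1500-byte Ethernet path the function above
 * then returns 1500 - 36 = 1464 for an ARPHRD_IPGRE device (whose own
 * hard_header_len is 0), and 1450 for gretap, which also pays ETH_HLEN
 * of hard_header_len.
 */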
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
/* Nice toy.  Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66.  If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0] = t->parms.o_flags;
	p[1] = htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
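/*
 * Descriptive note: in the broadcast mode sketched above (multicast
 * remote), the tunnel behaves like a virtual LAN segment.
 * ipgre_open() joins the multicast group on the underlying device via
 * IGMP (ip_mc_inc_group()) and ipgre_close() leaves it again, so the
 * encapsulated "broadcasts" are delivered between all tunnel endpoints
 * by ordinary multicast routing.
 */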
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler	= ipgre_rcv,
	.err_handler	= ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
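/*
 * Usage example (addresses purely illustrative): the rtnetlink ops
 * below are what iproute2 talks to, e.g.
 *
 *	ip link add gretap1 type gretap \
 *		local 192.0.2.1 remote 198.51.100.2 key 42
 *	ip link set gretap1 up
 *
 * creates an Ethernet-over-GRE device; the attributes arrive in
 * ipgre_newlink() as the IFLA_GRE_* netlink attributes parsed by
 * ipgre_netlink_parms() above.
 */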
static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");