/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

#ifdef CONFIG_IPV6
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is the best
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: the t->recursion lock breaks dead loops. It looks
   like the dev->tbusy flag, but I preferred a new variable, because
   the semantics is different. One day, when hard_start_xmit
   becomes multithreaded, we will have to use skb->encapsulation.



   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated
   to the upper header. It is a very good solution, but it introduces
   two problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially
   taking fragmentation into account. To be short, it is no solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the packets being encapsulated
   have DF set. But it is not our problem! Nobody could accuse us,
   we made all that we could make. Even if it is your gated who injected
   the fatal route to the network, even if it were you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

#define HASH_SIZE  16

static int ipgre_net_id;
struct ipgre_net {
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched by configured keyless tunnels,
   will match the fallback tunnel.
 */
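
/* Illustrative example of the matching rules above (command and
 * addresses are hypothetical):
 *
 *	ip tunnel add gre1 mode gre remote 10.0.0.2 local 10.0.0.1 key 42
 *
 * A GRE packet arriving from 10.0.0.2 addressed to 10.0.0.1 carrying
 * key 42 matches gre1 through the (remote,local) table; the same packet
 * sent without a key would instead fall through to the fallback tunnel.
 */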

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
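/* The hash folds the 32-bit address into a 4-bit bucket index by XORing
 * the value with itself shifted right by 4 and keeping the low nibble,
 * e.g. (illustrative) HASH(0x12345678) = (0x12345678 ^ 0x01234567) & 0xF = 0xF.
 */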

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]

static DEFINE_RWLOCK(ipgre_lock);

/* Given src, dst and key, find the appropriate input tunnel. */
164 
165 static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
166 		__be32 remote, __be32 local, __be32 key)
167 {
168 	unsigned h0 = HASH(remote);
169 	unsigned h1 = HASH(key);
170 	struct ip_tunnel *t;
171 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
172 
173 	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
174 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
175 			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
176 				return t;
177 		}
178 	}
179 	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
180 		if (remote == t->parms.iph.daddr) {
181 			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
182 				return t;
183 		}
184 	}
185 	for (t = ign->tunnels_l[h1]; t; t = t->next) {
186 		if (local == t->parms.iph.saddr ||
187 		     (local == t->parms.iph.daddr &&
188 		      ipv4_is_multicast(local))) {
189 			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
190 				return t;
191 		}
192 	}
193 	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
194 		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
195 			return t;
196 	}
197 
198 	if (ign->fb_tunnel_dev->flags&IFF_UP)
199 		return netdev_priv(ign->fb_tunnel_dev);
200 	return NULL;
201 }
202 
203 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
204 		struct ip_tunnel_parm *parms)
205 {
206 	__be32 remote = parms->iph.daddr;
207 	__be32 local = parms->iph.saddr;
208 	__be32 key = parms->i_key;
209 	unsigned h = HASH(key);
210 	int prio = 0;
211 
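	/* prio encodes which of the four tables above the tunnel belongs to:
	 * bit 0 is set when a local address is configured, bit 1 when a
	 * unicast remote address is configured.
	 */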
	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

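	/* t is not yet visible to readers here, so its next pointer can be
	 * set before taking the write lock; only the store that publishes
	 * t into the chain needs ipgre_lock.
	 */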
	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);
	nt->parms = *parms;

	if (register_netdevice(dev) < 0)
		goto failed_free;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key to the third word
   in the GRE header. It makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksum enabled. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee;
   why the hell do these idiots break standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

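	/* The base GRE header is 4 bytes (flags + protocol); optional
	 * fields follow in RFC 2784/2890 order: checksum word, then key,
	 * then sequence number, each 4 bytes.
	 */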
	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
			(flags&GRE_KEY) ?
			*(((__be32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
}

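/* Propagate a Congestion Experienced (CE) mark from the outer IP header
 * to the encapsulated packet, for both IPv4 and IPv6 payloads.
 */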
static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;

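	/* 16 = 4-byte base GRE header plus up to three optional 4-byte
	 * fields (checksum word, key, sequence number).
	 */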
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
					iph->saddr, iph->daddr, key)) != NULL) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = *(__be16*)(h + 2);
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 &&
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_reset_network_header(skb);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
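		/* Serial-number arithmetic: the (s32) cast accepts any seqno
		 * at or ahead of i_seqno, tolerating 32-bit wraparound.
		 */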
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		stats->rx_packets++;
		stats->rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);
		ipgre_ecn_decapsulate(iph, skb);
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;			/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

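	/* See note 1 in the header comment: t->recursion breaks local
	 * dead loops where a tunnel is routed over itself.
	 */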
	if (tunnel->recursion++) {
		stats->collisions++;
		goto tx_error;
	}

	if (dev->header_ops) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb->dst == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb->rtable;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

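	/* The low bit of the configured TOS selects "inherit": copy the
	 * TOS from the inner IPv4 header instead of using a fixed value.
	 */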
	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

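	/* tiph->frag_off already carries IP_DF when the tunnel has a fixed
	 * TTL (forced in the ioctl handler; see note 2 in the header comment).
	 */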
	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the outer IP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
	((__be16*)(iph+1))[1] = skb->protocol;

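	/* Fill the optional GRE fields back to front: ptr starts at the
	 * last 4 bytes of the header, so the sequence number (if any) is
	 * written first, then the key, then the checksum word, matching
	 * the on-wire order csum/key/seq from front to back.
	 */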
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}

static void ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}
		dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;
}

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
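		/* A fixed TTL forces DF: the loop-damping trick described
		 * in note 2 of the header comment.
		 */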
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
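	/* 68 is the minimum IPv4 MTU (RFC 791); the upper bound leaves
	 * room for the IP + GRE encapsulation overhead.
	 */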
	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* A nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so that I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
		return t->hlen;

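	/* Destination is still unknown (NBMA); a negative return tells the
	 * caller the header is incomplete and must be finished later.
	 */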
	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev) {
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
			in_dev_put(in_dev);
		}
	}
	return 0;
}

#endif

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor 	= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	ipgre_tunnel_bind_dev(dev);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}

static int ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
	return 0;
}


static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign)
{
	int prio;

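	/* unregister_netdevice() ends up in ipgre_tunnel_uninit(), which
	 * unlinks the tunnel from its chain, so each chain head advances
	 * until the bucket is empty.
	 */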
	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;
			while ((t = ign->tunnels[prio][h]) != NULL)
				unregister_netdevice(t->dev);
		}
	}
}

static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}

	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
	dev_net_set(ign->fb_tunnel_dev, net);

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}

static void ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign);
	rtnl_unlock();
	kfree(ign);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
	if (err < 0)
		inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);

	return err;
}

static void __exit ipgre_fini(void)
{
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");

	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");