xref: /openbmc/linux/net/ipv4/ip_gre.c (revision fd589a8f)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32 
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46 
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 /*
54    Problems & solutions
55    --------------------
56 
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61 
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68 
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73 
74 
75 
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80 
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89 
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92 
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. To be short, it is not a solution at all.
96 
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    fastly degrades to value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109 
110 
111 
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118 
119    Alexey Kuznetsov.
120  */
121 
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126 
127 /* Fallback tunnel: no source, no destination, no key, no options */
128 
129 #define HASH_SIZE  16
130 
131 static int ipgre_net_id;
132 struct ipgre_net {
133 	struct ip_tunnel *tunnels[4][HASH_SIZE];
134 
135 	struct net_device *fb_tunnel_dev;
136 };
137 
138 /* Tunnel hash table */
139 
140 /*
141    4 hash tables:
142 
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147 
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151 
152    All keyless packets, if they match no configured keyless tunnel,
153    will match fallback tunnel.
154  */
155 
156 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
157 
158 #define tunnels_r_l	tunnels[3]
159 #define tunnels_r	tunnels[2]
160 #define tunnels_l	tunnels[1]
161 #define tunnels_wc	tunnels[0]
162 
163 static DEFINE_RWLOCK(ipgre_lock);
164 
165 /* Given src, dst and key, find appropriate for input tunnel. */
166 
167 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
168 					      __be32 remote, __be32 local,
169 					      __be32 key, __be16 gre_proto)
170 {
171 	struct net *net = dev_net(dev);
172 	int link = dev->ifindex;
173 	unsigned h0 = HASH(remote);
174 	unsigned h1 = HASH(key);
175 	struct ip_tunnel *t, *cand = NULL;
176 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
177 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
178 		       ARPHRD_ETHER : ARPHRD_IPGRE;
179 	int score, cand_score = 4;
180 
181 	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
182 		if (local != t->parms.iph.saddr ||
183 		    remote != t->parms.iph.daddr ||
184 		    key != t->parms.i_key ||
185 		    !(t->dev->flags & IFF_UP))
186 			continue;
187 
188 		if (t->dev->type != ARPHRD_IPGRE &&
189 		    t->dev->type != dev_type)
190 			continue;
191 
192 		score = 0;
193 		if (t->parms.link != link)
194 			score |= 1;
195 		if (t->dev->type != dev_type)
196 			score |= 2;
197 		if (score == 0)
198 			return t;
199 
200 		if (score < cand_score) {
201 			cand = t;
202 			cand_score = score;
203 		}
204 	}
205 
206 	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
207 		if (remote != t->parms.iph.daddr ||
208 		    key != t->parms.i_key ||
209 		    !(t->dev->flags & IFF_UP))
210 			continue;
211 
212 		if (t->dev->type != ARPHRD_IPGRE &&
213 		    t->dev->type != dev_type)
214 			continue;
215 
216 		score = 0;
217 		if (t->parms.link != link)
218 			score |= 1;
219 		if (t->dev->type != dev_type)
220 			score |= 2;
221 		if (score == 0)
222 			return t;
223 
224 		if (score < cand_score) {
225 			cand = t;
226 			cand_score = score;
227 		}
228 	}
229 
230 	for (t = ign->tunnels_l[h1]; t; t = t->next) {
231 		if ((local != t->parms.iph.saddr &&
232 		     (local != t->parms.iph.daddr ||
233 		      !ipv4_is_multicast(local))) ||
234 		    key != t->parms.i_key ||
235 		    !(t->dev->flags & IFF_UP))
236 			continue;
237 
238 		if (t->dev->type != ARPHRD_IPGRE &&
239 		    t->dev->type != dev_type)
240 			continue;
241 
242 		score = 0;
243 		if (t->parms.link != link)
244 			score |= 1;
245 		if (t->dev->type != dev_type)
246 			score |= 2;
247 		if (score == 0)
248 			return t;
249 
250 		if (score < cand_score) {
251 			cand = t;
252 			cand_score = score;
253 		}
254 	}
255 
256 	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
257 		if (t->parms.i_key != key ||
258 		    !(t->dev->flags & IFF_UP))
259 			continue;
260 
261 		if (t->dev->type != ARPHRD_IPGRE &&
262 		    t->dev->type != dev_type)
263 			continue;
264 
265 		score = 0;
266 		if (t->parms.link != link)
267 			score |= 1;
268 		if (t->dev->type != dev_type)
269 			score |= 2;
270 		if (score == 0)
271 			return t;
272 
273 		if (score < cand_score) {
274 			cand = t;
275 			cand_score = score;
276 		}
277 	}
278 
279 	if (cand != NULL)
280 		return cand;
281 
282 	if (ign->fb_tunnel_dev->flags & IFF_UP)
283 		return netdev_priv(ign->fb_tunnel_dev);
284 
285 	return NULL;
286 }
287 
288 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
289 		struct ip_tunnel_parm *parms)
290 {
291 	__be32 remote = parms->iph.daddr;
292 	__be32 local = parms->iph.saddr;
293 	__be32 key = parms->i_key;
294 	unsigned h = HASH(key);
295 	int prio = 0;
296 
297 	if (local)
298 		prio |= 1;
299 	if (remote && !ipv4_is_multicast(remote)) {
300 		prio |= 2;
301 		h ^= HASH(remote);
302 	}
303 
304 	return &ign->tunnels[prio][h];
305 }
306 
307 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
308 		struct ip_tunnel *t)
309 {
310 	return __ipgre_bucket(ign, &t->parms);
311 }
312 
/* Insert tunnel @t at the head of its hash chain.  Setting t->next
 * before taking the lock is safe because @t is not yet visible to
 * readers; only the store that publishes it into the chain needs the
 * write lock. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}
322 
/* Remove tunnel @t from its hash chain, if present.  The chain is
 * walked via a pointer-to-pointer so the unlink is a single store,
 * performed under the write lock. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
336 
337 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
338 					   struct ip_tunnel_parm *parms,
339 					   int type)
340 {
341 	__be32 remote = parms->iph.daddr;
342 	__be32 local = parms->iph.saddr;
343 	__be32 key = parms->i_key;
344 	int link = parms->link;
345 	struct ip_tunnel *t, **tp;
346 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
347 
348 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
349 		if (local == t->parms.iph.saddr &&
350 		    remote == t->parms.iph.daddr &&
351 		    key == t->parms.i_key &&
352 		    link == t->parms.link &&
353 		    type == t->dev->type)
354 			break;
355 
356 	return t;
357 }
358 
359 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
360 		struct ip_tunnel_parm *parms, int create)
361 {
362 	struct ip_tunnel *t, *nt;
363 	struct net_device *dev;
364 	char name[IFNAMSIZ];
365 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
366 
367 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
368 	if (t || !create)
369 		return t;
370 
371 	if (parms->name[0])
372 		strlcpy(name, parms->name, IFNAMSIZ);
373 	else
374 		sprintf(name, "gre%%d");
375 
376 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
377 	if (!dev)
378 	  return NULL;
379 
380 	dev_net_set(dev, net);
381 
382 	if (strchr(name, '%')) {
383 		if (dev_alloc_name(dev, name) < 0)
384 			goto failed_free;
385 	}
386 
387 	nt = netdev_priv(dev);
388 	nt->parms = *parms;
389 	dev->rtnl_link_ops = &ipgre_link_ops;
390 
391 	dev->mtu = ipgre_tunnel_bind_dev(dev);
392 
393 	if (register_netdevice(dev) < 0)
394 		goto failed_free;
395 
396 	dev_hold(dev);
397 	ipgre_tunnel_link(ign, nt);
398 	return nt;
399 
400 failed_free:
401 	free_netdev(dev);
402 	return NULL;
403 }
404 
/* Device uninit: detach the tunnel from the per-net hash table and
 * drop the reference taken when it was linked in. */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
413 
414 
/*
 * ICMP error handler for the GRE protocol.  Parses the quoted inner
 * IPv4 + GRE header from the ICMP payload, finds the tunnel that
 * originated the offending packet and records the error so the
 * transmit path can relay a rate-limited link failure (see the
 * IPTUNNEL_ERR_TIMEO window in ipgre_tunnel_xmit()).
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr *)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* inner IP hdr + GRE flags/proto */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Only the key's offset matters for the lookup below;
		   it is shifted by 4 when a checksum word precedes it. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	/* NBMA (daddr == 0) and multicast tunnels have no single peer
	 * to attribute the error to. */
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* TTL errors are expected when the tunnel copies the inner TTL
	 * (configured ttl == 0); do not count them. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
}
503 
504 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
505 {
506 	if (INET_ECN_is_ce(iph->tos)) {
507 		if (skb->protocol == htons(ETH_P_IP)) {
508 			IP_ECN_set_ce(ip_hdr(skb));
509 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
510 			IP6_ECN_set_ce(ipv6_hdr(skb));
511 		}
512 	}
513 }
514 
515 static inline u8
516 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
517 {
518 	u8 inner = 0;
519 	if (skb->protocol == htons(ETH_P_IP))
520 		inner = old_iph->tos;
521 	else if (skb->protocol == htons(ETH_P_IPV6))
522 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
523 	return INET_ECN_encapsulate(tos, inner);
524 }
525 
/*
 * GRE receive handler.  Validates the GRE header (version 0, no
 * routing), verifies the optional checksum, extracts the optional key
 * and sequence number, looks up the matching tunnel and hands the
 * inner packet to the stack via netif_rx().  Packets matching no
 * tunnel are answered with ICMP port unreachable.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;
	unsigned int len;

	/* 16 bytes covers the base header plus all optional words
	 * (checksum, key, sequence). */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb_rtable(skb)->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Drop on checksum failure, or when the tunnel requires
		 * a checksum but the packet carries none. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		/* Enforce in-order delivery on sequenced tunnels; the
		 * signed difference handles sequence wraparound. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		len = skb->len;

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;
				stats->rx_errors++;
				goto drop;
			}

			/* Re-fetch iph: pskb_may_pull may relocate data. */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		stats->rx_packets++;
		stats->rx_bytes += len;
		skb->dev = tunnel->dev;
		skb_dst_drop(skb);
		nf_reset(skb);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
664 
/*
 * Transmit path: encapsulate @skb in an outer IP + GRE header and
 * send it through the route to the tunnel endpoint.  Handles NBMA
 * destination resolution, DF/PMTU enforcement for inner IPv4 and
 * IPv6, deferred error relay from ipgre_err(), headroom expansion,
 * and the optional GRE checksum/key/sequence words.
 */
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;			/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	/* Recursion lock: breaks local dead loops where the tunnel
	 * routes back into itself (see the file-head comment). */
	if (tunnel->recursion++) {
		stats->collisions++;
		goto tx_error;
	}

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* When header_ops built the outer header already (ARPHRD_IPGRE
	 * with a pre-pushed header), it sits at skb->data. */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		/* Resolve the outer destination from the inner route:
		 * IPv4 uses the gateway, IPv6 an IPv4-compatible
		 * neighbour address. */
		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb_dst(skb)->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* Configured tos == 1 means "inherit from inner IPv4 packet". */
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Route points back at this tunnel: immediate loop. */
	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Inner DF propagates to the outer header; oversized
		 * DF packets get ICMP frag-needed back. */
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Relay errors recorded by ipgre_err() back to local senders,
	 * one per queued error, within the rate-limit window. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	/* Grow headroom / unshare the skb if the new headers won't fit
	 * or the buffer cannot be written safely. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->u.dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	/* Configured ttl == 0 means "inherit from the inner packet". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Fill the optional GRE words back-to-front: sequence, key,
	 * then checksum (which covers the whole GRE payload). */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return NETDEV_TX_OK;
}
903 
/*
 * (Re)bind the tunnel to its underlying device: route the configured
 * destination to guess the output device, derive needed_headroom and
 * a suitable MTU from it, and precompute the GRE header length
 * (tunnel->hlen) from the configured output flags.  Returns the MTU
 * to use (never below 68, the minimal IPv4 MTU).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
963 
/*
 * Tunnel configuration ioctls: SIOCGETTUNNEL, SIOCADDTUNNEL,
 * SIOCCHGTUNNEL, SIOCDELTUNNEL.  On the per-net fallback device the
 * get/delete operations act on the tunnel named in the user-supplied
 * ip_tunnel_parm; on any other device they act on that device itself.
 * Add/change/delete require CAP_NET_ADMIN.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity: plain IPv4/GRE header without options, only
		 * DF allowed in frag_off, no version/routing GRE bits. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* Fixed TTL implies DF (see the file-head comment on
		 * loop suppression). */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Addresses/keys changed: the tunnel must be
				 * rehashed, and the link type (ptp/broadcast)
				 * must not change on a live device. */
				unsigned nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback tunnel itself may not be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1093 
1094 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1095 {
1096 	struct ip_tunnel *tunnel = netdev_priv(dev);
1097 	if (new_mtu < 68 ||
1098 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1099 		return -EINVAL;
1100 	dev->mtu = new_mtu;
1101 	return 0;
1102 }
1103 
1104 /* Nice toy. Unfortunately, useless in real life :-)
1105    It allows to construct virtual multiprotocol broadcast "LAN"
1106    over the Internet, provided multicast routing is tuned.
1107 
1108 
1109    I have no idea was this bicycle invented before me,
1110    so that I had to set ARPHRD_IPGRE to a random value.
1111    I have an impression, that Cisco could make something similar,
1112    but this feature is apparently missing in IOS<=11.2(8).
1113 
1114    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1115    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1116 
1117    ping -t 255 224.66.66.66
1118 
1119    If nobody answers, mbone does not work.
1120 
1121    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1122    ip addr add 10.66.66.<somewhat>/24 dev Universe
1123    ifconfig Universe up
1124    ifconfig Universe add fe80::<Your_real_addr>/10
1125    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1126    ftp 10.66.66.66
1127    ...
1128    ftp fec0:6666:6666::193.233.7.65
1129    ...
1130 
1131  */
1132 
1133 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1134 			unsigned short type,
1135 			const void *daddr, const void *saddr, unsigned len)
1136 {
1137 	struct ip_tunnel *t = netdev_priv(dev);
1138 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1139 	__be16 *p = (__be16*)(iph+1);
1140 
1141 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1142 	p[0]		= t->parms.o_flags;
1143 	p[1]		= htons(type);
1144 
1145 	/*
1146 	 *	Set the source hardware address.
1147 	 */
1148 
1149 	if (saddr)
1150 		memcpy(&iph->saddr, saddr, 4);
1151 
1152 	if (daddr) {
1153 		memcpy(&iph->daddr, daddr, 4);
1154 		return t->hlen;
1155 	}
1156 	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1157 		return t->hlen;
1158 
1159 	return -t->hlen;
1160 }
1161 
1162 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1163 {
1164 	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1165 	memcpy(haddr, &iph->saddr, 4);
1166 	return 4;
1167 }
1168 
/* Link-layer header ops for GRE devices that carry an IPv4 endpoint as
 * their "hardware" address (broadcast and NBMA modes). */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1173 
1174 #ifdef CONFIG_NET_IPGRE_BROADCAST
1175 static int ipgre_open(struct net_device *dev)
1176 {
1177 	struct ip_tunnel *t = netdev_priv(dev);
1178 
1179 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1180 		struct flowi fl = { .oif = t->parms.link,
1181 				    .nl_u = { .ip4_u =
1182 					      { .daddr = t->parms.iph.daddr,
1183 						.saddr = t->parms.iph.saddr,
1184 						.tos = RT_TOS(t->parms.iph.tos) } },
1185 				    .proto = IPPROTO_GRE };
1186 		struct rtable *rt;
1187 		if (ip_route_output_key(dev_net(dev), &rt, &fl))
1188 			return -EADDRNOTAVAIL;
1189 		dev = rt->u.dst.dev;
1190 		ip_rt_put(rt);
1191 		if (__in_dev_get_rtnl(dev) == NULL)
1192 			return -EADDRNOTAVAIL;
1193 		t->mlink = dev->ifindex;
1194 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1195 	}
1196 	return 0;
1197 }
1198 
1199 static int ipgre_close(struct net_device *dev)
1200 {
1201 	struct ip_tunnel *t = netdev_priv(dev);
1202 
1203 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1204 		struct in_device *in_dev;
1205 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1206 		if (in_dev) {
1207 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1208 			in_dev_put(in_dev);
1209 		}
1210 	}
1211 	return 0;
1212 }
1213 
1214 #endif
1215 
/* Device ops for layer-3 (ARPHRD_IPGRE) tunnel devices.  The optional
 * open/close hooks manage multicast group membership on the lower device
 * for broadcast-mode tunnels. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1227 
/* Common setup for layer-3 (ARPHRD_IPGRE) tunnel devices. */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	/* Room for the outer link header, outer IP header and the 4-byte
	 * base GRE header. */
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* addresses are IPv4 endpoints */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1242 
/* ndo_init for layer-3 GRE devices: expose the tunnel endpoints as the
 * device addresses and select header_ops based on the configured remote. */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* dev_addr/broadcast hold the 4-byte endpoints (addr_len == 4). */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Multicast "broadcast LAN" mode needs a fixed
			 * local address to join the group with. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* No fixed remote (NBMA mode): the destination is supplied
		 * per-packet through header_ops. */
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1271 
/* Initialize the per-namespace fallback device "gre0", which catches GRE
 * packets matching no explicit tunnel (wildcard hash entry). */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	/* Outer IP header plus the 4-byte base GRE header. */
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	/* The wildcard table keeps a device reference. */
	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}
1289 
1290 
/* IPPROTO_GRE receive and ICMP-error handlers; safe in all namespaces. */
static const struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1296 
1297 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1298 {
1299 	int prio;
1300 
1301 	for (prio = 0; prio < 4; prio++) {
1302 		int h;
1303 		for (h = 0; h < HASH_SIZE; h++) {
1304 			struct ip_tunnel *t;
1305 			while ((t = ign->tunnels[prio][h]) != NULL)
1306 				unregister_netdevice(t->dev);
1307 		}
1308 	}
1309 }
1310 
/* Per-namespace init: allocate ipgre_net state and register the fallback
 * "gre0" device.  Failure paths unwind in reverse via gotos. */
static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}
1350 
1351 static void ipgre_exit_net(struct net *net)
1352 {
1353 	struct ipgre_net *ign;
1354 
1355 	ign = net_generic(net, ipgre_net_id);
1356 	rtnl_lock();
1357 	ipgre_destroy_tunnels(ign);
1358 	rtnl_unlock();
1359 	kfree(ign);
1360 }
1361 
/* Per-network-namespace setup/teardown hooks. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1366 
1367 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1368 {
1369 	__be16 flags;
1370 
1371 	if (!data)
1372 		return 0;
1373 
1374 	flags = 0;
1375 	if (data[IFLA_GRE_IFLAGS])
1376 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1377 	if (data[IFLA_GRE_OFLAGS])
1378 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1379 	if (flags & (GRE_VERSION|GRE_ROUTING))
1380 		return -EINVAL;
1381 
1382 	return 0;
1383 }
1384 
1385 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1386 {
1387 	__be32 daddr;
1388 
1389 	if (tb[IFLA_ADDRESS]) {
1390 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1391 			return -EINVAL;
1392 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1393 			return -EADDRNOTAVAIL;
1394 	}
1395 
1396 	if (!data)
1397 		goto out;
1398 
1399 	if (data[IFLA_GRE_REMOTE]) {
1400 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1401 		if (!daddr)
1402 			return -EINVAL;
1403 	}
1404 
1405 out:
1406 	return ipgre_tunnel_validate(tb, data);
1407 }
1408 
/* Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Absent attributes leave their fields zeroed, except path MTU
 * discovery, which defaults to on (DF set on the outer header). */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1449 
1450 static int ipgre_tap_init(struct net_device *dev)
1451 {
1452 	struct ip_tunnel *tunnel;
1453 
1454 	tunnel = netdev_priv(dev);
1455 
1456 	tunnel->dev = dev;
1457 	strcpy(tunnel->parms.name, dev->name);
1458 
1459 	ipgre_tunnel_bind_dev(dev);
1460 
1461 	return 0;
1462 }
1463 
/* Device ops for Ethernet-style (gretap) devices: no tunnel ioctl, and
 * the MAC address is handled like a regular Ethernet device. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1472 
1473 static void ipgre_tap_setup(struct net_device *dev)
1474 {
1475 
1476 	ether_setup(dev);
1477 
1478 	dev->netdev_ops		= &ipgre_netdev_ops;
1479 	dev->destructor 	= free_netdev;
1480 
1481 	dev->iflink		= 0;
1482 	dev->features		|= NETIF_F_NETNS_LOCAL;
1483 }
1484 
/* rtnl newlink: create a GRE device from IFLA_GRE_* attributes. */
static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse a duplicate of an existing tunnel of the same type. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	/* gretap devices get a random MAC unless one was supplied. */
	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	/* Bind to the underlying device; honour an explicit IFLA_MTU. */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* The hash-table entry holds a device reference. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1517 
/* rtnl changelink: reconfigure an existing GRE device in place. */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	/* The fallback device's parameters are fixed. */
	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* The new parameters collide with a different tunnel. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		unsigned nflags = 0;

		t = nt;

		/* The point-to-point/broadcast nature of the device cannot
		 * change after creation; reject a mode-changing remote. */
		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))
			return -EINVAL;

		/* Re-hash the tunnel under its new endpoints and key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	/* A link change affects the usable MTU; rebind unless the user
	 * pinned the MTU with an explicit IFLA_MTU. */
	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1577 
/* Worst-case netlink attribute payload emitted by ipgre_fill_info(). */
static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}
1603 
/* Dump tunnel parameters as IFLA_GRE_* attributes.  The NLA_PUT_*
 * macros jump to nla_put_failure when the skb runs out of room. */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery is reported as a boolean derived from DF. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1625 
/* Netlink validation policy for the IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1638 
/* rtnl link type "gre": layer-3 GRE tunnel devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1651 
/* rtnl link type "gretap": Ethernet-over-GRE devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1664 
1665 /*
1666  *	And now the modules code and kernel interface.
1667  */
1668 
/* Module init: register the protocol handler, the per-netns ops and
 * both rtnl link types; failures unwind the earlier registrations in
 * reverse order via the goto chain. */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
	if (err < 0)
		goto gen_device_failed;

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
gen_device_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}
1703 
/* Module exit: tear everything down in the reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
}
1712 
/* Module registration; the RTNL_LINK aliases let "ip link add type gre"
 * and "... type gretap" auto-load this module. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
1718