xref: /openbmc/linux/net/ipv4/ip_gre.c (revision b04b4f78)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32 
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46 
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 /*
54    Problems & solutions
55    --------------------
56 
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61 
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
 66  * solution, but it requires maintaining a new variable in ALL
 67  * skbs, even if no tunneling is used.
68 
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73 
74 
75 
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80 
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89 
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92 
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
 95    taking into account fragmentation. To be short, it is not a solution at all.
96 
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
 102    quickly degrades to a value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109 
110 
111 
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118 
119    Alexey Kuznetsov.
120  */
121 
/* Forward declarations for handlers referenced before their definitions. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

/* Bucket count of each of the four tunnel hash tables (matches the
 * 4-bit HASH() macro below). */
#define HASH_SIZE  16

/* Per-network-namespace state, retrieved via net_generic(net, ipgre_net_id). */
static int ipgre_net_id;
struct ipgre_net {
	/* Four hash tables indexed by match specificity; see the
	 * "Tunnel hash table" comment below. */
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	/* Catch-all device receiving keyless packets that match no tunnel. */
	struct net_device *fb_tunnel_dev;
};
137 
138 /* Tunnel hash table */
139 
140 /*
141    4 hash tables:
142 
143    3: (remote,local)
144    2: (remote,*)
145    1: (*,local)
146    0: (*,*)
147 
148    We require exact key match i.e. if a key is present in packet
149    it will match only tunnel with the same key; if it is not present,
150    it will match only keyless tunnel.
151 
 152    All keyless packets, if not matched to configured keyless tunnels,
 153    will match the fallback tunnel.
154  */
155 
/* Fold a 32-bit address (or GRE key) into a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases for the four tables, from most to least specific match. */
#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]

/* Protects the tunnel hash chains; readers are the rx and err paths,
 * writers are link/unlink. */
static DEFINE_RWLOCK(ipgre_lock);
164 
165 /* Given src, dst and key, find appropriate for input tunnel. */
166 
/*
 * Find the tunnel device that should receive a packet with the given
 * outer (remote, local) addresses, GRE key and payload protocol.
 *
 * The four tables are scanned from most specific (remote,local) to
 * least specific (wildcard).  Within each table, a tunnel whose link
 * and device type both match is returned immediately (score 0); a
 * partial match is remembered as a candidate (lower score = better:
 * bit 0 set on link mismatch, bit 1 set on device-type mismatch).
 * If no tunnel matches at all, the fallback device is returned when
 * it is up, else NULL.
 *
 * Called under ipgre_lock (read side).
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payload means a gretap (Ethernet) device; anything
	 * else is a plain ARPHRD_IPGRE tunnel. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* cand_score starts above the maximum achievable score (3). */
	int score, cand_score = 4;

	/* Pass 1: exact (remote, local) tunnels. */
	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 2: (remote, *) tunnels. */
	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 3: (*, local) tunnels; a multicast local address may also
	 * match the tunnel's configured destination. */
	for (t = ign->tunnels_l[h1]; t; t = t->next) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 4: wildcard (*, *) tunnels, matched on key only. */
	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	if (ign->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(ign->fb_tunnel_dev);

	return NULL;
}
287 
288 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
289 		struct ip_tunnel_parm *parms)
290 {
291 	__be32 remote = parms->iph.daddr;
292 	__be32 local = parms->iph.saddr;
293 	__be32 key = parms->i_key;
294 	unsigned h = HASH(key);
295 	int prio = 0;
296 
297 	if (local)
298 		prio |= 1;
299 	if (remote && !ipv4_is_multicast(remote)) {
300 		prio |= 2;
301 		h ^= HASH(remote);
302 	}
303 
304 	return &ign->tunnels[prio][h];
305 }
306 
/* Convenience wrapper: hash chain for an existing tunnel's parameters. */
static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
312 
/*
 * Insert tunnel @t at the head of its hash chain.  t->next is set
 * before taking the write lock: @t is not yet reachable by readers,
 * so only the publishing store to *tp needs protection.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}
322 
/*
 * Remove tunnel @t from its hash chain, if present.  The chain walk is
 * done unlocked; only the unlinking store is made under the write lock.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
336 
337 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
338 					   struct ip_tunnel_parm *parms,
339 					   int type)
340 {
341 	__be32 remote = parms->iph.daddr;
342 	__be32 local = parms->iph.saddr;
343 	__be32 key = parms->i_key;
344 	int link = parms->link;
345 	struct ip_tunnel *t, **tp;
346 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
347 
348 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
349 		if (local == t->parms.iph.saddr &&
350 		    remote == t->parms.iph.daddr &&
351 		    key == t->parms.i_key &&
352 		    link == t->parms.link &&
353 		    type == t->dev->type)
354 			break;
355 
356 	return t;
357 }
358 
359 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
360 		struct ip_tunnel_parm *parms, int create)
361 {
362 	struct ip_tunnel *t, *nt;
363 	struct net_device *dev;
364 	char name[IFNAMSIZ];
365 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
366 
367 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
368 	if (t || !create)
369 		return t;
370 
371 	if (parms->name[0])
372 		strlcpy(name, parms->name, IFNAMSIZ);
373 	else
374 		sprintf(name, "gre%%d");
375 
376 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
377 	if (!dev)
378 	  return NULL;
379 
380 	dev_net_set(dev, net);
381 
382 	if (strchr(name, '%')) {
383 		if (dev_alloc_name(dev, name) < 0)
384 			goto failed_free;
385 	}
386 
387 	nt = netdev_priv(dev);
388 	nt->parms = *parms;
389 	dev->rtnl_link_ops = &ipgre_link_ops;
390 
391 	dev->mtu = ipgre_tunnel_bind_dev(dev);
392 
393 	if (register_netdevice(dev) < 0)
394 		goto failed_free;
395 
396 	dev_hold(dev);
397 	ipgre_tunnel_link(ign, nt);
398 	return nt;
399 
400 failed_free:
401 	free_netdev(dev);
402 	return NULL;
403 }
404 
/*
 * netdevice uninit hook: remove the tunnel from the hash table and drop
 * the device reference taken when the tunnel was created.
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
413 
414 
/*
 * ICMP error handler for the GRE protocol: account transmit errors
 * against the tunnel that originated the offending packet.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the returned inner IP header of the
	 * packet we originally sent; the GRE header follows it. */
	struct iphdr *iph = (struct iphdr *)skb->data;
	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		/* Only version 0, no routing; account for optional
		 * checksum and key words before the key itself. */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	/* Lookup with swapped addresses: iph is the header WE sent, so
	 * its daddr is the tunnel's remote and saddr its local. */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* TTL-inherit tunnels expect TTL-exceeded during traceroute. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Rate-limited error counter consumed by the transmit path. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
}
503 
504 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
505 {
506 	if (INET_ECN_is_ce(iph->tos)) {
507 		if (skb->protocol == htons(ETH_P_IP)) {
508 			IP_ECN_set_ce(ip_hdr(skb));
509 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
510 			IP6_ECN_set_ce(ipv6_hdr(skb));
511 		}
512 	}
513 }
514 
515 static inline u8
516 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
517 {
518 	u8 inner = 0;
519 	if (skb->protocol == htons(ETH_P_IP))
520 		inner = old_iph->tos;
521 	else if (skb->protocol == htons(ETH_P_IPV6))
522 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
523 	return INET_ECN_encapsulate(tos, inner);
524 }
525 
/*
 * GRE receive handler: parse the GRE header, find the matching tunnel,
 * verify checksum/sequence options, strip the encapsulation and feed
 * the inner packet back into the stack.  Packets that match no tunnel
 * trigger an ICMP port-unreachable.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;
	unsigned int len;

	/* 16 = base header (4) + worst-case csum/key/seq options (12). */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4x = IPv4 version nibble; anything else means a
			 * WCCPv2 redirect header precedes the IP packet. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip outer IP + GRE headers; outer header becomes the
		 * MAC header for AF_PACKET consumers. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* A non-zero folded csum, or a missing csum on a tunnel that
		 * requires one, is a receive error. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* Drop missing or out-of-order sequence numbers. */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		len = skb->len;

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;
				stats->rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		stats->rx_packets++;
		stats->rx_bytes += len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
665 
/*
 * Transmit path: route to the tunnel endpoint, perform PMTU checks,
 * prepend the outer IPv4 + GRE headers (with optional key/csum/seq)
 * and hand the packet to the IP layer via IPTUNNEL_XMIT().
 */
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;			/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	/* Recursion counter breaks local dead loops; see head-of-file
	 * comment about t->recursion. */
	if (tunnel->recursion++) {
		stats->collisions++;
		goto tx_error;
	}

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* With header_ops the outer header was already built by
	 * ipgre_header() and sits at skb->data. */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel: derive the destination from the route's
		 * gateway (IPv4) or a v4-compatible neighbour (IPv6). */

		if (skb->dst == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb->rtable;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* Low TOS bit set means "inherit TOS from inner IPv4 header". */
	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Routing back to ourselves would be a local dead loop. */
	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		/* DF set and packet too big: bounce frag-needed to sender. */
		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb->dst;

		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Relay recent ICMP errors (recorded by ipgre_err) to the sender,
	 * rate-limited by err_count / IPTUNNEL_ERR_TIMEO. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	/* Reallocate if there is no room for the outer headers or the skb
	 * cannot be written in place. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	/* TTL 0 in the tunnel config means "inherit from payload". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	/* GRE base header follows the outer IP header. */
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Fill optional words back-to-front: seq, then key, then csum. */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}
904 
/*
 * Bind the tunnel to its underlying output device: guess that device
 * from the configured destination (or explicit link), derive the
 * needed headroom and the GRE header length (tunnel->hlen), and
 * return a suitable MTU for the tunnel device (clamped to >= 68).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route: fall back to the explicitly configured link device. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len - addend;

	/* 68 is the minimum IPv4 MTU (RFC 791). */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
964 
/*
 * ioctl entry point for SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL /
 * SIOCDELTUNNEL.  Copies an ip_tunnel_parm to/from userspace; ADD/CHG
 * and DEL require CAP_NET_ADMIN.  Errors: -EFAULT on bad user pointer,
 * -EINVAL on inconsistent parameters, -EPERM on missing capability or
 * an attempt to delete the fallback tunnel, -EEXIST when changing a
 * tunnel to clash with another, -ENOBUFS/-ENOENT on failed add/find.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device, look the tunnel up by the
		 * user-supplied parameters; otherwise report this device. */
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the outer header template and GRE flags. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* Fixed TTL implies DF; see head-of-file comment, item 2. */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Parameters already belong to another tunnel. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags = 0;

				t = netdev_priv(dev);

				/* Changing between broadcast / point-to-point /
				 * unspecified destination is not allowed. */
				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Re-hash the tunnel under its new endpoints/keys. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			/* The fallback tunnel itself must never be deleted. */
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1094 
1095 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1096 {
1097 	struct ip_tunnel *tunnel = netdev_priv(dev);
1098 	if (new_mtu < 68 ||
1099 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1100 		return -EINVAL;
1101 	dev->mtu = new_mtu;
1102 	return 0;
1103 }
1104 
1105 /* Nice toy. Unfortunately, useless in real life :-)
1106    It allows to construct virtual multiprotocol broadcast "LAN"
1107    over the Internet, provided multicast routing is tuned.
1108 
1109 
1110    I have no idea was this bicycle invented before me,
1111    so that I had to set ARPHRD_IPGRE to a random value.
1112    I have an impression, that Cisco could make something similar,
1113    but this feature is apparently missing in IOS<=11.2(8).
1114 
1115    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1116    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1117 
1118    ping -t 255 224.66.66.66
1119 
1120    If nobody answers, mbone does not work.
1121 
1122    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1123    ip addr add 10.66.66.<somewhat>/24 dev Universe
1124    ifconfig Universe up
1125    ifconfig Universe add fe80::<Your_real_addr>/10
1126    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1127    ftp 10.66.66.66
1128    ...
1129    ftp fec0:6666:6666::193.233.7.65
1130    ...
1131 
1132  */
1133 
1134 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1135 			unsigned short type,
1136 			const void *daddr, const void *saddr, unsigned len)
1137 {
1138 	struct ip_tunnel *t = netdev_priv(dev);
1139 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1140 	__be16 *p = (__be16*)(iph+1);
1141 
1142 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1143 	p[0]		= t->parms.o_flags;
1144 	p[1]		= htons(type);
1145 
1146 	/*
1147 	 *	Set the source hardware address.
1148 	 */
1149 
1150 	if (saddr)
1151 		memcpy(&iph->saddr, saddr, 4);
1152 
1153 	if (daddr) {
1154 		memcpy(&iph->daddr, daddr, 4);
1155 		return t->hlen;
1156 	}
1157 	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1158 		return t->hlen;
1159 
1160 	return -t->hlen;
1161 }
1162 
1163 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1164 {
1165 	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1166 	memcpy(haddr, &iph->saddr, 4);
1167 	return 4;
1168 }
1169 
/* Link-layer header ops for broadcast-capable / NOARP GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1174 
1175 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast GRE tunnels: when the remote endpoint is an
 * IPv4 multicast group, join that group on the device the route to the
 * group resolves to, and remember that device's ifindex in t->mlink so
 * ipgre_close() can leave the group again.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		/* Route-lookup key built from the tunnel parameters. */
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		/* From here on, "dev" is the route's output device. */
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1199 
1200 static int ipgre_close(struct net_device *dev)
1201 {
1202 	struct ip_tunnel *t = netdev_priv(dev);
1203 
1204 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1205 		struct in_device *in_dev;
1206 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1207 		if (in_dev) {
1208 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1209 			in_dev_put(in_dev);
1210 		}
1211 	}
1212 	return 0;
1213 }
1214 
1215 #endif
1216 
/* Device operations for plain GRE tunnel devices (ARPHRD_IPGRE). */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1228 
/*
 * Setup callback for plain GRE devices (also the rtnl_link_ops .setup
 * for the "gre" kind).
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= free_netdev;

	dev->type		= ARPHRD_IPGRE;
	/* Room for outer IP header plus the 4-byte basic GRE header. */
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* addresses are outer IPv4 endpoints */
	/* Tunnel devices may not be moved between network namespaces. */
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1242 
/*
 * ndo_init for plain GRE devices: mirror the tunnel endpoints into the
 * device's dev_addr/broadcast and choose header_ops: installed for
 * multicast (broadcast-LAN mode, which requires a fixed source) and for
 * the "no fixed destination" case; plain point-to-point gets none.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Broadcast mode needs an explicit local address. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1271 
/*
 * Initialize the per-namespace fallback device "gre0" and hook it into
 * the wildcard slot of the tunnel registry.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	/* Outer IP header plus the 4-byte basic GRE header. */
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	/* The registry slot holds a reference on the device. */
	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
}
1289 
1290 
/* Receive hooks for IPPROTO_GRE; netns_ok: usable in all namespaces. */
static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1296 
1297 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1298 {
1299 	int prio;
1300 
1301 	for (prio = 0; prio < 4; prio++) {
1302 		int h;
1303 		for (h = 0; h < HASH_SIZE; h++) {
1304 			struct ip_tunnel *t;
1305 			while ((t = ign->tunnels[prio][h]) != NULL)
1306 				unregister_netdevice(t->dev);
1307 		}
1308 	}
1309 }
1310 
/*
 * Per-namespace init: allocate the ipgre_net state, attach it to the
 * namespace's generic pointer array, create and register the fallback
 * "gre0" device.  Errors unwind in reverse order through the gotos.
 */
static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing to undo for net_assign_generic */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}
1350 
1351 static void ipgre_exit_net(struct net *net)
1352 {
1353 	struct ipgre_net *ign;
1354 
1355 	ign = net_generic(net, ipgre_net_id);
1356 	rtnl_lock();
1357 	ipgre_destroy_tunnels(ign);
1358 	rtnl_unlock();
1359 	kfree(ign);
1360 }
1361 
/* Per-network-namespace lifecycle hooks. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1366 
1367 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1368 {
1369 	__be16 flags;
1370 
1371 	if (!data)
1372 		return 0;
1373 
1374 	flags = 0;
1375 	if (data[IFLA_GRE_IFLAGS])
1376 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1377 	if (data[IFLA_GRE_OFLAGS])
1378 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1379 	if (flags & (GRE_VERSION|GRE_ROUTING))
1380 		return -EINVAL;
1381 
1382 	return 0;
1383 }
1384 
1385 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1386 {
1387 	__be32 daddr;
1388 
1389 	if (tb[IFLA_ADDRESS]) {
1390 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1391 			return -EINVAL;
1392 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1393 			return -EADDRNOTAVAIL;
1394 	}
1395 
1396 	if (!data)
1397 		goto out;
1398 
1399 	if (data[IFLA_GRE_REMOTE]) {
1400 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1401 		if (!daddr)
1402 			return -EINVAL;
1403 	}
1404 
1405 out:
1406 	return ipgre_tunnel_validate(tb, data);
1407 }
1408 
/*
 * Translate IFLA_GRE_* netlink attributes into an ip_tunnel_parm.
 * Missing attributes keep their zeroed defaults; path-MTU discovery
 * (the DF bit) is enabled unless explicitly turned off.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* Set DF when PMTUDISC is absent or non-zero. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1449 
1450 static int ipgre_tap_init(struct net_device *dev)
1451 {
1452 	struct ip_tunnel *tunnel;
1453 
1454 	tunnel = netdev_priv(dev);
1455 
1456 	tunnel->dev = dev;
1457 	strcpy(tunnel->parms.name, dev->name);
1458 
1459 	ipgre_tunnel_bind_dev(dev);
1460 
1461 	return 0;
1462 }
1463 
/* Device operations for Ethernet-like (gretap) tunnel devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
};
1472 
1473 static void ipgre_tap_setup(struct net_device *dev)
1474 {
1475 
1476 	ether_setup(dev);
1477 
1478 	dev->netdev_ops		= &ipgre_netdev_ops;
1479 	dev->destructor 	= free_netdev;
1480 
1481 	dev->iflink		= 0;
1482 	dev->features		|= NETIF_F_NETNS_LOCAL;
1483 }
1484 
/*
 * rtnl_link_ops .newlink for "gre"/"gretap": parse attributes, refuse
 * an exact duplicate, register the device and link it into the hash.
 */
static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	/* gretap devices get a random MAC unless one was supplied. */
	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	/* Honor an explicit IFLA_MTU over the derived value. */
	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* The tunnel registry holds a reference on the device. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1517 
/*
 * rtnl_link_ops .changelink: reconfigure an existing tunnel.  The
 * fallback device cannot be changed.  If the new parameters already
 * belong to a different tunnel, fail with -EEXIST; otherwise re-key
 * this device, relinking it into the hash under its new addresses.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		unsigned nflags = 0;

		t = nt;

		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		/* The broadcast/point-to-point nature cannot change. */
		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))
			return -EINVAL;

		/* Re-hash under the new endpoints and input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		/* A new underlay link may imply a new MTU. */
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1577 
1578 static size_t ipgre_get_size(const struct net_device *dev)
1579 {
1580 	return
1581 		/* IFLA_GRE_LINK */
1582 		nla_total_size(4) +
1583 		/* IFLA_GRE_IFLAGS */
1584 		nla_total_size(2) +
1585 		/* IFLA_GRE_OFLAGS */
1586 		nla_total_size(2) +
1587 		/* IFLA_GRE_IKEY */
1588 		nla_total_size(4) +
1589 		/* IFLA_GRE_OKEY */
1590 		nla_total_size(4) +
1591 		/* IFLA_GRE_LOCAL */
1592 		nla_total_size(4) +
1593 		/* IFLA_GRE_REMOTE */
1594 		nla_total_size(4) +
1595 		/* IFLA_GRE_TTL */
1596 		nla_total_size(1) +
1597 		/* IFLA_GRE_TOS */
1598 		nla_total_size(1) +
1599 		/* IFLA_GRE_PMTUDISC */
1600 		nla_total_size(1) +
1601 		0;
1602 }
1603 
/*
 * rtnl_link_ops .fill_info: dump the tunnel configuration.  Each
 * NLA_PUT_* macro jumps to nla_put_failure when the skb lacks room.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* Report PMTU discovery as a boolean derived from the DF bit. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1625 
/* Netlink validation policy for the IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1638 
/* rtnetlink registration for plain "gre" devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1651 
/* rtnetlink registration for Ethernet-like "gretap" devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1664 
1665 /*
1666  *	And now the modules code and kernel interface.
1667  */
1668 
1669 static int __init ipgre_init(void)
1670 {
1671 	int err;
1672 
1673 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1674 
1675 	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1676 		printk(KERN_INFO "ipgre init: can't add protocol\n");
1677 		return -EAGAIN;
1678 	}
1679 
1680 	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1681 	if (err < 0)
1682 		goto gen_device_failed;
1683 
1684 	err = rtnl_link_register(&ipgre_link_ops);
1685 	if (err < 0)
1686 		goto rtnl_link_failed;
1687 
1688 	err = rtnl_link_register(&ipgre_tap_ops);
1689 	if (err < 0)
1690 		goto tap_ops_failed;
1691 
1692 out:
1693 	return err;
1694 
1695 tap_ops_failed:
1696 	rtnl_link_unregister(&ipgre_link_ops);
1697 rtnl_link_failed:
1698 	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1699 gen_device_failed:
1700 	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1701 	goto out;
1702 }
1703 
1704 static void __exit ipgre_fini(void)
1705 {
1706 	rtnl_link_unregister(&ipgre_tap_ops);
1707 	rtnl_link_unregister(&ipgre_link_ops);
1708 	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1709 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1710 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
1711 }
1712 
/* Module plumbing; the aliases let "ip link add type gre|gretap" autoload. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
1718