xref: /openbmc/linux/net/ipv4/ip_gre.c (revision 06d352f2)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32 
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46 
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 /*
54    Problems & solutions
55    --------------------
56 
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    by infinite looping in net_bh.
61 
62    We cannot track such dead loops during route installation;
63    it is an infeasible task. The most general solution would be
64    to keep an skb->encapsulation counter (a sort of local ttl)
65    and silently drop the packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skbs, even if no tunneling is used.
68 
69    Current solution: the HARD_TX_LOCK lock breaks dead loops.
70 
71 
72 
73    2. Networking dead loops would not kill routers, but would really
74    kill the network. The IP hop limit plays the role of "t->recursion" in this case,
75    if we copy it from the packet being encapsulated to the upper header.
76    It is a very good solution, but it introduces two problems:
77 
78    - Routing protocols using packets with ttl=1 (OSPF, RIP2)
79      do not work over tunnels.
80    - traceroute does not work. I planned to relay ICMP from the tunnel,
81      so that this problem would be solved and traceroute output
82      would be even more informative. This idea turned out to be wrong:
83      only Linux complies with rfc1812 now (yes, guys, Linux is the only
84      true router now :-)); all other routers (at least in my neighbourhood)
85      return only 8 bytes of payload. That is the end of it.
86 
87    Hence, if we want OSPF to work or traceroute to say something reasonable,
88    we should search for another solution.
89 
90    One of them is to parse the packet, trying to detect an inner encapsulation
91    made by our node. This is difficult or even impossible, especially
92    taking fragmentation into account. In short, it is no solution at all.
93 
94    Current solution: the solution was UNEXPECTEDLY SIMPLE.
95    We force the DF flag on tunnels with a preconfigured hop limit,
96    that is ALL. :-) Well, it does not remove the problem completely,
97    but exponential growth of network traffic is changed to linear
98    (branches that exceed the pmtu are pruned) and the tunnel mtu
99    quickly degrades to a value <68, where looping stops.
100    Yes, it is not good if there is a router in the loop
101    which does not force DF, even when the packets being encapsulated have DF set.
102    But it is not our problem! Nobody can accuse us; we did
103    all that we could. Even if it was your gated that injected
104    the fatal route into the network, even if it was you who configured
105    the fatal static route: you are innocent. :-)
106 
107 
108 
109    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110    practically identical code. It would be good to glue them
111    together, but it is not obvious how to make them modular.
112    sit is an integral part of IPv6; ipip and gre are naturally modular.
113    We could extract the common parts (hash table, ioctl etc.)
114    into a separate module (ip_tunnel.c).
115 
116    Alexey Kuznetsov.
117  */
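
/*
 * A worked example of the DF trick (illustrative numbers): every
 * nesting of an IPv4 GRE encapsulation costs at least 24 bytes
 * (20-byte IP header + 4-byte GRE header). With DF forced, a looping
 * tunnel over a 1500-byte path sees its usable mtu shrink as
 * 1500 -> 1476 -> 1452 -> ..., and after roughly 60 nestings it falls
 * below 68, the minimal IPv4 mtu, where ipgre_tunnel_bind_dev() clamps
 * it and the loop starves instead of flooding the wire.
 */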
118 
119 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122 static int ipgre_tunnel_bind_dev(struct net_device *dev);
123 
124 /* Fallback tunnel: no source, no destination, no key, no options */
125 
126 #define HASH_SIZE  16
127 
128 static int ipgre_net_id __read_mostly;
129 struct ipgre_net {
130 	struct ip_tunnel *tunnels[4][HASH_SIZE];
131 
132 	struct net_device *fb_tunnel_dev;
133 };
134 
135 /* Tunnel hash table */
136 
137 /*
138    4 hash tables:
139 
140    3: (remote,local)
141    2: (remote,*)
142    1: (*,local)
143    0: (*,*)
144 
145    We require an exact key match, i.e. if a key is present in the packet
146    it will match only a tunnel with the same key; if it is not present,
147    it will match only a keyless tunnel.
148 
149    All keyless packets, if not matched against a configured keyless tunnel,
150    will match the fallback tunnel.
151  */
152 
153 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
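
/*
 * HASH() folds the two low-order nibbles of the raw (byte-order
 * dependent) 32-bit value into a 4-bit bucket index. Illustrative
 * example: for 0x1234abcd the low nibble of 0x1234abcd ^ 0x01234abc
 * is 0xd ^ 0xc = 0x1, so it lands in bucket 1; a zero key hashes to
 * bucket 0.
 */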
154 
155 #define tunnels_r_l	tunnels[3]
156 #define tunnels_r	tunnels[2]
157 #define tunnels_l	tunnels[1]
158 #define tunnels_wc	tunnels[0]
159 /*
160  * Locking: hash tables are protected by RCU for readers and by the ipgre_lock spinlock for writers (link/unlink)
161  */
162 static DEFINE_SPINLOCK(ipgre_lock);
163 
164 #define for_each_ip_tunnel_rcu(start) \
165 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
166 
167 /* Given src, dst and key, find the appropriate tunnel for the incoming packet. */
168 
169 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
170 					      __be32 remote, __be32 local,
171 					      __be32 key, __be16 gre_proto)
172 {
173 	struct net *net = dev_net(dev);
174 	int link = dev->ifindex;
175 	unsigned h0 = HASH(remote);
176 	unsigned h1 = HASH(key);
177 	struct ip_tunnel *t, *cand = NULL;
178 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
179 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
180 		       ARPHRD_ETHER : ARPHRD_IPGRE;
181 	int score, cand_score = 4;
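
	/*
	 * Candidates are scored by specificity: an exact match on both
	 * the bound link and the device type scores 0 and is returned
	 * immediately; a link mismatch adds 1, a type mismatch adds 2,
	 * and the lowest-scoring candidate (anything below the initial
	 * cand_score of 4) wins after all four tables are scanned.
	 */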
182 
183 	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
184 		if (local != t->parms.iph.saddr ||
185 		    remote != t->parms.iph.daddr ||
186 		    key != t->parms.i_key ||
187 		    !(t->dev->flags & IFF_UP))
188 			continue;
189 
190 		if (t->dev->type != ARPHRD_IPGRE &&
191 		    t->dev->type != dev_type)
192 			continue;
193 
194 		score = 0;
195 		if (t->parms.link != link)
196 			score |= 1;
197 		if (t->dev->type != dev_type)
198 			score |= 2;
199 		if (score == 0)
200 			return t;
201 
202 		if (score < cand_score) {
203 			cand = t;
204 			cand_score = score;
205 		}
206 	}
207 
208 	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
209 		if (remote != t->parms.iph.daddr ||
210 		    key != t->parms.i_key ||
211 		    !(t->dev->flags & IFF_UP))
212 			continue;
213 
214 		if (t->dev->type != ARPHRD_IPGRE &&
215 		    t->dev->type != dev_type)
216 			continue;
217 
218 		score = 0;
219 		if (t->parms.link != link)
220 			score |= 1;
221 		if (t->dev->type != dev_type)
222 			score |= 2;
223 		if (score == 0)
224 			return t;
225 
226 		if (score < cand_score) {
227 			cand = t;
228 			cand_score = score;
229 		}
230 	}
231 
232 	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
233 		if ((local != t->parms.iph.saddr &&
234 		     (local != t->parms.iph.daddr ||
235 		      !ipv4_is_multicast(local))) ||
236 		    key != t->parms.i_key ||
237 		    !(t->dev->flags & IFF_UP))
238 			continue;
239 
240 		if (t->dev->type != ARPHRD_IPGRE &&
241 		    t->dev->type != dev_type)
242 			continue;
243 
244 		score = 0;
245 		if (t->parms.link != link)
246 			score |= 1;
247 		if (t->dev->type != dev_type)
248 			score |= 2;
249 		if (score == 0)
250 			return t;
251 
252 		if (score < cand_score) {
253 			cand = t;
254 			cand_score = score;
255 		}
256 	}
257 
258 	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
259 		if (t->parms.i_key != key ||
260 		    !(t->dev->flags & IFF_UP))
261 			continue;
262 
263 		if (t->dev->type != ARPHRD_IPGRE &&
264 		    t->dev->type != dev_type)
265 			continue;
266 
267 		score = 0;
268 		if (t->parms.link != link)
269 			score |= 1;
270 		if (t->dev->type != dev_type)
271 			score |= 2;
272 		if (score == 0)
273 			return t;
274 
275 		if (score < cand_score) {
276 			cand = t;
277 			cand_score = score;
278 		}
279 	}
280 
281 	if (cand != NULL)
282 		return cand;
283 
284 	dev = ign->fb_tunnel_dev;
285 	if (dev->flags & IFF_UP)
286 		return netdev_priv(dev);
287 
288 	return NULL;
289 }
290 
291 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
292 		struct ip_tunnel_parm *parms)
293 {
294 	__be32 remote = parms->iph.daddr;
295 	__be32 local = parms->iph.saddr;
296 	__be32 key = parms->i_key;
297 	unsigned h = HASH(key);
298 	int prio = 0;
299 
300 	if (local)
301 		prio |= 1;
302 	if (remote && !ipv4_is_multicast(remote)) {
303 		prio |= 2;
304 		h ^= HASH(remote);
305 	}
306 
307 	return &ign->tunnels[prio][h];
308 }
309 
310 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
311 		struct ip_tunnel *t)
312 {
313 	return __ipgre_bucket(ign, &t->parms);
314 }
315 
316 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
317 {
318 	struct ip_tunnel **tp = ipgre_bucket(ign, t);
319 
320 	spin_lock_bh(&ipgre_lock);
321 	t->next = *tp;
322 	rcu_assign_pointer(*tp, t);
323 	spin_unlock_bh(&ipgre_lock);
324 }
325 
326 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
327 {
328 	struct ip_tunnel **tp;
329 
330 	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
331 		if (t == *tp) {
332 			spin_lock_bh(&ipgre_lock);
333 			*tp = t->next;
334 			spin_unlock_bh(&ipgre_lock);
335 			break;
336 		}
337 	}
338 }
339 
340 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
341 					   struct ip_tunnel_parm *parms,
342 					   int type)
343 {
344 	__be32 remote = parms->iph.daddr;
345 	__be32 local = parms->iph.saddr;
346 	__be32 key = parms->i_key;
347 	int link = parms->link;
348 	struct ip_tunnel *t, **tp;
349 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
350 
351 	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
352 		if (local == t->parms.iph.saddr &&
353 		    remote == t->parms.iph.daddr &&
354 		    key == t->parms.i_key &&
355 		    link == t->parms.link &&
356 		    type == t->dev->type)
357 			break;
358 
359 	return t;
360 }
361 
362 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
363 		struct ip_tunnel_parm *parms, int create)
364 {
365 	struct ip_tunnel *t, *nt;
366 	struct net_device *dev;
367 	char name[IFNAMSIZ];
368 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
369 
370 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
371 	if (t || !create)
372 		return t;
373 
374 	if (parms->name[0])
375 		strlcpy(name, parms->name, IFNAMSIZ);
376 	else
377 		sprintf(name, "gre%%d");
378 
379 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
380 	if (!dev)
381 		return NULL;
382 
383 	dev_net_set(dev, net);
384 
385 	if (strchr(name, '%')) {
386 		if (dev_alloc_name(dev, name) < 0)
387 			goto failed_free;
388 	}
389 
390 	nt = netdev_priv(dev);
391 	nt->parms = *parms;
392 	dev->rtnl_link_ops = &ipgre_link_ops;
393 
394 	dev->mtu = ipgre_tunnel_bind_dev(dev);
395 
396 	if (register_netdevice(dev) < 0)
397 		goto failed_free;
398 
399 	dev_hold(dev);
400 	ipgre_tunnel_link(ign, nt);
401 	return nt;
402 
403 failed_free:
404 	free_netdev(dev);
405 	return NULL;
406 }
407 
408 static void ipgre_tunnel_uninit(struct net_device *dev)
409 {
410 	struct net *net = dev_net(dev);
411 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
412 
413 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
414 	dev_put(dev);
415 }
416 
417 
418 static void ipgre_err(struct sk_buff *skb, u32 info)
419 {
420 
421 /* All the routers (except for Linux) return only
422    8 bytes of packet payload. This means that precise relaying of
423    ICMP in the real Internet is absolutely infeasible.
424 
425    Moreover, Cisco "wise men" put the GRE key in the third word
426    of the GRE header. That makes it impossible to maintain even soft
427    state for keyed GRE tunnels with checksumming enabled. Tell them "thank you".
428 
429    Well, I wonder: rfc1812 was written by a Cisco employee;
430    why the hell do these idiots break standards established
431    by themselves???
432  */
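
/* For reference, the GRE header parsed below (RFC 2784/2890 layout):
   bytes 0-1 are flags+version, bytes 2-3 the protocol type, followed
   by optional 4-byte fields in order: checksum+reserved (GRE_CSUM),
   key (GRE_KEY), sequence number (GRE_SEQ). With GRE_CSUM and GRE_KEY
   both set, the key sits at offset 8 (the third word), so an ICMP
   error quoting only 8 bytes of payload cannot identify a keyed
   tunnel and is dropped by the skb_headlen() check below.
 */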
433 
434 	struct iphdr *iph = (struct iphdr *)skb->data;
435 	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
436 	int grehlen = (iph->ihl<<2) + 4;
437 	const int type = icmp_hdr(skb)->type;
438 	const int code = icmp_hdr(skb)->code;
439 	struct ip_tunnel *t;
440 	__be16 flags;
441 
442 	flags = p[0];
443 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
444 		if (flags&(GRE_VERSION|GRE_ROUTING))
445 			return;
446 		if (flags&GRE_KEY) {
447 			grehlen += 4;
448 			if (flags&GRE_CSUM)
449 				grehlen += 4;
450 		}
451 	}
452 
453 	/* If only 8 bytes were returned, keyed packets are dropped here */
454 	if (skb_headlen(skb) < grehlen)
455 		return;
456 
457 	switch (type) {
458 	default:
459 	case ICMP_PARAMETERPROB:
460 		return;
461 
462 	case ICMP_DEST_UNREACH:
463 		switch (code) {
464 		case ICMP_SR_FAILED:
465 		case ICMP_PORT_UNREACH:
466 			/* Impossible event. */
467 			return;
468 		case ICMP_FRAG_NEEDED:
469 			/* Soft state for pmtu is maintained by IP core. */
470 			return;
471 		default:
472 			/* All others are translated to HOST_UNREACH.
473 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
474 			   I believe they are just ether pollution. --ANK
475 			 */
476 			break;
477 		}
478 		break;
479 	case ICMP_TIME_EXCEEDED:
480 		if (code != ICMP_EXC_TTL)
481 			return;
482 		break;
483 	}
484 
485 	rcu_read_lock();
486 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
487 				flags & GRE_KEY ?
488 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
489 				p[1]);
490 	if (t == NULL || t->parms.iph.daddr == 0 ||
491 	    ipv4_is_multicast(t->parms.iph.daddr))
492 		goto out;
493 
494 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
495 		goto out;
496 
497 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
498 		t->err_count++;
499 	else
500 		t->err_count = 1;
501 	t->err_time = jiffies;
502 out:
503 	rcu_read_unlock();
504 	return;
505 }
506 
507 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
508 {
509 	if (INET_ECN_is_ce(iph->tos)) {
510 		if (skb->protocol == htons(ETH_P_IP)) {
511 			IP_ECN_set_ce(ip_hdr(skb));
512 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
513 			IP6_ECN_set_ce(ipv6_hdr(skb));
514 		}
515 	}
516 }
517 
518 static inline u8
519 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
520 {
521 	u8 inner = 0;
522 	if (skb->protocol == htons(ETH_P_IP))
523 		inner = old_iph->tos;
524 	else if (skb->protocol == htons(ETH_P_IPV6))
525 		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
526 	return INET_ECN_encapsulate(tos, inner);
527 }
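
/*
 * Together these implement ECN propagation for tunnels: on decapsulation
 * a CE mark on the outer header is copied onto the inner packet, and on
 * encapsulation the outer tos carries an ECN codepoint derived from the
 * inner header via INET_ECN_encapsulate(), so congestion marks applied
 * along the tunnel path are not lost.
 */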
528 
529 static int ipgre_rcv(struct sk_buff *skb)
530 {
531 	struct iphdr *iph;
532 	u8     *h;
533 	__be16    flags;
534 	__sum16   csum = 0;
535 	__be32 key = 0;
536 	u32    seqno = 0;
537 	struct ip_tunnel *tunnel;
538 	int    offset = 4;
539 	__be16 gre_proto;
540 	unsigned int len;
541 
542 	if (!pskb_may_pull(skb, 16))
543 		goto drop_nolock;
544 
545 	iph = ip_hdr(skb);
546 	h = skb->data;
547 	flags = *(__be16*)h;
548 
549 	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
550 		/* - Version must be 0.
551 		   - We do not support routing headers.
552 		 */
553 		if (flags&(GRE_VERSION|GRE_ROUTING))
554 			goto drop_nolock;
555 
556 		if (flags&GRE_CSUM) {
557 			switch (skb->ip_summed) {
558 			case CHECKSUM_COMPLETE:
559 				csum = csum_fold(skb->csum);
560 				if (!csum)
561 					break;
562 				/* fall through */
563 			case CHECKSUM_NONE:
564 				skb->csum = 0;
565 				csum = __skb_checksum_complete(skb);
566 				skb->ip_summed = CHECKSUM_COMPLETE;
567 			}
568 			offset += 4;
569 		}
570 		if (flags&GRE_KEY) {
571 			key = *(__be32*)(h + offset);
572 			offset += 4;
573 		}
574 		if (flags&GRE_SEQ) {
575 			seqno = ntohl(*(__be32*)(h + offset));
576 			offset += 4;
577 		}
578 	}
579 
580 	gre_proto = *(__be16 *)(h + 2);
581 
582 	rcu_read_lock();
583 	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
584 					  iph->saddr, iph->daddr, key,
585 					  gre_proto))) {
586 		struct net_device_stats *stats = &tunnel->dev->stats;
587 
588 		secpath_reset(skb);
589 
590 		skb->protocol = gre_proto;
591 		/* WCCP version 1 and 2 protocol decoding.
592 		 * - Change protocol to IP
593 		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
594 		 */
595 		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
596 			skb->protocol = htons(ETH_P_IP);
597 			if ((*(h + offset) & 0xF0) != 0x40)
598 				offset += 4;
599 		}
600 
601 		skb->mac_header = skb->network_header;
602 		__pskb_pull(skb, offset);
603 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
604 		skb->pkt_type = PACKET_HOST;
605 #ifdef CONFIG_NET_IPGRE_BROADCAST
606 		if (ipv4_is_multicast(iph->daddr)) {
607 			/* Looped back packet, drop it! */
608 			if (skb_rtable(skb)->fl.iif == 0)
609 				goto drop;
610 			stats->multicast++;
611 			skb->pkt_type = PACKET_BROADCAST;
612 		}
613 #endif
614 
615 		if (((flags&GRE_CSUM) && csum) ||
616 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
617 			stats->rx_crc_errors++;
618 			stats->rx_errors++;
619 			goto drop;
620 		}
621 		if (tunnel->parms.i_flags&GRE_SEQ) {
622 			if (!(flags&GRE_SEQ) ||
623 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
624 				stats->rx_fifo_errors++;
625 				stats->rx_errors++;
626 				goto drop;
627 			}
628 			tunnel->i_seqno = seqno + 1;
629 		}
630 
631 		len = skb->len;
632 
633 		/* Warning: All skb pointers will be invalidated! */
634 		if (tunnel->dev->type == ARPHRD_ETHER) {
635 			if (!pskb_may_pull(skb, ETH_HLEN)) {
636 				stats->rx_length_errors++;
637 				stats->rx_errors++;
638 				goto drop;
639 			}
640 
641 			iph = ip_hdr(skb);
642 			skb->protocol = eth_type_trans(skb, tunnel->dev);
643 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
644 		}
645 
646 		stats->rx_packets++;
647 		stats->rx_bytes += len;
648 		skb->dev = tunnel->dev;
649 		skb_dst_drop(skb);
650 		nf_reset(skb);
651 
652 		skb_reset_network_header(skb);
653 		ipgre_ecn_decapsulate(iph, skb);
654 
655 		netif_rx(skb);
656 		rcu_read_unlock();
657 		return 0;
658 	}
659 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
660 
661 drop:
662 	rcu_read_unlock();
663 drop_nolock:
664 	kfree_skb(skb);
665 	return 0;
666 }
667 
668 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
669 {
670 	struct ip_tunnel *tunnel = netdev_priv(dev);
671 	struct net_device_stats *stats = &dev->stats;
672 	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
673 	struct iphdr  *old_iph = ip_hdr(skb);
674 	struct iphdr  *tiph;
675 	u8     tos;
676 	__be16 df;
677 	struct rtable *rt;     			/* Route to the other host */
678 	struct net_device *tdev;			/* Device to other host */
679 	struct iphdr  *iph;			/* Our new IP header */
680 	unsigned int max_headroom;		/* The extra header space needed */
681 	int    gre_hlen;
682 	__be32 dst;
683 	int    mtu;
684 
685 	if (dev->type == ARPHRD_ETHER)
686 		IPCB(skb)->flags = 0;
687 
688 	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
689 		gre_hlen = 0;
690 		tiph = (struct iphdr *)skb->data;
691 	} else {
692 		gre_hlen = tunnel->hlen;
693 		tiph = &tunnel->parms.iph;
694 	}
695 
696 	if ((dst = tiph->daddr) == 0) {
697 		/* NBMA tunnel */
698 
699 		if (skb_dst(skb) == NULL) {
700 			stats->tx_fifo_errors++;
701 			goto tx_error;
702 		}
703 
704 		if (skb->protocol == htons(ETH_P_IP)) {
705 			rt = skb_rtable(skb);
706 			if ((dst = rt->rt_gateway) == 0)
707 				goto tx_error_icmp;
708 		}
709 #ifdef CONFIG_IPV6
710 		else if (skb->protocol == htons(ETH_P_IPV6)) {
711 			struct in6_addr *addr6;
712 			int addr_type;
713 			struct neighbour *neigh = skb_dst(skb)->neighbour;
714 
715 			if (neigh == NULL)
716 				goto tx_error;
717 
718 			addr6 = (struct in6_addr *)&neigh->primary_key;
719 			addr_type = ipv6_addr_type(addr6);
720 
721 			if (addr_type == IPV6_ADDR_ANY) {
722 				addr6 = &ipv6_hdr(skb)->daddr;
723 				addr_type = ipv6_addr_type(addr6);
724 			}
725 
726 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
727 				goto tx_error_icmp;
728 
729 			dst = addr6->s6_addr32[3];
730 		}
731 #endif
732 		else
733 			goto tx_error;
734 	}
735 
736 	tos = tiph->tos;
737 	if (tos == 1) {
738 		tos = 0;
739 		if (skb->protocol == htons(ETH_P_IP))
740 			tos = old_iph->tos;
741 	}
742 
743 	{
744 		struct flowi fl = { .oif = tunnel->parms.link,
745 				    .nl_u = { .ip4_u =
746 					      { .daddr = dst,
747 						.saddr = tiph->saddr,
748 						.tos = RT_TOS(tos) } },
749 				    .proto = IPPROTO_GRE };
750 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
751 			stats->tx_carrier_errors++;
752 			goto tx_error;
753 		}
754 	}
755 	tdev = rt->u.dst.dev;
756 
757 	if (tdev == dev) {
758 		ip_rt_put(rt);
759 		stats->collisions++;
760 		goto tx_error;
761 	}
762 
763 	df = tiph->frag_off;
764 	if (df)
765 		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
766 	else
767 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
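
	/* Illustrative arithmetic: with a 1500-byte outer path mtu and a
	 * plain keyless tunnel (hlen = 20-byte IP + 4-byte GRE = 24),
	 * inner packets larger than 1476 bytes with DF set will trigger
	 * the FRAG_NEEDED / PKT_TOOBIG errors below.
	 */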
768 
769 	if (skb_dst(skb))
770 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
771 
772 	if (skb->protocol == htons(ETH_P_IP)) {
773 		df |= (old_iph->frag_off&htons(IP_DF));
774 
775 		if ((old_iph->frag_off&htons(IP_DF)) &&
776 		    mtu < ntohs(old_iph->tot_len)) {
777 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
778 			ip_rt_put(rt);
779 			goto tx_error;
780 		}
781 	}
782 #ifdef CONFIG_IPV6
783 	else if (skb->protocol == htons(ETH_P_IPV6)) {
784 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
785 
786 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
787 			if ((tunnel->parms.iph.daddr &&
788 			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
789 			    rt6->rt6i_dst.plen == 128) {
790 				rt6->rt6i_flags |= RTF_MODIFIED;
791 				skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
792 			}
793 		}
794 
795 		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
796 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
797 			ip_rt_put(rt);
798 			goto tx_error;
799 		}
800 	}
801 #endif
802 
803 	if (tunnel->err_count > 0) {
804 		if (time_before(jiffies,
805 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
806 			tunnel->err_count--;
807 
808 			dst_link_failure(skb);
809 		} else
810 			tunnel->err_count = 0;
811 	}
812 
813 	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
814 
815 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
816 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
817 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
818 		if (!new_skb) {
819 			ip_rt_put(rt);
820 			txq->tx_dropped++;
821 			dev_kfree_skb(skb);
822 			return NETDEV_TX_OK;
823 		}
824 		if (skb->sk)
825 			skb_set_owner_w(new_skb, skb->sk);
826 		dev_kfree_skb(skb);
827 		skb = new_skb;
828 		old_iph = ip_hdr(skb);
829 	}
830 
831 	skb_reset_transport_header(skb);
832 	skb_push(skb, gre_hlen);
833 	skb_reset_network_header(skb);
834 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
835 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
836 			      IPSKB_REROUTED);
837 	skb_dst_drop(skb);
838 	skb_dst_set(skb, &rt->u.dst);
839 
840 	/*
841 	 *	Push down and install the IPIP header.
842 	 */
843 
844 	iph 			=	ip_hdr(skb);
845 	iph->version		=	4;
846 	iph->ihl		=	sizeof(struct iphdr) >> 2;
847 	iph->frag_off		=	df;
848 	iph->protocol		=	IPPROTO_GRE;
849 	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
850 	iph->daddr		=	rt->rt_dst;
851 	iph->saddr		=	rt->rt_src;
852 
853 	if ((iph->ttl = tiph->ttl) == 0) {
854 		if (skb->protocol == htons(ETH_P_IP))
855 			iph->ttl = old_iph->ttl;
856 #ifdef CONFIG_IPV6
857 		else if (skb->protocol == htons(ETH_P_IPV6))
858 			iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
859 #endif
860 		else
861 			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
862 	}
863 
864 	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
865 	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
866 				   htons(ETH_P_TEB) : skb->protocol;
867 
868 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
869 		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
870 
871 		if (tunnel->parms.o_flags&GRE_SEQ) {
872 			++tunnel->o_seqno;
873 			*ptr = htonl(tunnel->o_seqno);
874 			ptr--;
875 		}
876 		if (tunnel->parms.o_flags&GRE_KEY) {
877 			*ptr = tunnel->parms.o_key;
878 			ptr--;
879 		}
880 		if (tunnel->parms.o_flags&GRE_CSUM) {
881 			*ptr = 0;
882 			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
883 		}
884 	}
885 
886 	nf_reset(skb);
887 
888 	IPTUNNEL_XMIT();
889 	return NETDEV_TX_OK;
890 
891 tx_error_icmp:
892 	dst_link_failure(skb);
893 
894 tx_error:
895 	stats->tx_errors++;
896 	dev_kfree_skb(skb);
897 	return NETDEV_TX_OK;
898 }
899 
900 static int ipgre_tunnel_bind_dev(struct net_device *dev)
901 {
902 	struct net_device *tdev = NULL;
903 	struct ip_tunnel *tunnel;
904 	struct iphdr *iph;
905 	int hlen = LL_MAX_HEADER;
906 	int mtu = ETH_DATA_LEN;
907 	int addend = sizeof(struct iphdr) + 4;
908 
909 	tunnel = netdev_priv(dev);
910 	iph = &tunnel->parms.iph;
911 
912 	/* Guess the output device in order to choose a reasonable mtu and needed_headroom */
913 
914 	if (iph->daddr) {
915 		struct flowi fl = { .oif = tunnel->parms.link,
916 				    .nl_u = { .ip4_u =
917 					      { .daddr = iph->daddr,
918 						.saddr = iph->saddr,
919 						.tos = RT_TOS(iph->tos) } },
920 				    .proto = IPPROTO_GRE };
921 		struct rtable *rt;
922 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
923 			tdev = rt->u.dst.dev;
924 			ip_rt_put(rt);
925 		}
926 
927 		if (dev->type != ARPHRD_ETHER)
928 			dev->flags |= IFF_POINTOPOINT;
929 	}
930 
931 	if (!tdev && tunnel->parms.link)
932 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
933 
934 	if (tdev) {
935 		hlen = tdev->hard_header_len + tdev->needed_headroom;
936 		mtu = tdev->mtu;
937 	}
938 	dev->iflink = tunnel->parms.link;
939 
940 	/* Precalculate GRE options length */
941 	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
942 		if (tunnel->parms.o_flags&GRE_CSUM)
943 			addend += 4;
944 		if (tunnel->parms.o_flags&GRE_KEY)
945 			addend += 4;
946 		if (tunnel->parms.o_flags&GRE_SEQ)
947 			addend += 4;
948 	}
949 	dev->needed_headroom = addend + hlen;
950 	mtu -= dev->hard_header_len + addend;
951 
952 	if (mtu < 68)
953 		mtu = 68;
954 
955 	tunnel->hlen = addend;
956 
957 	return mtu;
958 }
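
/*
 * Illustrative example: for a tunnel with both checksums and a key over
 * an Ethernet path (tdev->mtu = 1500), addend = 20 + 4 + 4 + 4 = 32 and,
 * since dev->hard_header_len is 0 for an ARPHRD_IPGRE device, the
 * function returns a tunnel mtu of 1500 - 32 = 1468.
 */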
959 
960 static int
961 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
962 {
963 	int err = 0;
964 	struct ip_tunnel_parm p;
965 	struct ip_tunnel *t;
966 	struct net *net = dev_net(dev);
967 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
968 
969 	switch (cmd) {
970 	case SIOCGETTUNNEL:
971 		t = NULL;
972 		if (dev == ign->fb_tunnel_dev) {
973 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
974 				err = -EFAULT;
975 				break;
976 			}
977 			t = ipgre_tunnel_locate(net, &p, 0);
978 		}
979 		if (t == NULL)
980 			t = netdev_priv(dev);
981 		memcpy(&p, &t->parms, sizeof(p));
982 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
983 			err = -EFAULT;
984 		break;
985 
986 	case SIOCADDTUNNEL:
987 	case SIOCCHGTUNNEL:
988 		err = -EPERM;
989 		if (!capable(CAP_NET_ADMIN))
990 			goto done;
991 
992 		err = -EFAULT;
993 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
994 			goto done;
995 
996 		err = -EINVAL;
997 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
998 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
999 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1000 			goto done;
1001 		if (p.iph.ttl)
1002 			p.iph.frag_off |= htons(IP_DF);
1003 
1004 		if (!(p.i_flags&GRE_KEY))
1005 			p.i_key = 0;
1006 		if (!(p.o_flags&GRE_KEY))
1007 			p.o_key = 0;
1008 
1009 		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1010 
1011 		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1012 			if (t != NULL) {
1013 				if (t->dev != dev) {
1014 					err = -EEXIST;
1015 					break;
1016 				}
1017 			} else {
1018 				unsigned nflags = 0;
1019 
1020 				t = netdev_priv(dev);
1021 
1022 				if (ipv4_is_multicast(p.iph.daddr))
1023 					nflags = IFF_BROADCAST;
1024 				else if (p.iph.daddr)
1025 					nflags = IFF_POINTOPOINT;
1026 
1027 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1028 					err = -EINVAL;
1029 					break;
1030 				}
1031 				ipgre_tunnel_unlink(ign, t);
1032 				t->parms.iph.saddr = p.iph.saddr;
1033 				t->parms.iph.daddr = p.iph.daddr;
1034 				t->parms.i_key = p.i_key;
1035 				t->parms.o_key = p.o_key;
1036 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
1037 				memcpy(dev->broadcast, &p.iph.daddr, 4);
1038 				ipgre_tunnel_link(ign, t);
1039 				netdev_state_change(dev);
1040 			}
1041 		}
1042 
1043 		if (t) {
1044 			err = 0;
1045 			if (cmd == SIOCCHGTUNNEL) {
1046 				t->parms.iph.ttl = p.iph.ttl;
1047 				t->parms.iph.tos = p.iph.tos;
1048 				t->parms.iph.frag_off = p.iph.frag_off;
1049 				if (t->parms.link != p.link) {
1050 					t->parms.link = p.link;
1051 					dev->mtu = ipgre_tunnel_bind_dev(dev);
1052 					netdev_state_change(dev);
1053 				}
1054 			}
1055 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1056 				err = -EFAULT;
1057 		} else
1058 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1059 		break;
1060 
1061 	case SIOCDELTUNNEL:
1062 		err = -EPERM;
1063 		if (!capable(CAP_NET_ADMIN))
1064 			goto done;
1065 
1066 		if (dev == ign->fb_tunnel_dev) {
1067 			err = -EFAULT;
1068 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1069 				goto done;
1070 			err = -ENOENT;
1071 			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1072 				goto done;
1073 			err = -EPERM;
1074 			if (t == netdev_priv(ign->fb_tunnel_dev))
1075 				goto done;
1076 			dev = t->dev;
1077 		}
1078 		unregister_netdevice(dev);
1079 		err = 0;
1080 		break;
1081 
1082 	default:
1083 		err = -EINVAL;
1084 	}
1085 
1086 done:
1087 	return err;
1088 }
1089 
1090 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1091 {
1092 	struct ip_tunnel *tunnel = netdev_priv(dev);
1093 	if (new_mtu < 68 ||
1094 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1095 		return -EINVAL;
1096 	dev->mtu = new_mtu;
1097 	return 0;
1098 }
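
/*
 * Illustrative bound: for a plain keyless gre device (hlen = 24,
 * hard_header_len = 0) the accepted mtu range is 68..65504
 * (0xFFF8 - 24 = 65504).
 */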
1099 
1100 /* Nice toy. Unfortunately, useless in real life :-)
1101    It allows the construction of a virtual multiprotocol broadcast "LAN"
1102    over the Internet, provided multicast routing is tuned.
1103 
1104 
1105    I have no idea whether this bicycle was invented before me,
1106    so I had to set ARPHRD_IPGRE to a random value.
1107    I have the impression that Cisco could have made something similar,
1108    but this feature is apparently missing in IOS<=11.2(8).
1109 
1110    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1111    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1112 
1113    ping -t 255 224.66.66.66
1114 
1115    If nobody answers, mbone does not work.
1116 
1117    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1118    ip addr add 10.66.66.<somewhat>/24 dev Universe
1119    ifconfig Universe up
1120    ifconfig Universe add fe80::<Your_real_addr>/10
1121    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1122    ftp 10.66.66.66
1123    ...
1124    ftp fec0:6666:6666::193.233.7.65
1125    ...
1126 
1127  */
1128 
1129 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1130 			unsigned short type,
1131 			const void *daddr, const void *saddr, unsigned len)
1132 {
1133 	struct ip_tunnel *t = netdev_priv(dev);
1134 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1135 	__be16 *p = (__be16*)(iph+1);
1136 
1137 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1138 	p[0]		= t->parms.o_flags;
1139 	p[1]		= htons(type);
1140 
1141 	/*
1142 	 *	Set the source hardware address.
1143 	 */
1144 
1145 	if (saddr)
1146 		memcpy(&iph->saddr, saddr, 4);
1147 
1148 	if (daddr) {
1149 		memcpy(&iph->daddr, daddr, 4);
1150 		return t->hlen;
1151 	}
1152 	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1153 		return t->hlen;
1154 
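	/* A positive return reports a fully built header; -t->hlen tells
	 * the caller the header was pushed but the destination is not yet
	 * resolved (the same convention eth_header() uses).
	 */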
1155 	return -t->hlen;
1156 }
1157 
1158 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1159 {
1160 	struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1161 	memcpy(haddr, &iph->saddr, 4);
1162 	return 4;
1163 }
1164 
1165 static const struct header_ops ipgre_header_ops = {
1166 	.create	= ipgre_header,
1167 	.parse	= ipgre_header_parse,
1168 };
1169 
1170 #ifdef CONFIG_NET_IPGRE_BROADCAST
1171 static int ipgre_open(struct net_device *dev)
1172 {
1173 	struct ip_tunnel *t = netdev_priv(dev);
1174 
1175 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1176 		struct flowi fl = { .oif = t->parms.link,
1177 				    .nl_u = { .ip4_u =
1178 					      { .daddr = t->parms.iph.daddr,
1179 						.saddr = t->parms.iph.saddr,
1180 						.tos = RT_TOS(t->parms.iph.tos) } },
1181 				    .proto = IPPROTO_GRE };
1182 		struct rtable *rt;
1183 		if (ip_route_output_key(dev_net(dev), &rt, &fl))
1184 			return -EADDRNOTAVAIL;
1185 		dev = rt->u.dst.dev;
1186 		ip_rt_put(rt);
1187 		if (__in_dev_get_rtnl(dev) == NULL)
1188 			return -EADDRNOTAVAIL;
1189 		t->mlink = dev->ifindex;
1190 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1191 	}
1192 	return 0;
1193 }
1194 
1195 static int ipgre_close(struct net_device *dev)
1196 {
1197 	struct ip_tunnel *t = netdev_priv(dev);
1198 
1199 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1200 		struct in_device *in_dev;
1201 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1202 		if (in_dev) {
1203 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1204 			in_dev_put(in_dev);
1205 		}
1206 	}
1207 	return 0;
1208 }
1209 
1210 #endif
1211 
1212 static const struct net_device_ops ipgre_netdev_ops = {
1213 	.ndo_init		= ipgre_tunnel_init,
1214 	.ndo_uninit		= ipgre_tunnel_uninit,
1215 #ifdef CONFIG_NET_IPGRE_BROADCAST
1216 	.ndo_open		= ipgre_open,
1217 	.ndo_stop		= ipgre_close,
1218 #endif
1219 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1220 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
1221 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1222 };
1223 
1224 static void ipgre_tunnel_setup(struct net_device *dev)
1225 {
1226 	dev->netdev_ops		= &ipgre_netdev_ops;
1227 	dev->destructor 	= free_netdev;
1228 
1229 	dev->type		= ARPHRD_IPGRE;
1230 	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1231 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1232 	dev->flags		= IFF_NOARP;
1233 	dev->iflink		= 0;
1234 	dev->addr_len		= 4;
1235 	dev->features		|= NETIF_F_NETNS_LOCAL;
1236 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
1237 }
1238 
1239 static int ipgre_tunnel_init(struct net_device *dev)
1240 {
1241 	struct ip_tunnel *tunnel;
1242 	struct iphdr *iph;
1243 
1244 	tunnel = netdev_priv(dev);
1245 	iph = &tunnel->parms.iph;
1246 
1247 	tunnel->dev = dev;
1248 	strcpy(tunnel->parms.name, dev->name);
1249 
1250 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1251 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1252 
1253 	if (iph->daddr) {
1254 #ifdef CONFIG_NET_IPGRE_BROADCAST
1255 		if (ipv4_is_multicast(iph->daddr)) {
1256 			if (!iph->saddr)
1257 				return -EINVAL;
1258 			dev->flags = IFF_BROADCAST;
1259 			dev->header_ops = &ipgre_header_ops;
1260 		}
1261 #endif
1262 	} else
1263 		dev->header_ops = &ipgre_header_ops;
1264 
1265 	return 0;
1266 }
1267 
1268 static void ipgre_fb_tunnel_init(struct net_device *dev)
1269 {
1270 	struct ip_tunnel *tunnel = netdev_priv(dev);
1271 	struct iphdr *iph = &tunnel->parms.iph;
1272 	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1273 
1274 	tunnel->dev = dev;
1275 	strcpy(tunnel->parms.name, dev->name);
1276 
1277 	iph->version		= 4;
1278 	iph->protocol		= IPPROTO_GRE;
1279 	iph->ihl		= 5;
1280 	tunnel->hlen		= sizeof(struct iphdr) + 4;
1281 
1282 	dev_hold(dev);
1283 	ign->tunnels_wc[0]	= tunnel;
1284 }
1285 
1286 
1287 static const struct net_protocol ipgre_protocol = {
1288 	.handler	=	ipgre_rcv,
1289 	.err_handler	=	ipgre_err,
1290 	.netns_ok	=	1,
1291 };
1292 
1293 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1294 {
1295 	int prio;
1296 
1297 	for (prio = 0; prio < 4; prio++) {
1298 		int h;
1299 		for (h = 0; h < HASH_SIZE; h++) {
1300 			struct ip_tunnel *t = ign->tunnels[prio][h];
1301 
1302 			while (t != NULL) {
1303 				unregister_netdevice_queue(t->dev, head);
1304 				t = t->next;
1305 			}
1306 		}
1307 	}
1308 }
1309 
1310 static int ipgre_init_net(struct net *net)
1311 {
1312 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1313 	int err;
1314 
1315 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1316 					   ipgre_tunnel_setup);
1317 	if (!ign->fb_tunnel_dev) {
1318 		err = -ENOMEM;
1319 		goto err_alloc_dev;
1320 	}
1321 	dev_net_set(ign->fb_tunnel_dev, net);
1322 
1323 	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1324 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1325 
1326 	if ((err = register_netdev(ign->fb_tunnel_dev)))
1327 		goto err_reg_dev;
1328 
1329 	return 0;
1330 
1331 err_reg_dev:
1332 	free_netdev(ign->fb_tunnel_dev);
1333 err_alloc_dev:
1334 	return err;
1335 }
1336 
1337 static void ipgre_exit_net(struct net *net)
1338 {
1339 	struct ipgre_net *ign;
1340 	LIST_HEAD(list);
1341 
1342 	ign = net_generic(net, ipgre_net_id);
1343 	rtnl_lock();
1344 	ipgre_destroy_tunnels(ign, &list);
1345 	unregister_netdevice_many(&list);
1346 	rtnl_unlock();
1347 }
1348 
1349 static struct pernet_operations ipgre_net_ops = {
1350 	.init = ipgre_init_net,
1351 	.exit = ipgre_exit_net,
1352 	.id   = &ipgre_net_id,
1353 	.size = sizeof(struct ipgre_net),
1354 };
1355 
1356 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1357 {
1358 	__be16 flags;
1359 
1360 	if (!data)
1361 		return 0;
1362 
1363 	flags = 0;
1364 	if (data[IFLA_GRE_IFLAGS])
1365 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1366 	if (data[IFLA_GRE_OFLAGS])
1367 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1368 	if (flags & (GRE_VERSION|GRE_ROUTING))
1369 		return -EINVAL;
1370 
1371 	return 0;
1372 }
1373 
1374 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1375 {
1376 	__be32 daddr;
1377 
1378 	if (tb[IFLA_ADDRESS]) {
1379 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1380 			return -EINVAL;
1381 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1382 			return -EADDRNOTAVAIL;
1383 	}
1384 
1385 	if (!data)
1386 		goto out;
1387 
1388 	if (data[IFLA_GRE_REMOTE]) {
1389 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1390 		if (!daddr)
1391 			return -EINVAL;
1392 	}
1393 
1394 out:
1395 	return ipgre_tunnel_validate(tb, data);
1396 }
1397 
1398 static void ipgre_netlink_parms(struct nlattr *data[],
1399 				struct ip_tunnel_parm *parms)
1400 {
1401 	memset(parms, 0, sizeof(*parms));
1402 
1403 	parms->iph.protocol = IPPROTO_GRE;
1404 
1405 	if (!data)
1406 		return;
1407 
1408 	if (data[IFLA_GRE_LINK])
1409 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1410 
1411 	if (data[IFLA_GRE_IFLAGS])
1412 		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1413 
1414 	if (data[IFLA_GRE_OFLAGS])
1415 		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1416 
1417 	if (data[IFLA_GRE_IKEY])
1418 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1419 
1420 	if (data[IFLA_GRE_OKEY])
1421 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1422 
1423 	if (data[IFLA_GRE_LOCAL])
1424 		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1425 
1426 	if (data[IFLA_GRE_REMOTE])
1427 		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1428 
1429 	if (data[IFLA_GRE_TTL])
1430 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1431 
1432 	if (data[IFLA_GRE_TOS])
1433 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1434 
1435 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1436 		parms->iph.frag_off = htons(IP_DF);
1437 }
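
/*
 * These attributes correspond to iproute2 usage along the lines of
 * (an illustrative command, in the spirit of the example above):
 *
 *   ip link add gre1 type gre local 10.0.0.1 remote 10.0.0.2 \
 *           ttl 64 ikey 1 okey 1
 *
 * When no pmtudisc attribute is supplied (or it is non-zero), the last
 * branch above sets IP_DF, matching the DF-forcing policy described at
 * the top of this file.
 */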
1438 
1439 static int ipgre_tap_init(struct net_device *dev)
1440 {
1441 	struct ip_tunnel *tunnel;
1442 
1443 	tunnel = netdev_priv(dev);
1444 
1445 	tunnel->dev = dev;
1446 	strcpy(tunnel->parms.name, dev->name);
1447 
1448 	ipgre_tunnel_bind_dev(dev);
1449 
1450 	return 0;
1451 }
1452 
1453 static const struct net_device_ops ipgre_tap_netdev_ops = {
1454 	.ndo_init		= ipgre_tap_init,
1455 	.ndo_uninit		= ipgre_tunnel_uninit,
1456 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1457 	.ndo_set_mac_address 	= eth_mac_addr,
1458 	.ndo_validate_addr	= eth_validate_addr,
1459 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1460 };
1461 
1462 static void ipgre_tap_setup(struct net_device *dev)
1463 {
1464 
1465 	ether_setup(dev);
1466 
1467 	dev->netdev_ops		= &ipgre_tap_netdev_ops;
1468 	dev->destructor 	= free_netdev;
1469 
1470 	dev->iflink		= 0;
1471 	dev->features		|= NETIF_F_NETNS_LOCAL;
1472 }
1473 
1474 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1475 			 struct nlattr *data[])
1476 {
1477 	struct ip_tunnel *nt;
1478 	struct net *net = dev_net(dev);
1479 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1480 	int mtu;
1481 	int err;
1482 
1483 	nt = netdev_priv(dev);
1484 	ipgre_netlink_parms(data, &nt->parms);
1485 
1486 	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1487 		return -EEXIST;
1488 
1489 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1490 		random_ether_addr(dev->dev_addr);
1491 
1492 	mtu = ipgre_tunnel_bind_dev(dev);
1493 	if (!tb[IFLA_MTU])
1494 		dev->mtu = mtu;
1495 
1496 	err = register_netdevice(dev);
1497 	if (err)
1498 		goto out;
1499 
1500 	dev_hold(dev);
1501 	ipgre_tunnel_link(ign, nt);
1502 
1503 out:
1504 	return err;
1505 }
1506 
1507 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1508 			    struct nlattr *data[])
1509 {
1510 	struct ip_tunnel *t, *nt;
1511 	struct net *net = dev_net(dev);
1512 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1513 	struct ip_tunnel_parm p;
1514 	int mtu;
1515 
1516 	if (dev == ign->fb_tunnel_dev)
1517 		return -EINVAL;
1518 
1519 	nt = netdev_priv(dev);
1520 	ipgre_netlink_parms(data, &p);
1521 
1522 	t = ipgre_tunnel_locate(net, &p, 0);
1523 
1524 	if (t) {
1525 		if (t->dev != dev)
1526 			return -EEXIST;
1527 	} else {
1528 		t = nt;
1529 
1530 		if (dev->type != ARPHRD_ETHER) {
1531 			unsigned nflags = 0;
1532 
1533 			if (ipv4_is_multicast(p.iph.daddr))
1534 				nflags = IFF_BROADCAST;
1535 			else if (p.iph.daddr)
1536 				nflags = IFF_POINTOPOINT;
1537 
1538 			if ((dev->flags ^ nflags) &
1539 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1540 				return -EINVAL;
1541 		}
1542 
1543 		ipgre_tunnel_unlink(ign, t);
1544 		t->parms.iph.saddr = p.iph.saddr;
1545 		t->parms.iph.daddr = p.iph.daddr;
1546 		t->parms.i_key = p.i_key;
1547 		if (dev->type != ARPHRD_ETHER) {
1548 			memcpy(dev->dev_addr, &p.iph.saddr, 4);
1549 			memcpy(dev->broadcast, &p.iph.daddr, 4);
1550 		}
1551 		ipgre_tunnel_link(ign, t);
1552 		netdev_state_change(dev);
1553 	}
1554 
1555 	t->parms.o_key = p.o_key;
1556 	t->parms.iph.ttl = p.iph.ttl;
1557 	t->parms.iph.tos = p.iph.tos;
1558 	t->parms.iph.frag_off = p.iph.frag_off;
1559 
1560 	if (t->parms.link != p.link) {
1561 		t->parms.link = p.link;
1562 		mtu = ipgre_tunnel_bind_dev(dev);
1563 		if (!tb[IFLA_MTU])
1564 			dev->mtu = mtu;
1565 		netdev_state_change(dev);
1566 	}
1567 
1568 	return 0;
1569 }
1570 
1571 static size_t ipgre_get_size(const struct net_device *dev)
1572 {
1573 	return
1574 		/* IFLA_GRE_LINK */
1575 		nla_total_size(4) +
1576 		/* IFLA_GRE_IFLAGS */
1577 		nla_total_size(2) +
1578 		/* IFLA_GRE_OFLAGS */
1579 		nla_total_size(2) +
1580 		/* IFLA_GRE_IKEY */
1581 		nla_total_size(4) +
1582 		/* IFLA_GRE_OKEY */
1583 		nla_total_size(4) +
1584 		/* IFLA_GRE_LOCAL */
1585 		nla_total_size(4) +
1586 		/* IFLA_GRE_REMOTE */
1587 		nla_total_size(4) +
1588 		/* IFLA_GRE_TTL */
1589 		nla_total_size(1) +
1590 		/* IFLA_GRE_TOS */
1591 		nla_total_size(1) +
1592 		/* IFLA_GRE_PMTUDISC */
1593 		nla_total_size(1) +
1594 		0;
1595 }
1596 
1597 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1598 {
1599 	struct ip_tunnel *t = netdev_priv(dev);
1600 	struct ip_tunnel_parm *p = &t->parms;
1601 
1602 	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1603 	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1604 	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1605 	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1606 	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1607 	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1608 	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1609 	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1610 	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1611 	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1612 
1613 	return 0;
1614 
1615 nla_put_failure:
1616 	return -EMSGSIZE;
1617 }
1618 
1619 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1620 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1621 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1622 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1623 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1624 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1625 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1626 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1627 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1628 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1629 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
1630 };
1631 
1632 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1633 	.kind		= "gre",
1634 	.maxtype	= IFLA_GRE_MAX,
1635 	.policy		= ipgre_policy,
1636 	.priv_size	= sizeof(struct ip_tunnel),
1637 	.setup		= ipgre_tunnel_setup,
1638 	.validate	= ipgre_tunnel_validate,
1639 	.newlink	= ipgre_newlink,
1640 	.changelink	= ipgre_changelink,
1641 	.get_size	= ipgre_get_size,
1642 	.fill_info	= ipgre_fill_info,
1643 };
1644 
1645 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1646 	.kind		= "gretap",
1647 	.maxtype	= IFLA_GRE_MAX,
1648 	.policy		= ipgre_policy,
1649 	.priv_size	= sizeof(struct ip_tunnel),
1650 	.setup		= ipgre_tap_setup,
1651 	.validate	= ipgre_tap_validate,
1652 	.newlink	= ipgre_newlink,
1653 	.changelink	= ipgre_changelink,
1654 	.get_size	= ipgre_get_size,
1655 	.fill_info	= ipgre_fill_info,
1656 };
1657 
1658 /*
1659  *	And now the module code and kernel interface.
1660  */
1661 
1662 static int __init ipgre_init(void)
1663 {
1664 	int err;
1665 
1666 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1667 
1668 	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1669 		printk(KERN_INFO "ipgre init: can't add protocol\n");
1670 		return -EAGAIN;
1671 	}
1672 
1673 	err = register_pernet_device(&ipgre_net_ops);
1674 	if (err < 0)
1675 		goto gen_device_failed;
1676 
1677 	err = rtnl_link_register(&ipgre_link_ops);
1678 	if (err < 0)
1679 		goto rtnl_link_failed;
1680 
1681 	err = rtnl_link_register(&ipgre_tap_ops);
1682 	if (err < 0)
1683 		goto tap_ops_failed;
1684 
1685 out:
1686 	return err;
1687 
1688 tap_ops_failed:
1689 	rtnl_link_unregister(&ipgre_link_ops);
1690 rtnl_link_failed:
1691 	unregister_pernet_device(&ipgre_net_ops);
1692 gen_device_failed:
1693 	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1694 	goto out;
1695 }
1696 
1697 static void __exit ipgre_fini(void)
1698 {
1699 	rtnl_link_unregister(&ipgre_tap_ops);
1700 	rtnl_link_unregister(&ipgre_link_ops);
1701 	unregister_pernet_device(&ipgre_net_ops);
1702 	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1703 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
1704 }
1705 
1706 module_init(ipgre_init);
1707 module_exit(ipgre_fini);
1708 MODULE_LICENSE("GPL");
1709 MODULE_ALIAS_RTNL_LINK("gre");
1710 MODULE_ALIAS_RTNL_LINK("gretap");
1711