/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in EVERY
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter; once we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion" in
   this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse the packet, trying to detect an inner encapsulation
   made by our node. This is difficult or even impossible, especially
   taking fragmentation into account. To be short, ttl is not a solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   and that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it was your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally modular.
   We could extract the common parts (hash table, ioctl etc.)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
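
/* A minimal sketch of the xmit_recursion guard mentioned in point 1
 * above.  The guard itself lives in net/core/dev.c, not in this file;
 * the names xmit_recursion and RECURSION_LIMIT follow dev.c of this
 * era, so treat this as an illustration rather than the authoritative
 * implementation:
 *
 *	static DEFINE_PER_CPU(int, xmit_recursion);
 *	#define RECURSION_LIMIT 10
 *
 *	// In the transmit path, with bottom halves disabled so the
 *	// task cannot migrate between CPUs:
 *	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
 *		goto recursion_alert;	// local dead loop: drop the packet
 *	__this_cpu_inc(xmit_recursion);
 *	rc = dev_hard_start_xmit(skb, dev, txq);
 *	__this_cpu_dec(xmit_recursion);
 */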

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched by configured keyless tunnels,
   will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
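
/* Worked example for HASH(): the bucket depends on the raw value of the
 * __be32, which is byte-order dependent, so take a little-endian host.
 * For remote = 192.168.1.5 the wire bytes are c0 a8 01 05, read as the
 * u32 0x0501a8c0; then
 *
 *	HASH(remote) = (0x0501a8c0 ^ 0x00501a8c) & 0xF = 0xc
 *
 * i.e. the tunnel lands in bucket 12 of the 16.
 */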

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
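
/* Note: for_each_ip_tunnel_rcu() deliberately reuses a variable named
 * 't' (a struct ip_tunnel *) that must already be declared in the
 * calling scope, as in ipgre_tunnel_lookup() below.
 */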

/* Frequently modified stats are per-cpu; others are shared (netdev->stats). */
struct pcpu_tstats {
	unsigned long	rx_packets;
	unsigned long	rx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));

static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
{
	struct pcpu_tstats sum = { 0 };
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);

		sum.rx_packets += tstats->rx_packets;
		sum.rx_bytes   += tstats->rx_bytes;
		sum.tx_packets += tstats->tx_packets;
		sum.tx_bytes   += tstats->tx_bytes;
	}
	dev->stats.rx_packets = sum.rx_packets;
	dev->stats.rx_bytes   = sum.rx_bytes;
	dev->stats.tx_packets = sum.tx_packets;
	dev->stats.tx_bytes   = sum.tx_bytes;
	return &dev->stats;
}

/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. This makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee,
   so why the hell do these idiots break standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes were returned, a keyed message will be dropped here. */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by the IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP.
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header.
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt->rt_gateway;
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the outer IP header.
	 */

	iph			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}

static int
ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

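/* Bounds note: 68 is the minimum IPv4 MTU (RFC 791 requires every
 * router to forward a 68-octet datagram without fragmentation), and
 * 0xFFF8 is the 16-bit IP total-length limit rounded down to an
 * 8-byte fragment boundary; the outer headers are subtracted from it.
 */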
static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */
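
/* The same setup using only iproute2; an untested sketch, with the same
   placeholders as above:

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ip link set Universe up
   ip -6 addr add fe80::<Your_real_addr>/10 dev Universe
   ip -6 addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
 */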

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};

static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
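
/* Usage sketch: the attributes parsed above are what iproute2 supplies
 * for commands such as the following (addresses and key values are
 * illustrative only):
 *
 *	ip link add gre1 type gre local 10.0.0.1 remote 10.0.0.2 ttl 64 ikey 1 okey 1
 *	ip link add tap1 type gretap local 10.0.0.1 remote 10.0.0.2
 */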

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};

static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");