1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <asm/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/mroute.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35 
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ipip.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50 
51 #if IS_ENABLED(CONFIG_IPV6)
52 #include <net/ipv6.h>
53 #include <net/ip6_fib.h>
54 #include <net/ip6_route.h>
55 #endif
56 
57 /*
58    Problems & solutions
59    --------------------
60 
61    1. The most important issue is detecting local dead loops.
62    They would cause a complete host lockup in transmit, which
63    would be "resolved" by stack overflow or, if queueing is enabled,
64    by infinite looping in net_bh.
65 
66    We cannot track such dead loops during route installation;
67    it is an infeasible task. The most general solution would be
68    to keep an skb->encapsulation counter (a sort of local ttl)
69    and silently drop the packet when it expires. It is a good
70    solution, but it requires maintaining a new variable in ALL
71    skbs, even when no tunneling is used.
72 
73    Current solution: xmit_recursion breaks dead loops. This is a percpu
74    counter, which is safe since cpu migration is forbidden once we enter
75    the first ndo_start_xmit(). We force an exit when it reaches RECURSION_LIMIT.
76 
77    2. Networking dead loops would not kill routers, but would really
78    kill the network. The IP hop limit plays the role of "t->recursion" here,
79    if we copy it from the packet being encapsulated to the upper header.
80    It is a very good solution, but it introduces two problems:
81 
82    - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
83      do not work over tunnels.
84    - traceroute does not work. I planned to relay ICMP from the tunnel,
85      so that this problem would be solved and the traceroute output
86      would be even more informative. This idea turned out to be wrong:
87      only Linux complies with rfc1812 now (yes, guys, Linux is the only
88      true router now :-)); all routers (at least, in my neighbourhood)
89      return only 8 bytes of payload. That is the end of it.
90 
91    Hence, if we want OSPF to work or traceroute to say something reasonable,
92    we must search for another solution.
93 
94    One option is to parse the packet, trying to detect inner encapsulation
95    made by our node. That is difficult or even impossible, especially
96    taking fragmentation into account. In short, ttl is not a solution at all.
97 
98    Current solution: The solution was UNEXPECTEDLY SIMPLE.
99    We force the DF flag on tunnels with a preconfigured hop limit,
100   and that is ALL. :-) Well, it does not remove the problem completely,
101   but the exponential growth of network traffic is changed to linear
102   (branches that exceed the pmtu are pruned) and the tunnel mtu
103   rapidly degrades to a value <68, where the looping stops.
104   Yes, it is not good if there exists a router in the loop
105   that does not force DF, even when the packets being encapsulated have DF set.
106   But it is not our problem! Nobody could accuse us; we did
107   all that we could do. Even if it was your gated that injected the
108   fatal route into the network, even if it was you who configured the
109   fatal static route: you are innocent. :-)
110 
111 
112 
113   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
114   practically identical code. It would be good to glue them
115   together, but it is not obvious how to make them modular.
116   sit is an integral part of IPv6; ipip and gre are naturally modular.
117   We could extract the common parts (hash table, ioctl, etc.)
118   into a separate module (ip_tunnel.c).
119 
120    Alexey Kuznetsov.
121  */
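/* For illustration only: the xmit_recursion guard mentioned above lives in
 * the core transmit path, not in this file.  A minimal sketch of the idea,
 * assuming the names used by net/core/dev.c of this era:
 *
 *	static DEFINE_PER_CPU(int, xmit_recursion);
 *	#define RECURSION_LIMIT 10
 *
 *	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
 *		goto drop;
 *	__this_cpu_inc(xmit_recursion);
 *	rc = ops->ndo_start_xmit(skb, dev);
 *	__this_cpu_dec(xmit_recursion);
 *
 * A tunnel that (directly or indirectly) routes over itself re-enters the
 * transmit path on the same cpu, increments the counter one level at a
 * time, and bottoms out after a bounded number of nestings instead of
 * overflowing the stack.
 */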
122 
123 static bool log_ecn_error = true;
124 module_param(log_ecn_error, bool, 0644);
125 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126 
127 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
128 static int ipgre_tunnel_init(struct net_device *dev);
129 static void ipgre_tunnel_setup(struct net_device *dev);
130 static int ipgre_tunnel_bind_dev(struct net_device *dev);
131 
132 /* Fallback tunnel: no source, no destination, no key, no options */
133 
134 #define HASH_SIZE  16
135 
136 static int ipgre_net_id __read_mostly;
137 struct ipgre_net {
138 	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
139 
140 	struct net_device *fb_tunnel_dev;
141 };
142 
143 /* Tunnel hash table */
144 
145 /*
146    4 hash tables:
147 
148    3: (remote,local)
149    2: (remote,*)
150    1: (*,local)
151    0: (*,*)
152 
153    We require an exact key match, i.e. if a key is present in the packet
154    it will match only a tunnel with the same key; if it is not present,
155    it will match only a keyless tunnel.
156 
157    All keyless packets that do not match a configured keyless tunnel
158    will match the fallback tunnel.
159  */
160 
161 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
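/* For illustration: HASH() reduces an address (or key) to the XOR of its
 * two lowest nibbles, so it can only produce 0..15, matching HASH_SIZE.
 * E.g. a value ending in byte 0x2c hashes to 0xc ^ 0x2 = 0xe, bucket 14.
 * The __force casts merely strip the __be32 annotation for sparse; the
 * hash quality is deliberately modest, as these tables are expected to
 * stay small.
 */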
162 
163 #define tunnels_r_l	tunnels[3]
164 #define tunnels_r	tunnels[2]
165 #define tunnels_l	tunnels[1]
166 #define tunnels_wc	tunnels[0]
167 
168 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
169 						   struct rtnl_link_stats64 *tot)
170 {
171 	int i;
172 
173 	for_each_possible_cpu(i) {
174 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
175 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
176 		unsigned int start;
177 
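		/* Standard 64-bit stats read: retry if a writer touched the
		 * counters mid-read, so the rx/tx values are seen as one
		 * consistent snapshot.  The retry only ever fires on 32-bit
		 * SMP, where a u64 update is not atomic.
		 */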
178 		do {
179 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
180 			rx_packets = tstats->rx_packets;
181 			tx_packets = tstats->tx_packets;
182 			rx_bytes = tstats->rx_bytes;
183 			tx_bytes = tstats->tx_bytes;
184 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
185 
186 		tot->rx_packets += rx_packets;
187 		tot->tx_packets += tx_packets;
188 		tot->rx_bytes   += rx_bytes;
189 		tot->tx_bytes   += tx_bytes;
190 	}
191 
192 	tot->multicast = dev->stats.multicast;
193 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
194 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
195 	tot->rx_length_errors = dev->stats.rx_length_errors;
196 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
197 	tot->rx_errors = dev->stats.rx_errors;
198 
199 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
200 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
201 	tot->tx_dropped = dev->stats.tx_dropped;
202 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
203 	tot->tx_errors = dev->stats.tx_errors;
204 
205 	return tot;
206 }
207 
208 /* Does the key in the tunnel parameters match the packet? */
209 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
210 			    __be16 flags, __be32 key)
211 {
212 	if (p->i_flags & GRE_KEY) {
213 		if (flags & GRE_KEY)
214 			return key == p->i_key;
215 		else
216 			return false;	/* key expected, none present */
217 	} else
218 		return !(flags & GRE_KEY);
219 }
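/* For illustration, the resulting match table:
 *
 *	tunnel keyed,   packet keyed	-> match iff the keys are equal
 *	tunnel keyed,   packet keyless	-> no match (key expected)
 *	tunnel keyless, packet keyed	-> no match (unexpected key)
 *	tunnel keyless, packet keyless	-> match
 */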
220 
221 /* Given src, dst and key, find the appropriate tunnel for input. */
222 
223 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
224 					     __be32 remote, __be32 local,
225 					     __be16 flags, __be32 key,
226 					     __be16 gre_proto)
227 {
228 	struct net *net = dev_net(dev);
229 	int link = dev->ifindex;
230 	unsigned int h0 = HASH(remote);
231 	unsigned int h1 = HASH(key);
232 	struct ip_tunnel *t, *cand = NULL;
233 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
234 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
235 		       ARPHRD_ETHER : ARPHRD_IPGRE;
236 	int score, cand_score = 4;
237 
238 	for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
239 		if (local != t->parms.iph.saddr ||
240 		    remote != t->parms.iph.daddr ||
241 		    !(t->dev->flags & IFF_UP))
242 			continue;
243 
244 		if (!ipgre_key_match(&t->parms, flags, key))
245 			continue;
246 
247 		if (t->dev->type != ARPHRD_IPGRE &&
248 		    t->dev->type != dev_type)
249 			continue;
250 
251 		score = 0;
252 		if (t->parms.link != link)
253 			score |= 1;
254 		if (t->dev->type != dev_type)
255 			score |= 2;
256 		if (score == 0)
257 			return t;
258 
259 		if (score < cand_score) {
260 			cand = t;
261 			cand_score = score;
262 		}
263 	}
264 
265 	for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
266 		if (remote != t->parms.iph.daddr ||
267 		    !(t->dev->flags & IFF_UP))
268 			continue;
269 
270 		if (!ipgre_key_match(&t->parms, flags, key))
271 			continue;
272 
273 		if (t->dev->type != ARPHRD_IPGRE &&
274 		    t->dev->type != dev_type)
275 			continue;
276 
277 		score = 0;
278 		if (t->parms.link != link)
279 			score |= 1;
280 		if (t->dev->type != dev_type)
281 			score |= 2;
282 		if (score == 0)
283 			return t;
284 
285 		if (score < cand_score) {
286 			cand = t;
287 			cand_score = score;
288 		}
289 	}
290 
291 	for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
292 		if ((local != t->parms.iph.saddr &&
293 		     (local != t->parms.iph.daddr ||
294 		      !ipv4_is_multicast(local))) ||
295 		    !(t->dev->flags & IFF_UP))
296 			continue;
297 
298 		if (!ipgre_key_match(&t->parms, flags, key))
299 			continue;
300 
301 		if (t->dev->type != ARPHRD_IPGRE &&
302 		    t->dev->type != dev_type)
303 			continue;
304 
305 		score = 0;
306 		if (t->parms.link != link)
307 			score |= 1;
308 		if (t->dev->type != dev_type)
309 			score |= 2;
310 		if (score == 0)
311 			return t;
312 
313 		if (score < cand_score) {
314 			cand = t;
315 			cand_score = score;
316 		}
317 	}
318 
319 	for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
320 		if (t->parms.i_key != key ||
321 		    !(t->dev->flags & IFF_UP))
322 			continue;
323 
324 		if (t->dev->type != ARPHRD_IPGRE &&
325 		    t->dev->type != dev_type)
326 			continue;
327 
328 		score = 0;
329 		if (t->parms.link != link)
330 			score |= 1;
331 		if (t->dev->type != dev_type)
332 			score |= 2;
333 		if (score == 0)
334 			return t;
335 
336 		if (score < cand_score) {
337 			cand = t;
338 			cand_score = score;
339 		}
340 	}
341 
342 	if (cand != NULL)
343 		return cand;
344 
345 	dev = ign->fb_tunnel_dev;
346 	if (dev->flags & IFF_UP)
347 		return netdev_priv(dev);
348 
349 	return NULL;
350 }
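/* A note on the scoring above: an exact match (right link, right device
 * type) returns immediately with score 0.  Otherwise bit 0 marks a link
 * mismatch and bit 1 a device-type mismatch, so a wrong-link tunnel
 * (score 1) beats a wrong-type one (score 2 or 3).  The buckets are
 * scanned from most to least specific, (remote,local) through (*,*),
 * and the fallback device is used only when nothing matched at all.
 */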
351 
352 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
353 		struct ip_tunnel_parm *parms)
354 {
355 	__be32 remote = parms->iph.daddr;
356 	__be32 local = parms->iph.saddr;
357 	__be32 key = parms->i_key;
358 	unsigned int h = HASH(key);
359 	int prio = 0;
360 
361 	if (local)
362 		prio |= 1;
363 	if (remote && !ipv4_is_multicast(remote)) {
364 		prio |= 2;
365 		h ^= HASH(remote);
366 	}
367 
368 	return &ign->tunnels[prio][h];
369 }
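/* Worked example: a tunnel with local=A, a unicast remote=B and key=K gets
 * prio 3 and lands in tunnels[3][HASH(K) ^ HASH(B)], i.e. the (remote,local)
 * table indexed by key and remote address.  A multicast remote does not set
 * the "remote" bit, so broadcast-style tunnels hash as (*,local).
 */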
370 
371 static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
372 		struct ip_tunnel *t)
373 {
374 	return __ipgre_bucket(ign, &t->parms);
375 }
376 
377 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
378 {
379 	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
380 
381 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
382 	rcu_assign_pointer(*tp, t);
383 }
384 
385 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
386 {
387 	struct ip_tunnel __rcu **tp;
388 	struct ip_tunnel *iter;
389 
390 	for (tp = ipgre_bucket(ign, t);
391 	     (iter = rtnl_dereference(*tp)) != NULL;
392 	     tp = &iter->next) {
393 		if (t == iter) {
394 			rcu_assign_pointer(*tp, t->next);
395 			break;
396 		}
397 	}
398 }
399 
400 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
401 					   struct ip_tunnel_parm *parms,
402 					   int type)
403 {
404 	__be32 remote = parms->iph.daddr;
405 	__be32 local = parms->iph.saddr;
406 	__be32 key = parms->i_key;
407 	int link = parms->link;
408 	struct ip_tunnel *t;
409 	struct ip_tunnel __rcu **tp;
410 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
411 
412 	for (tp = __ipgre_bucket(ign, parms);
413 	     (t = rtnl_dereference(*tp)) != NULL;
414 	     tp = &t->next)
415 		if (local == t->parms.iph.saddr &&
416 		    remote == t->parms.iph.daddr &&
417 		    key == t->parms.i_key &&
418 		    link == t->parms.link &&
419 		    type == t->dev->type)
420 			break;
421 
422 	return t;
423 }
424 
425 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
426 		struct ip_tunnel_parm *parms, int create)
427 {
428 	struct ip_tunnel *t, *nt;
429 	struct net_device *dev;
430 	char name[IFNAMSIZ];
431 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
432 
433 	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
434 	if (t || !create)
435 		return t;
436 
437 	if (parms->name[0])
438 		strlcpy(name, parms->name, IFNAMSIZ);
439 	else
440 		strcpy(name, "gre%d");
441 
442 	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
443 	if (!dev)
444 		return NULL;
445 
446 	dev_net_set(dev, net);
447 
448 	nt = netdev_priv(dev);
449 	nt->parms = *parms;
450 	dev->rtnl_link_ops = &ipgre_link_ops;
451 
452 	dev->mtu = ipgre_tunnel_bind_dev(dev);
453 
454 	if (register_netdevice(dev) < 0)
455 		goto failed_free;
456 
457 	/* Can use a lockless transmit, unless we generate output sequences */
458 	if (!(nt->parms.o_flags & GRE_SEQ))
459 		dev->features |= NETIF_F_LLTX;
460 
461 	dev_hold(dev);
462 	ipgre_tunnel_link(ign, nt);
463 	return nt;
464 
465 failed_free:
466 	free_netdev(dev);
467 	return NULL;
468 }
469 
470 static void ipgre_tunnel_uninit(struct net_device *dev)
471 {
472 	struct net *net = dev_net(dev);
473 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
474 
475 	ipgre_tunnel_unlink(ign, netdev_priv(dev));
476 	dev_put(dev);
477 }
478 
479 
480 static void ipgre_err(struct sk_buff *skb, u32 info)
481 {
482 
483 /* All the routers (except for Linux) return only
484    8 bytes of packet payload. It means that precise relaying of
485    ICMP in the real Internet is absolutely infeasible.
486 
487    Moreover, Cisco "wise men" put the GRE key in the third word of
488    the GRE header. That makes it impossible to maintain even soft state
489    for keyed GRE tunnels with checksums enabled. Tell them "thank you".
490 
491    Well, I wonder: rfc1812 was written by a Cisco employee,
492    so why the hell do these idiots break standards established
493    by themselves???
494  */
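/* For reference, the GRE header layout parsed below (RFC 2784/2890), with
 * the optional words packed in order right after the 4-byte base header:
 *
 *	flags/version (2 bytes) | protocol type (2 bytes)
 *	checksum (2) + reserved (2)	present if GRE_CSUM
 *	key (4)				present if GRE_KEY
 *	sequence number (4)		present if GRE_SEQ
 *
 * With GRE_CSUM and GRE_KEY both set, the key is the third 32-bit word of
 * the header.  grehlen below is measured from skb->data (the quoted inner
 * IP header) and counts up to and including the key, so the key is the
 * last word inside it; 8 quoted payload bytes are too few to reach it.
 */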
495 
496 	const struct iphdr *iph = (const struct iphdr *)skb->data;
497 	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
498 	int grehlen = (iph->ihl<<2) + 4;
499 	const int type = icmp_hdr(skb)->type;
500 	const int code = icmp_hdr(skb)->code;
501 	struct ip_tunnel *t;
502 	__be16 flags;
503 	__be32 key = 0;
504 
505 	flags = p[0];
506 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
507 		if (flags&(GRE_VERSION|GRE_ROUTING))
508 			return;
509 		if (flags&GRE_KEY) {
510 			grehlen += 4;
511 			if (flags&GRE_CSUM)
512 				grehlen += 4;
513 		}
514 	}
515 
516 	/* If only 8 bytes were returned, a keyed message will be dropped here */
517 	if (skb_headlen(skb) < grehlen)
518 		return;
519 
520 	if (flags & GRE_KEY)
521 		key = *(__be32 *)(skb->data + grehlen - 4); /* last word in grehlen */
522 
523 	switch (type) {
524 	default:
525 	case ICMP_PARAMETERPROB:
526 		return;
527 
528 	case ICMP_DEST_UNREACH:
529 		switch (code) {
530 		case ICMP_SR_FAILED:
531 		case ICMP_PORT_UNREACH:
532 			/* Impossible event. */
533 			return;
534 		default:
535 			/* All others are translated to HOST_UNREACH.
536 			   rfc2003 contains "deep thoughts" about NET_UNREACH;
537 			   I believe they are just ether pollution. --ANK
538 			 */
539 			break;
540 		}
541 		break;
542 	case ICMP_TIME_EXCEEDED:
543 		if (code != ICMP_EXC_TTL)
544 			return;
545 		break;
546 
547 	case ICMP_REDIRECT:
548 		break;
549 	}
550 
551 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
552 				flags, key, p[1]);
553 
554 	if (t == NULL)
555 		return;
556 
557 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
558 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
559 				 t->parms.link, 0, IPPROTO_GRE, 0);
560 		return;
561 	}
562 	if (type == ICMP_REDIRECT) {
563 		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
564 			      IPPROTO_GRE, 0);
565 		return;
566 	}
567 	if (t->parms.iph.daddr == 0 ||
568 	    ipv4_is_multicast(t->parms.iph.daddr))
569 		return;
570 
571 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
572 		return;
573 
574 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
575 		t->err_count++;
576 	else
577 		t->err_count = 1;
578 	t->err_time = jiffies;
579 }
580 
581 static inline u8
582 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
583 {
584 	u8 inner = 0;
585 	if (skb->protocol == htons(ETH_P_IP))
586 		inner = old_iph->tos;
587 	else if (skb->protocol == htons(ETH_P_IPV6))
588 		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
589 	return INET_ECN_encapsulate(tos, inner);
590 }
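/* ECN handling per RFC 6040 (normal mode): the DSCP half of the outer tos
 * is supplied by the caller, while the ECN bits are copied from the inner
 * header, except that inner CE is encapsulated as outer ECT(0).  The
 * receive side mirrors this via IP_ECN_decapsulate() in ipgre_rcv().
 */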
591 
592 static int ipgre_rcv(struct sk_buff *skb)
593 {
594 	const struct iphdr *iph;
595 	u8     *h;
596 	__be16    flags;
597 	__sum16   csum = 0;
598 	__be32 key = 0;
599 	u32    seqno = 0;
600 	struct ip_tunnel *tunnel;
601 	int    offset = 4;
602 	__be16 gre_proto;
603 	int    err;
604 
605 	if (!pskb_may_pull(skb, 16))
606 		goto drop;
607 
608 	iph = ip_hdr(skb);
609 	h = skb->data;
610 	flags = *(__be16 *)h;
611 
612 	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
613 		/* - Version must be 0.
614 		   - We do not support routing headers.
615 		 */
616 		if (flags&(GRE_VERSION|GRE_ROUTING))
617 			goto drop;
618 
619 		if (flags&GRE_CSUM) {
620 			switch (skb->ip_summed) {
621 			case CHECKSUM_COMPLETE:
622 				csum = csum_fold(skb->csum);
623 				if (!csum)
624 					break;
625 				/* fall through */
626 			case CHECKSUM_NONE:
627 				skb->csum = 0;
628 				csum = __skb_checksum_complete(skb);
629 				skb->ip_summed = CHECKSUM_COMPLETE;
630 			}
631 			offset += 4;
632 		}
633 		if (flags&GRE_KEY) {
634 			key = *(__be32 *)(h + offset);
635 			offset += 4;
636 		}
637 		if (flags&GRE_SEQ) {
638 			seqno = ntohl(*(__be32 *)(h + offset));
639 			offset += 4;
640 		}
641 	}
642 
643 	gre_proto = *(__be16 *)(h + 2);
644 
645 	tunnel = ipgre_tunnel_lookup(skb->dev,
646 				     iph->saddr, iph->daddr, flags, key,
647 				     gre_proto);
648 	if (tunnel) {
649 		struct pcpu_tstats *tstats;
650 
651 		secpath_reset(skb);
652 
653 		skb->protocol = gre_proto;
654 		/* WCCP version 1 and 2 protocol decoding.
655 		 * - Change the protocol to IP.
656 		 * - For WCCPv2, skip the extra 4-byte redirect header, detected by
657 		 *   peeking at what should be the IPv4 version nibble (0x4). */
658 		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
659 			skb->protocol = htons(ETH_P_IP);
660 			if ((*(h + offset) & 0xF0) != 0x40)
661 				offset += 4;
662 		}
663 
664 		skb->mac_header = skb->network_header;
665 		__pskb_pull(skb, offset);
666 		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
667 		skb->pkt_type = PACKET_HOST;
668 #ifdef CONFIG_NET_IPGRE_BROADCAST
669 		if (ipv4_is_multicast(iph->daddr)) {
670 			/* Looped back packet, drop it! */
671 			if (rt_is_output_route(skb_rtable(skb)))
672 				goto drop;
673 			tunnel->dev->stats.multicast++;
674 			skb->pkt_type = PACKET_BROADCAST;
675 		}
676 #endif
677 
678 		if (((flags&GRE_CSUM) && csum) ||
679 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
680 			tunnel->dev->stats.rx_crc_errors++;
681 			tunnel->dev->stats.rx_errors++;
682 			goto drop;
683 		}
684 		if (tunnel->parms.i_flags&GRE_SEQ) {
685 			if (!(flags&GRE_SEQ) ||
686 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
687 				tunnel->dev->stats.rx_fifo_errors++;
688 				tunnel->dev->stats.rx_errors++;
689 				goto drop;
690 			}
691 			tunnel->i_seqno = seqno + 1;
692 		}
693 
694 		/* Warning: All skb pointers will be invalidated! */
695 		if (tunnel->dev->type == ARPHRD_ETHER) {
696 			if (!pskb_may_pull(skb, ETH_HLEN)) {
697 				tunnel->dev->stats.rx_length_errors++;
698 				tunnel->dev->stats.rx_errors++;
699 				goto drop;
700 			}
701 
702 			iph = ip_hdr(skb);
703 			skb->protocol = eth_type_trans(skb, tunnel->dev);
704 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
705 		}
706 
707 		__skb_tunnel_rx(skb, tunnel->dev);
708 
709 		skb_reset_network_header(skb);
710 		err = IP_ECN_decapsulate(iph, skb);
711 		if (unlikely(err)) {
712 			if (log_ecn_error)
713 				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
714 						     &iph->saddr, iph->tos);
715 			if (err > 1) {
716 				++tunnel->dev->stats.rx_frame_errors;
717 				++tunnel->dev->stats.rx_errors;
718 				goto drop;
719 			}
720 		}
721 
722 		tstats = this_cpu_ptr(tunnel->dev->tstats);
723 		u64_stats_update_begin(&tstats->syncp);
724 		tstats->rx_packets++;
725 		tstats->rx_bytes += skb->len;
726 		u64_stats_update_end(&tstats->syncp);
727 
728 		gro_cells_receive(&tunnel->gro_cells, skb);
729 		return 0;
730 	}
731 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
732 
733 drop:
734 	kfree_skb(skb);
735 	return 0;
736 }
737 
738 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
739 {
740 	struct ip_tunnel *tunnel = netdev_priv(dev);
741 	const struct iphdr  *old_iph = ip_hdr(skb);
742 	const struct iphdr  *tiph;
743 	struct flowi4 fl4;
744 	u8     tos;
745 	__be16 df;
746 	struct rtable *rt;     			/* Route to the other host */
747 	struct net_device *tdev;		/* Device to other host */
748 	struct iphdr  *iph;			/* Our new IP header */
749 	unsigned int max_headroom;		/* The extra header space needed */
750 	int    gre_hlen;
751 	__be32 dst;
752 	int    mtu;
753 	u8     ttl;
754 
755 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
756 	    skb_checksum_help(skb))
757 		goto tx_error;
758 
759 	if (dev->type == ARPHRD_ETHER)
760 		IPCB(skb)->flags = 0;
761 
762 	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
763 		gre_hlen = 0;
764 		if (skb->protocol == htons(ETH_P_IP))
765 			tiph = (const struct iphdr *)skb->data;
766 		else
767 			tiph = &tunnel->parms.iph;
768 	} else {
769 		gre_hlen = tunnel->hlen;
770 		tiph = &tunnel->parms.iph;
771 	}
772 
773 	if ((dst = tiph->daddr) == 0) {
774 		/* NBMA tunnel */
775 
776 		if (skb_dst(skb) == NULL) {
777 			dev->stats.tx_fifo_errors++;
778 			goto tx_error;
779 		}
780 
781 		if (skb->protocol == htons(ETH_P_IP)) {
782 			rt = skb_rtable(skb);
783 			dst = rt_nexthop(rt, old_iph->daddr);
784 		}
785 #if IS_ENABLED(CONFIG_IPV6)
786 		else if (skb->protocol == htons(ETH_P_IPV6)) {
787 			const struct in6_addr *addr6;
788 			struct neighbour *neigh;
789 			bool do_tx_error_icmp;
790 			int addr_type;
791 
792 			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
793 			if (neigh == NULL)
794 				goto tx_error;
795 
796 			addr6 = (const struct in6_addr *)&neigh->primary_key;
797 			addr_type = ipv6_addr_type(addr6);
798 
799 			if (addr_type == IPV6_ADDR_ANY) {
800 				addr6 = &ipv6_hdr(skb)->daddr;
801 				addr_type = ipv6_addr_type(addr6);
802 			}
803 
804 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
805 				do_tx_error_icmp = true;
806 			else {
807 				do_tx_error_icmp = false;
808 				dst = addr6->s6_addr32[3];
809 			}
810 			neigh_release(neigh);
811 			if (do_tx_error_icmp)
812 				goto tx_error_icmp;
813 		}
814 #endif
815 		else
816 			goto tx_error;
817 	}
818 
819 	ttl = tiph->ttl;
820 	tos = tiph->tos;
821 	if (tos == 1) {
822 		tos = 0;
823 		if (skb->protocol == htons(ETH_P_IP))
824 			tos = old_iph->tos;
825 		else if (skb->protocol == htons(ETH_P_IPV6))
826 			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
827 	}
828 
829 	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
830 				 tunnel->parms.o_key, RT_TOS(tos),
831 				 tunnel->parms.link);
832 	if (IS_ERR(rt)) {
833 		dev->stats.tx_carrier_errors++;
834 		goto tx_error;
835 	}
836 	tdev = rt->dst.dev;
837 
838 	if (tdev == dev) {
839 		ip_rt_put(rt);
840 		dev->stats.collisions++;
841 		goto tx_error;
842 	}
843 
844 	df = tiph->frag_off;
845 	if (df)
846 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
847 	else
848 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
849 
850 	if (skb_dst(skb))
851 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
852 
853 	if (skb->protocol == htons(ETH_P_IP)) {
854 		df |= (old_iph->frag_off&htons(IP_DF));
855 
856 		if ((old_iph->frag_off&htons(IP_DF)) &&
857 		    mtu < ntohs(old_iph->tot_len)) {
858 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
859 			ip_rt_put(rt);
860 			goto tx_error;
861 		}
862 	}
863 #if IS_ENABLED(CONFIG_IPV6)
864 	else if (skb->protocol == htons(ETH_P_IPV6)) {
865 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
866 
867 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
868 			if ((tunnel->parms.iph.daddr &&
869 			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
870 			    rt6->rt6i_dst.plen == 128) {
871 				rt6->rt6i_flags |= RTF_MODIFIED;
872 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
873 			}
874 		}
875 
876 		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
877 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
878 			ip_rt_put(rt);
879 			goto tx_error;
880 		}
881 	}
882 #endif
883 
884 	if (tunnel->err_count > 0) {
885 		if (time_before(jiffies,
886 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
887 			tunnel->err_count--;
888 
889 			dst_link_failure(skb);
890 		} else
891 			tunnel->err_count = 0;
892 	}
893 
894 	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
895 
896 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
897 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
898 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
899 		if (max_headroom > dev->needed_headroom)
900 			dev->needed_headroom = max_headroom;
901 		if (!new_skb) {
902 			ip_rt_put(rt);
903 			dev->stats.tx_dropped++;
904 			dev_kfree_skb(skb);
905 			return NETDEV_TX_OK;
906 		}
907 		if (skb->sk)
908 			skb_set_owner_w(new_skb, skb->sk);
909 		dev_kfree_skb(skb);
910 		skb = new_skb;
911 		old_iph = ip_hdr(skb);
912 		/* Warning: tiph might now point into freed memory */
913 	}
914 
915 	skb_push(skb, gre_hlen);
916 	skb_reset_network_header(skb);
917 	skb_set_transport_header(skb, sizeof(*iph));
918 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
919 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
920 			      IPSKB_REROUTED);
921 	skb_dst_drop(skb);
922 	skb_dst_set(skb, &rt->dst);
923 
924 	/*
925 	 *	Push down and install the outer IP header.
926 	 */
927 
928 	iph 			=	ip_hdr(skb);
929 	iph->version		=	4;
930 	iph->ihl		=	sizeof(struct iphdr) >> 2;
931 	iph->frag_off		=	df;
932 	iph->protocol		=	IPPROTO_GRE;
933 	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
934 	iph->daddr		=	fl4.daddr;
935 	iph->saddr		=	fl4.saddr;
936 	iph->ttl		=	ttl;
937 
938 	if (ttl == 0) {
939 		if (skb->protocol == htons(ETH_P_IP))
940 			iph->ttl = old_iph->ttl;
941 #if IS_ENABLED(CONFIG_IPV6)
942 		else if (skb->protocol == htons(ETH_P_IPV6))
943 			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
944 #endif
945 		else
946 			iph->ttl = ip4_dst_hoplimit(&rt->dst);
947 	}
948 
949 	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
950 	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
951 				   htons(ETH_P_TEB) : skb->protocol;
952 
953 	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
954 		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);
955 
956 		if (tunnel->parms.o_flags&GRE_SEQ) {
957 			++tunnel->o_seqno;
958 			*ptr = htonl(tunnel->o_seqno);
959 			ptr--;
960 		}
961 		if (tunnel->parms.o_flags&GRE_KEY) {
962 			*ptr = tunnel->parms.o_key;
963 			ptr--;
964 		}
965 		if (tunnel->parms.o_flags&GRE_CSUM) {
966 			int offset = skb_transport_offset(skb);
967 
968 			*ptr = 0;
969 			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, offset,
970 								 skb->len - offset,
971 								 0));
972 		}
973 	}
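	/* For illustration: with KEY, CSUM and SEQ all enabled, the header
	 * just built reads, in wire order: flags|proto, checksum word, key,
	 * sequence.  ptr started at the last 32-bit word and walked
	 * backwards, hence the fields were written in reverse order above.
	 */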
974 
975 	iptunnel_xmit(skb, dev);
976 	return NETDEV_TX_OK;
977 
978 #if IS_ENABLED(CONFIG_IPV6)
979 tx_error_icmp:
980 	dst_link_failure(skb);
981 #endif
982 tx_error:
983 	dev->stats.tx_errors++;
984 	dev_kfree_skb(skb);
985 	return NETDEV_TX_OK;
986 }
987 
988 static int ipgre_tunnel_bind_dev(struct net_device *dev)
989 {
990 	struct net_device *tdev = NULL;
991 	struct ip_tunnel *tunnel;
992 	const struct iphdr *iph;
993 	int hlen = LL_MAX_HEADER;
994 	int mtu = ETH_DATA_LEN;
995 	int addend = sizeof(struct iphdr) + 4;
996 
997 	tunnel = netdev_priv(dev);
998 	iph = &tunnel->parms.iph;
999 
1000 	/* Guess the output device to choose a reasonable mtu and needed_headroom */
1001 
1002 	if (iph->daddr) {
1003 		struct flowi4 fl4;
1004 		struct rtable *rt;
1005 
1006 		rt = ip_route_output_gre(dev_net(dev), &fl4,
1007 					 iph->daddr, iph->saddr,
1008 					 tunnel->parms.o_key,
1009 					 RT_TOS(iph->tos),
1010 					 tunnel->parms.link);
1011 		if (!IS_ERR(rt)) {
1012 			tdev = rt->dst.dev;
1013 			ip_rt_put(rt);
1014 		}
1015 
1016 		if (dev->type != ARPHRD_ETHER)
1017 			dev->flags |= IFF_POINTOPOINT;
1018 	}
1019 
1020 	if (!tdev && tunnel->parms.link)
1021 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
1022 
1023 	if (tdev) {
1024 		hlen = tdev->hard_header_len + tdev->needed_headroom;
1025 		mtu = tdev->mtu;
1026 	}
1027 	dev->iflink = tunnel->parms.link;
1028 
1029 	/* Precalculate GRE options length */
1030 	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
1031 		if (tunnel->parms.o_flags&GRE_CSUM)
1032 			addend += 4;
1033 		if (tunnel->parms.o_flags&GRE_KEY)
1034 			addend += 4;
1035 		if (tunnel->parms.o_flags&GRE_SEQ)
1036 			addend += 4;
1037 	}
1038 	dev->needed_headroom = addend + hlen;
1039 	mtu -= dev->hard_header_len + addend;
1040 
1041 	if (mtu < 68)
1042 		mtu = 68;
1043 
1044 	tunnel->hlen = addend;
1045 
1046 	return mtu;
1047 }
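/* Worked example under the defaults: over a 1500-byte Ethernet link, a
 * plain keyless tunnel has addend = 20 (IP) + 4 (GRE base) = 24, giving
 * the familiar GRE mtu of 1476; enabling both key and checksum grows the
 * addend to 32 and shrinks the mtu to 1468.
 */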
1048 
1049 static int
1050 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
1051 {
1052 	int err = 0;
1053 	struct ip_tunnel_parm p;
1054 	struct ip_tunnel *t;
1055 	struct net *net = dev_net(dev);
1056 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1057 
1058 	switch (cmd) {
1059 	case SIOCGETTUNNEL:
1060 		t = NULL;
1061 		if (dev == ign->fb_tunnel_dev) {
1062 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
1063 				err = -EFAULT;
1064 				break;
1065 			}
1066 			t = ipgre_tunnel_locate(net, &p, 0);
1067 		}
1068 		if (t == NULL)
1069 			t = netdev_priv(dev);
1070 		memcpy(&p, &t->parms, sizeof(p));
1071 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1072 			err = -EFAULT;
1073 		break;
1074 
1075 	case SIOCADDTUNNEL:
1076 	case SIOCCHGTUNNEL:
1077 		err = -EPERM;
1078 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1079 			goto done;
1080 
1081 		err = -EFAULT;
1082 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1083 			goto done;
1084 
1085 		err = -EINVAL;
1086 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
1087 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1088 		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1089 			goto done;
1090 		if (p.iph.ttl)
1091 			p.iph.frag_off |= htons(IP_DF);
1092 
1093 		if (!(p.i_flags&GRE_KEY))
1094 			p.i_key = 0;
1095 		if (!(p.o_flags&GRE_KEY))
1096 			p.o_key = 0;
1097 
1098 		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1099 
1100 		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1101 			if (t != NULL) {
1102 				if (t->dev != dev) {
1103 					err = -EEXIST;
1104 					break;
1105 				}
1106 			} else {
1107 				unsigned int nflags = 0;
1108 
1109 				t = netdev_priv(dev);
1110 
1111 				if (ipv4_is_multicast(p.iph.daddr))
1112 					nflags = IFF_BROADCAST;
1113 				else if (p.iph.daddr)
1114 					nflags = IFF_POINTOPOINT;
1115 
1116 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1117 					err = -EINVAL;
1118 					break;
1119 				}
1120 				ipgre_tunnel_unlink(ign, t);
1121 				synchronize_net();
1122 				t->parms.iph.saddr = p.iph.saddr;
1123 				t->parms.iph.daddr = p.iph.daddr;
1124 				t->parms.i_key = p.i_key;
1125 				t->parms.o_key = p.o_key;
1126 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
1127 				memcpy(dev->broadcast, &p.iph.daddr, 4);
1128 				ipgre_tunnel_link(ign, t);
1129 				netdev_state_change(dev);
1130 			}
1131 		}
1132 
1133 		if (t) {
1134 			err = 0;
1135 			if (cmd == SIOCCHGTUNNEL) {
1136 				t->parms.iph.ttl = p.iph.ttl;
1137 				t->parms.iph.tos = p.iph.tos;
1138 				t->parms.iph.frag_off = p.iph.frag_off;
1139 				if (t->parms.link != p.link) {
1140 					t->parms.link = p.link;
1141 					dev->mtu = ipgre_tunnel_bind_dev(dev);
1142 					netdev_state_change(dev);
1143 				}
1144 			}
1145 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1146 				err = -EFAULT;
1147 		} else
1148 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1149 		break;
1150 
1151 	case SIOCDELTUNNEL:
1152 		err = -EPERM;
1153 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1154 			goto done;
1155 
1156 		if (dev == ign->fb_tunnel_dev) {
1157 			err = -EFAULT;
1158 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1159 				goto done;
1160 			err = -ENOENT;
1161 			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1162 				goto done;
1163 			err = -EPERM;
1164 			if (t == netdev_priv(ign->fb_tunnel_dev))
1165 				goto done;
1166 			dev = t->dev;
1167 		}
1168 		unregister_netdevice(dev);
1169 		err = 0;
1170 		break;
1171 
1172 	default:
1173 		err = -EINVAL;
1174 	}
1175 
1176 done:
1177 	return err;
1178 }
1179 
1180 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1181 {
1182 	struct ip_tunnel *tunnel = netdev_priv(dev);
1183 	if (new_mtu < 68 ||
1184 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1185 		return -EINVAL;
1186 	dev->mtu = new_mtu;
1187 	return 0;
1188 }
1189 
1190 /* Nice toy. Unfortunately, useless in real life :-)
1191    It allows one to construct a virtual multiprotocol broadcast "LAN"
1192    over the Internet, provided multicast routing is tuned.
1193 
1194 
1195    I have no idea whether this bicycle was invented before me,
1196    so I had to set ARPHRD_IPGRE to a random value.
1197    I have the impression that Cisco could have made something similar,
1198    but this feature is apparently missing in IOS<=11.2(8).
1199 
1200    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1201    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1202 
1203    ping -t 255 224.66.66.66
1204 
1205    If nobody answers, mbone does not work.
1206 
1207    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1208    ip addr add 10.66.66.<somewhat>/24 dev Universe
1209    ifconfig Universe up
1210    ifconfig Universe add fe80::<Your_real_addr>/10
1211    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1212    ftp 10.66.66.66
1213    ...
1214    ftp fec0:6666:6666::193.233.7.65
1215    ...
1216 
1217  */
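/* The same experiment using only modern iproute2 (equivalent commands,
 * assuming a current ip(8); addresses are the playground values above):
 *
 *	ip tunnel add Universe mode gre remote 224.66.66.66 \
 *		local <Your_real_addr> ttl 255
 *	ip link set Universe up
 *	ip addr add 10.66.66.<somewhat>/24 dev Universe
 *	ip addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
 */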
1218 
1219 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1220 			unsigned short type,
1221 			const void *daddr, const void *saddr, unsigned int len)
1222 {
1223 	struct ip_tunnel *t = netdev_priv(dev);
1224 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1225 	__be16 *p = (__be16 *)(iph+1);
1226 
1227 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1228 	p[0]		= t->parms.o_flags;
1229 	p[1]		= htons(type);
1230 
1231 	/*
1232 	 *	Set the source hardware address.
1233 	 */
1234 
1235 	if (saddr)
1236 		memcpy(&iph->saddr, saddr, 4);
1237 	if (daddr)
1238 		memcpy(&iph->daddr, daddr, 4);
1239 	if (iph->daddr)
1240 		return t->hlen;
1241 
1242 	return -t->hlen;
1243 }
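/* Note on the return convention: header_ops->create() returns the number
 * of bytes pushed when the header is complete, and minus that length when
 * the destination is still unknown (daddr == 0 here), so callers such as
 * the neighbour code can tell the header still needs to be finished.
 */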
1244 
1245 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1246 {
1247 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1248 	memcpy(haddr, &iph->saddr, 4);
1249 	return 4;
1250 }
1251 
1252 static const struct header_ops ipgre_header_ops = {
1253 	.create	= ipgre_header,
1254 	.parse	= ipgre_header_parse,
1255 };
1256 
1257 #ifdef CONFIG_NET_IPGRE_BROADCAST
1258 static int ipgre_open(struct net_device *dev)
1259 {
1260 	struct ip_tunnel *t = netdev_priv(dev);
1261 
1262 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
1263 		struct flowi4 fl4;
1264 		struct rtable *rt;
1265 
1266 		rt = ip_route_output_gre(dev_net(dev), &fl4,
1267 					 t->parms.iph.daddr,
1268 					 t->parms.iph.saddr,
1269 					 t->parms.o_key,
1270 					 RT_TOS(t->parms.iph.tos),
1271 					 t->parms.link);
1272 		if (IS_ERR(rt))
1273 			return -EADDRNOTAVAIL;
1274 		dev = rt->dst.dev;
1275 		ip_rt_put(rt);
1276 		if (__in_dev_get_rtnl(dev) == NULL)
1277 			return -EADDRNOTAVAIL;
1278 		t->mlink = dev->ifindex;
1279 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1280 	}
1281 	return 0;
1282 }
1283 
1284 static int ipgre_close(struct net_device *dev)
1285 {
1286 	struct ip_tunnel *t = netdev_priv(dev);
1287 
1288 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1289 		struct in_device *in_dev;
1290 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1291 		if (in_dev)
1292 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1293 	}
1294 	return 0;
1295 }
1296 
1297 #endif
1298 
1299 static const struct net_device_ops ipgre_netdev_ops = {
1300 	.ndo_init		= ipgre_tunnel_init,
1301 	.ndo_uninit		= ipgre_tunnel_uninit,
1302 #ifdef CONFIG_NET_IPGRE_BROADCAST
1303 	.ndo_open		= ipgre_open,
1304 	.ndo_stop		= ipgre_close,
1305 #endif
1306 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1307 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
1308 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1309 	.ndo_get_stats64	= ipgre_get_stats64,
1310 };
1311 
1312 static void ipgre_dev_free(struct net_device *dev)
1313 {
1314 	struct ip_tunnel *tunnel = netdev_priv(dev);
1315 
1316 	gro_cells_destroy(&tunnel->gro_cells);
1317 	free_percpu(dev->tstats);
1318 	free_netdev(dev);
1319 }
1320 
1321 #define GRE_FEATURES (NETIF_F_SG |		\
1322 		      NETIF_F_FRAGLIST |	\
1323 		      NETIF_F_HIGHDMA |		\
1324 		      NETIF_F_HW_CSUM)
1325 
1326 static void ipgre_tunnel_setup(struct net_device *dev)
1327 {
1328 	dev->netdev_ops		= &ipgre_netdev_ops;
1329 	dev->destructor 	= ipgre_dev_free;
1330 
1331 	dev->type		= ARPHRD_IPGRE;
1332 	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1333 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1334 	dev->flags		= IFF_NOARP;
1335 	dev->iflink		= 0;
1336 	dev->addr_len		= 4;
1337 	dev->features		|= NETIF_F_NETNS_LOCAL;
1338 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
1339 
1340 	dev->features		|= GRE_FEATURES;
1341 	dev->hw_features	|= GRE_FEATURES;
1342 }
1343 
1344 static int ipgre_tunnel_init(struct net_device *dev)
1345 {
1346 	struct ip_tunnel *tunnel;
1347 	struct iphdr *iph;
1348 	int err;
1349 
1350 	tunnel = netdev_priv(dev);
1351 	iph = &tunnel->parms.iph;
1352 
1353 	tunnel->dev = dev;
1354 	strcpy(tunnel->parms.name, dev->name);
1355 
1356 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1357 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1358 
1359 	if (iph->daddr) {
1360 #ifdef CONFIG_NET_IPGRE_BROADCAST
1361 		if (ipv4_is_multicast(iph->daddr)) {
1362 			if (!iph->saddr)
1363 				return -EINVAL;
1364 			dev->flags = IFF_BROADCAST;
1365 			dev->header_ops = &ipgre_header_ops;
1366 		}
1367 #endif
1368 	} else
1369 		dev->header_ops = &ipgre_header_ops;
1370 
1371 	dev->tstats = alloc_percpu(struct pcpu_tstats);
1372 	if (!dev->tstats)
1373 		return -ENOMEM;
1374 
1375 	err = gro_cells_init(&tunnel->gro_cells, dev);
1376 	if (err) {
1377 		free_percpu(dev->tstats);
1378 		return err;
1379 	}
1380 
1381 	return 0;
1382 }
1383 
1384 static void ipgre_fb_tunnel_init(struct net_device *dev)
1385 {
1386 	struct ip_tunnel *tunnel = netdev_priv(dev);
1387 	struct iphdr *iph = &tunnel->parms.iph;
1388 
1389 	tunnel->dev = dev;
1390 	strcpy(tunnel->parms.name, dev->name);
1391 
1392 	iph->version		= 4;
1393 	iph->protocol		= IPPROTO_GRE;
1394 	iph->ihl		= 5;
1395 	tunnel->hlen		= sizeof(struct iphdr) + 4;
1396 
1397 	dev_hold(dev);
1398 }
1399 
1400 
1401 static const struct gre_protocol ipgre_protocol = {
1402 	.handler     = ipgre_rcv,
1403 	.err_handler = ipgre_err,
1404 };
1405 
1406 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1407 {
1408 	int prio;
1409 
1410 	for (prio = 0; prio < 4; prio++) {
1411 		int h;
1412 		for (h = 0; h < HASH_SIZE; h++) {
1413 			struct ip_tunnel *t;
1414 
1415 			t = rtnl_dereference(ign->tunnels[prio][h]);
1416 
1417 			while (t != NULL) {
1418 				unregister_netdevice_queue(t->dev, head);
1419 				t = rtnl_dereference(t->next);
1420 			}
1421 		}
1422 	}
1423 }
1424 
1425 static int __net_init ipgre_init_net(struct net *net)
1426 {
1427 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1428 	int err;
1429 
1430 	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1431 					   ipgre_tunnel_setup);
1432 	if (!ign->fb_tunnel_dev) {
1433 		err = -ENOMEM;
1434 		goto err_alloc_dev;
1435 	}
1436 	dev_net_set(ign->fb_tunnel_dev, net);
1437 
1438 	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1439 	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1440 
1441 	if ((err = register_netdev(ign->fb_tunnel_dev)))
1442 		goto err_reg_dev;
1443 
1444 	rcu_assign_pointer(ign->tunnels_wc[0],
1445 			   netdev_priv(ign->fb_tunnel_dev));
1446 	return 0;
1447 
1448 err_reg_dev:
1449 	ipgre_dev_free(ign->fb_tunnel_dev);
1450 err_alloc_dev:
1451 	return err;
1452 }
1453 
1454 static void __net_exit ipgre_exit_net(struct net *net)
1455 {
1456 	struct ipgre_net *ign;
1457 	LIST_HEAD(list);
1458 
1459 	ign = net_generic(net, ipgre_net_id);
1460 	rtnl_lock();
1461 	ipgre_destroy_tunnels(ign, &list);
1462 	unregister_netdevice_many(&list);
1463 	rtnl_unlock();
1464 }
1465 
1466 static struct pernet_operations ipgre_net_ops = {
1467 	.init = ipgre_init_net,
1468 	.exit = ipgre_exit_net,
1469 	.id   = &ipgre_net_id,
1470 	.size = sizeof(struct ipgre_net),
1471 };
1472 
1473 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1474 {
1475 	__be16 flags;
1476 
1477 	if (!data)
1478 		return 0;
1479 
1480 	flags = 0;
1481 	if (data[IFLA_GRE_IFLAGS])
1482 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1483 	if (data[IFLA_GRE_OFLAGS])
1484 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1485 	if (flags & (GRE_VERSION|GRE_ROUTING))
1486 		return -EINVAL;
1487 
1488 	return 0;
1489 }
1490 
1491 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1492 {
1493 	__be32 daddr;
1494 
1495 	if (tb[IFLA_ADDRESS]) {
1496 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1497 			return -EINVAL;
1498 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1499 			return -EADDRNOTAVAIL;
1500 	}
1501 
1502 	if (!data)
1503 		goto out;
1504 
1505 	if (data[IFLA_GRE_REMOTE]) {
1506 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1507 		if (!daddr)
1508 			return -EINVAL;
1509 	}
1510 
1511 out:
1512 	return ipgre_tunnel_validate(tb, data);
1513 }
1514 
1515 static void ipgre_netlink_parms(struct nlattr *data[],
1516 				struct ip_tunnel_parm *parms)
1517 {
1518 	memset(parms, 0, sizeof(*parms));
1519 
1520 	parms->iph.protocol = IPPROTO_GRE;
1521 
1522 	if (!data)
1523 		return;
1524 
1525 	if (data[IFLA_GRE_LINK])
1526 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1527 
1528 	if (data[IFLA_GRE_IFLAGS])
1529 		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1530 
1531 	if (data[IFLA_GRE_OFLAGS])
1532 		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1533 
1534 	if (data[IFLA_GRE_IKEY])
1535 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1536 
1537 	if (data[IFLA_GRE_OKEY])
1538 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1539 
1540 	if (data[IFLA_GRE_LOCAL])
1541 		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1542 
1543 	if (data[IFLA_GRE_REMOTE])
1544 		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1545 
1546 	if (data[IFLA_GRE_TTL])
1547 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1548 
1549 	if (data[IFLA_GRE_TOS])
1550 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1551 
1552 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1553 		parms->iph.frag_off = htons(IP_DF);
1554 }
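/* For illustration, a typical rtnetlink user of these attributes (modern
 * iproute2 syntax; as far as iproute2 goes, "key 42" fills IFLA_GRE_IKEY
 * and IFLA_GRE_OKEY and sets GRE_KEY in both flag attributes):
 *
 *	ip link add gre1 type gre local 10.0.0.1 remote 10.0.0.2 \
 *		ttl 64 key 42
 *
 * Omitting "nopmtudisc" leaves IFLA_GRE_PMTUDISC at 1, which is why the
 * default above forces DF on the outer header.
 */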
1555 
1556 static int ipgre_tap_init(struct net_device *dev)
1557 {
1558 	struct ip_tunnel *tunnel;
1559 
1560 	tunnel = netdev_priv(dev);
1561 
1562 	tunnel->dev = dev;
1563 	strcpy(tunnel->parms.name, dev->name);
1564 
1565 	ipgre_tunnel_bind_dev(dev);
1566 
1567 	dev->tstats = alloc_percpu(struct pcpu_tstats);
1568 	if (!dev->tstats)
1569 		return -ENOMEM;
1570 
1571 	return 0;
1572 }
1573 
1574 static const struct net_device_ops ipgre_tap_netdev_ops = {
1575 	.ndo_init		= ipgre_tap_init,
1576 	.ndo_uninit		= ipgre_tunnel_uninit,
1577 	.ndo_start_xmit		= ipgre_tunnel_xmit,
1578 	.ndo_set_mac_address 	= eth_mac_addr,
1579 	.ndo_validate_addr	= eth_validate_addr,
1580 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
1581 	.ndo_get_stats64	= ipgre_get_stats64,
1582 };
1583 
1584 static void ipgre_tap_setup(struct net_device *dev)
1585 {
1586 
1587 	ether_setup(dev);
1588 
1589 	dev->netdev_ops		= &ipgre_tap_netdev_ops;
1590 	dev->destructor 	= ipgre_dev_free;
1591 
1592 	dev->iflink		= 0;
1593 	dev->features		|= NETIF_F_NETNS_LOCAL;
1594 }
1595 
1596 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1597 			 struct nlattr *data[])
1598 {
1599 	struct ip_tunnel *nt;
1600 	struct net *net = dev_net(dev);
1601 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1602 	int mtu;
1603 	int err;
1604 
1605 	nt = netdev_priv(dev);
1606 	ipgre_netlink_parms(data, &nt->parms);
1607 
1608 	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1609 		return -EEXIST;
1610 
1611 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1612 		eth_hw_addr_random(dev);
1613 
1614 	mtu = ipgre_tunnel_bind_dev(dev);
1615 	if (!tb[IFLA_MTU])
1616 		dev->mtu = mtu;
1617 
1618 	/* Can use a lockless transmit, unless we generate output sequences */
1619 	if (!(nt->parms.o_flags & GRE_SEQ))
1620 		dev->features |= NETIF_F_LLTX;
1621 
1622 	err = register_netdevice(dev);
1623 	if (err)
1624 		goto out;
1625 
1626 	dev_hold(dev);
1627 	ipgre_tunnel_link(ign, nt);
1628 
1629 out:
1630 	return err;
1631 }
1632 
1633 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1634 			    struct nlattr *data[])
1635 {
1636 	struct ip_tunnel *t, *nt;
1637 	struct net *net = dev_net(dev);
1638 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1639 	struct ip_tunnel_parm p;
1640 	int mtu;
1641 
1642 	if (dev == ign->fb_tunnel_dev)
1643 		return -EINVAL;
1644 
1645 	nt = netdev_priv(dev);
1646 	ipgre_netlink_parms(data, &p);
1647 
1648 	t = ipgre_tunnel_locate(net, &p, 0);
1649 
1650 	if (t) {
1651 		if (t->dev != dev)
1652 			return -EEXIST;
1653 	} else {
1654 		t = nt;
1655 
1656 		if (dev->type != ARPHRD_ETHER) {
1657 			unsigned int nflags = 0;
1658 
1659 			if (ipv4_is_multicast(p.iph.daddr))
1660 				nflags = IFF_BROADCAST;
1661 			else if (p.iph.daddr)
1662 				nflags = IFF_POINTOPOINT;
1663 
1664 			if ((dev->flags ^ nflags) &
1665 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1666 				return -EINVAL;
1667 		}
1668 
1669 		ipgre_tunnel_unlink(ign, t);
1670 		t->parms.iph.saddr = p.iph.saddr;
1671 		t->parms.iph.daddr = p.iph.daddr;
1672 		t->parms.i_key = p.i_key;
1673 		if (dev->type != ARPHRD_ETHER) {
1674 			memcpy(dev->dev_addr, &p.iph.saddr, 4);
1675 			memcpy(dev->broadcast, &p.iph.daddr, 4);
1676 		}
1677 		ipgre_tunnel_link(ign, t);
1678 		netdev_state_change(dev);
1679 	}
1680 
1681 	t->parms.o_key = p.o_key;
1682 	t->parms.iph.ttl = p.iph.ttl;
1683 	t->parms.iph.tos = p.iph.tos;
1684 	t->parms.iph.frag_off = p.iph.frag_off;
1685 
1686 	if (t->parms.link != p.link) {
1687 		t->parms.link = p.link;
1688 		mtu = ipgre_tunnel_bind_dev(dev);
1689 		if (!tb[IFLA_MTU])
1690 			dev->mtu = mtu;
1691 		netdev_state_change(dev);
1692 	}
1693 
1694 	return 0;
1695 }
1696 
1697 static size_t ipgre_get_size(const struct net_device *dev)
1698 {
1699 	return
1700 		/* IFLA_GRE_LINK */
1701 		nla_total_size(4) +
1702 		/* IFLA_GRE_IFLAGS */
1703 		nla_total_size(2) +
1704 		/* IFLA_GRE_OFLAGS */
1705 		nla_total_size(2) +
1706 		/* IFLA_GRE_IKEY */
1707 		nla_total_size(4) +
1708 		/* IFLA_GRE_OKEY */
1709 		nla_total_size(4) +
1710 		/* IFLA_GRE_LOCAL */
1711 		nla_total_size(4) +
1712 		/* IFLA_GRE_REMOTE */
1713 		nla_total_size(4) +
1714 		/* IFLA_GRE_TTL */
1715 		nla_total_size(1) +
1716 		/* IFLA_GRE_TOS */
1717 		nla_total_size(1) +
1718 		/* IFLA_GRE_PMTUDISC */
1719 		nla_total_size(1) +
1720 		0;
1721 }
1722 
1723 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1724 {
1725 	struct ip_tunnel *t = netdev_priv(dev);
1726 	struct ip_tunnel_parm *p = &t->parms;
1727 
1728 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1729 	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
1730 	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
1731 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1732 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1733 	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1734 	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1735 	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1736 	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1737 	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1738 		       !!(p->iph.frag_off & htons(IP_DF))))
1739 		goto nla_put_failure;
1740 	return 0;
1741 
1742 nla_put_failure:
1743 	return -EMSGSIZE;
1744 }
1745 
1746 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1747 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1748 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1749 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1750 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1751 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1752 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1753 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1754 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1755 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1756 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
1757 };
1758 
1759 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1760 	.kind		= "gre",
1761 	.maxtype	= IFLA_GRE_MAX,
1762 	.policy		= ipgre_policy,
1763 	.priv_size	= sizeof(struct ip_tunnel),
1764 	.setup		= ipgre_tunnel_setup,
1765 	.validate	= ipgre_tunnel_validate,
1766 	.newlink	= ipgre_newlink,
1767 	.changelink	= ipgre_changelink,
1768 	.get_size	= ipgre_get_size,
1769 	.fill_info	= ipgre_fill_info,
1770 };
1771 
1772 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1773 	.kind		= "gretap",
1774 	.maxtype	= IFLA_GRE_MAX,
1775 	.policy		= ipgre_policy,
1776 	.priv_size	= sizeof(struct ip_tunnel),
1777 	.setup		= ipgre_tap_setup,
1778 	.validate	= ipgre_tap_validate,
1779 	.newlink	= ipgre_newlink,
1780 	.changelink	= ipgre_changelink,
1781 	.get_size	= ipgre_get_size,
1782 	.fill_info	= ipgre_fill_info,
1783 };
1784 
1785 /*
1786  *	And now the module code and kernel interface.
1787  */
1788 
1789 static int __init ipgre_init(void)
1790 {
1791 	int err;
1792 
1793 	pr_info("GRE over IPv4 tunneling driver\n");
1794 
1795 	err = register_pernet_device(&ipgre_net_ops);
1796 	if (err < 0)
1797 		return err;
1798 
1799 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1800 	if (err < 0) {
1801 		pr_info("%s: can't add protocol\n", __func__);
1802 		goto add_proto_failed;
1803 	}
1804 
1805 	err = rtnl_link_register(&ipgre_link_ops);
1806 	if (err < 0)
1807 		goto rtnl_link_failed;
1808 
1809 	err = rtnl_link_register(&ipgre_tap_ops);
1810 	if (err < 0)
1811 		goto tap_ops_failed;
1812 
1813 out:
1814 	return err;
1815 
1816 tap_ops_failed:
1817 	rtnl_link_unregister(&ipgre_link_ops);
1818 rtnl_link_failed:
1819 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1820 add_proto_failed:
1821 	unregister_pernet_device(&ipgre_net_ops);
1822 	goto out;
1823 }
1824 
1825 static void __exit ipgre_fini(void)
1826 {
1827 	rtnl_link_unregister(&ipgre_tap_ops);
1828 	rtnl_link_unregister(&ipgre_link_ops);
1829 	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1830 		pr_info("%s: can't remove protocol\n", __func__);
1831 	unregister_pernet_device(&ipgre_net_ops);
1832 }
1833 
1834 module_init(ipgre_init);
1835 module_exit(ipgre_fini);
1836 MODULE_LICENSE("GPL");
1837 MODULE_ALIAS_RTNL_LINK("gre");
1838 MODULE_ALIAS_RTNL_LINK("gretap");
1839 MODULE_ALIAS_NETDEV("gre0");