/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter; since cpu migration is forbidden once we enter the first
   ndo_xmit(), the counter is reliable. We force an exit if it reaches
   RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would
   really kill the network. The IP hop limit plays the role of
   "t->recursion" in this case, if we copy it from the packet being
   encapsulated to the upper header. It is a very good solution, but it
   introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least in my
     neighbourhood) return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. To be short, ttl is not
   a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value < 68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulating packets have
   DF set. But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it is your gated that injected the
   fatal route into the network, even if it were you who configured the
   fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
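
/*
 * A minimal sketch of the xmit_recursion guard described above, under
 * the assumption of a percpu counter and a small RECURSION_LIMIT; the
 * names here are illustrative, not the exact core-networking code:
 *
 *	#define RECURSION_LIMIT 8
 *	static DEFINE_PER_CPU(unsigned int, xmit_recursion);
 *
 *	static int guarded_xmit(struct sk_buff *skb)
 *	{
 *		int rc;
 *
 *		if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) {
 *			kfree_skb(skb);		// break the local dead loop
 *			return -ELOOP;
 *		}
 *		__this_cpu_inc(xmit_recursion);
 *		rc = dev_queue_xmit(skb);
 *		__this_cpu_dec(xmit_recursion);
 *		return rc;
 *	}
 *
 * This only works because cpu migration is forbidden inside the first
 * ndo_xmit(), so the percpu counter cannot be torn across cpus.
 */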

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]

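/*
 * Worked example of the 4-bit HASH() fold above: treating an address as
 * the raw 32-bit value 0xc0000201, addr >> 4 is 0x0c000020, the xor
 * gives 0xcc000221, and masking with 0xF selects bucket 1.  Keys and
 * remote addresses are hashed independently and xored together when
 * both are present (see __ipgre_bucket() below).
 */
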
static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
						   struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
	tot->rx_errors = dev->stats.rx_errors;

	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
}
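
/*
 * The fetch/retry loop above pairs with the writer side used on the
 * receive path (see ipgre_rcv() below); a minimal sketch of the
 * protocol, illustrative only:
 *
 *	u64_stats_update_begin(&tstats->syncp);
 *	tstats->rx_packets++;
 *	tstats->rx_bytes += skb->len;
 *	u64_stats_update_end(&tstats->syncp);
 *
 * On 64-bit kernels the syncp compiles away; on 32-bit it is a
 * seqcount that forces the reader to retry if it raced with an update.
 */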

/* Does the key in the tunnel parameters match the packet? */
static bool ipgre_key_match(const struct ip_tunnel_parm *p,
			    __be16 flags, __be32 key)
{
	if (p->i_flags & GRE_KEY) {
		if (flags & GRE_KEY)
			return key == p->i_key;
		else
			return false;	/* key expected, none present */
	} else
		return !(flags & GRE_KEY);
}
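
/*
 * The four cases of ipgre_key_match(), spelled out:
 *
 *	tunnel keyed	packet keyed	result
 *	yes		yes		key == p->i_key
 *	yes		no		false (key expected, none present)
 *	no		yes		false (unexpected key)
 *	no		no		true
 */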

/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be16 flags, __be32 key,
					     __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ipgre_key_match(&t->parms, flags, key))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
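
/*
 * Lookup scoring, summarized: within each chain a candidate scores 0 if
 * both its bound link and its device type match the packet, 1 if only
 * the link differs, 2 if only the type differs, 3 if both differ.  A
 * score of 0 returns immediately; otherwise the lowest score seen while
 * walking (remote,local) -> (remote,*) -> (*,local) -> (*,*) wins, and
 * the fallback tunnel catches whatever is left.
 */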

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}
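
/*
 * Example of bucket selection, assuming a tunnel with local 10.0.0.1,
 * remote 10.0.0.2 and no key: both addresses are set and the remote is
 * unicast, so prio = 1|2 = 3 (the tunnels_r_l table) and the slot is
 * HASH(0) ^ HASH(remote).  A keyless wildcard tunnel (no local, no
 * remote) lands in tunnels_wc[HASH(0)], i.e. tunnels[0][0].
 */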

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco's "wise men" put the GRE key in the third word
   of the GRE header, which makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee, so
   what the hell makes these guys break the standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes were returned, keyed messages will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}
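
/*
 * On-wire layout that ipgre_err() above and ipgre_rcv() below walk,
 * per rfc2784/rfc2890 (optional words present only when the C, K or S
 * flag bits are set):
 *
 *	|C| |K|S|  Reserved0  | Ver |       Protocol Type        |
 *	|    Checksum (if C)        |      Reserved1 (if C)      |
 *	|                      Key (if K)                        |
 *	|                Sequence Number (if S)                  |
 *
 * Hence grehlen starts at iph->ihl*4 + 4 and grows by 4 for the key,
 * plus another 4 when a checksum word precedes it.
 */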

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
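
/*
 * INET_ECN_encapsulate() keeps the outer DSCP bits and derives the
 * outer ECN field from the inner one; e.g. an outer tos of 0x00 with an
 * inner ECT(0) packet (0x02) yields 0x02, a non-ECT inner packet leaves
 * the outer field at Not-ECT, and an inner CE mark is carried as ECT(0)
 * so that a fresh CE mark on the outer header remains meaningful.
 */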

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;
	int    err;

	if (!pskb_may_pull(skb, 16))
		goto drop;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change the protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto drop;
			}
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		gro_cells_receive(&tunnel->gro_cells, skb);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;
	u8     ttl;

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    skb_checksum_help(skb))
		goto tx_error;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tiph = (const struct iphdr *)skb->data;
		else
			tiph = &tunnel->parms.iph;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, old_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	ttl = tiph->ttl;
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
		/* Warning: tiph might point to freed memory */
	}

	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	skb_set_transport_header(skb, sizeof(*iph));
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the outer IP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;
	iph->ttl		=	ttl;

	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	iptunnel_xmit(skb, dev);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
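
/*
 * Worked example of the arithmetic above, assuming an ethernet underlay
 * (tdev->hard_header_len = 14, tdev->needed_headroom = 0, tdev->mtu =
 * 1500) and a tunnel with checksum + key enabled: addend = 20 + 4 + 4 +
 * 4 = 32, dev->needed_headroom = 32 + 14 = 46, and the returned mtu is
 * 1500 - 0 - 32 = 1468 (dev->hard_header_len is 0 for an ARPHRD_IPGRE
 * device).
 */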

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
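
/*
 * The 0xFFF8 bound above is the maximum IP total length (65535) rounded
 * down to a multiple of 8 for fragmentation; e.g. for a keyed tunnel
 * without checksums (tunnel->hlen = 28) on a device with no link-layer
 * header, the largest accepted mtu is 0xFFF8 - 0 - 28 = 65500.
 */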

/* A nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play
   with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int err;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
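
/*
 * The attributes parsed above map directly onto iproute2 usage; an
 * illustrative command for the "gre" rtnl kind registered below:
 *
 *	ip link add gre1 type gre local 203.0.113.1 remote 203.0.113.2 \
 *		ttl 64 key 1234
 *
 * fills in IFLA_GRE_LOCAL/REMOTE/TTL and both IFLA_GRE_IKEY and
 * IFLA_GRE_OKEY; leaving "nopmtudisc" off leaves IFLA_GRE_PMTUDISC
 * unset, so the last branch above forces the DF bit on.
 */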

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");