xref: /openbmc/linux/net/ipv4/ip_gre.c (revision 05bcf503)
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local TTL)
   and silently drop the packet when it expires. It is a good
   solution, but it would require maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter; since CPU migration is forbidden once we enter the first
   ndo_xmit(), the counter is reliable. We force an exit if it reaches
   RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would
   really kill the network. The IP hop limit plays the role of
   "t->recursion" in this case, if we copy it from the packet being
   encapsulated to the upper header. It is a very good solution, but
   it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea turned out to be wrong:
     only Linux complies with RFC 1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least those in my
     neighbourhood) return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. That is difficult or even impossible,
   especially taking fragmentation into account. In short, TTL is not
   a solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit;
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the PMTU are pruned) and the tunnel MTU
   rapidly degrades to a value < 68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF even when the encapsulated packets have DF
   set. But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it was your gated that injected the
   fatal route into the network, even if it was you who configured the
   fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not obvious how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   into a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */

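/*
 * A minimal sketch (not part of this driver) of the xmit_recursion
 * guard described above.  RECURSION_LIMIT and the per-cpu
 * xmit_recursion counter live in the core networking code; the helper
 * below is an assumed, simplified illustration of the pattern, not the
 * real dev_queue_xmit() implementation:
 *
 *	#define RECURSION_LIMIT 10
 *	static DEFINE_PER_CPU(int, xmit_recursion);
 *
 *	static int guarded_xmit(struct sk_buff *skb)
 *	{
 *		int rc;
 *
 *		if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) {
 *			kfree_skb(skb);			(break the dead loop)
 *			return -ELOOP;
 *		}
 *		__this_cpu_inc(xmit_recursion);
 *		rc = do_xmit(skb);			(hypothetical transmit hook)
 *		__this_cpu_dec(xmit_recursion);
 *		return rc;
 *	}
 */
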
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the
   packet it will match only a tunnel with the same key; if it is not
   present, it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
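
/*
 * For illustration, HASH() XORs the low nibble of the force-cast value
 * with its second nibble and keeps 4 bits, so only the lowest byte
 * influences the bucket:
 *
 *	low byte 0x2A  ->  (0xA ^ 0x2) & 0xF = 0x8  ->  bucket 8
 *
 * Which octet of the big-endian IP address lands in that low byte
 * depends on host endianness; that is fine, since only consistency
 * (and rough spreading over the 16 buckets) matters here, not any
 * particular distribution.
 */
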
162 
163 #define tunnels_r_l	tunnels[3]
164 #define tunnels_r	tunnels[2]
165 #define tunnels_l	tunnels[1]
166 #define tunnels_wc	tunnels[0]
167 /*
168  * Locking : hash tables are protected by RCU and RTNL
169  */
170 
171 #define for_each_ip_tunnel_rcu(start) \
172 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
173 
174 /* often modified stats are per cpu, other are shared (netdev->stats) */
175 struct pcpu_tstats {
176 	u64	rx_packets;
177 	u64	rx_bytes;
178 	u64	tx_packets;
179 	u64	tx_bytes;
180 	struct u64_stats_sync	syncp;
181 };
182 
183 static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
184 						   struct rtnl_link_stats64 *tot)
185 {
186 	int i;
187 
188 	for_each_possible_cpu(i) {
189 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
190 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
191 		unsigned int start;
192 
193 		do {
194 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
195 			rx_packets = tstats->rx_packets;
196 			tx_packets = tstats->tx_packets;
197 			rx_bytes = tstats->rx_bytes;
198 			tx_bytes = tstats->tx_bytes;
199 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
200 
201 		tot->rx_packets += rx_packets;
202 		tot->tx_packets += tx_packets;
203 		tot->rx_bytes   += rx_bytes;
204 		tot->tx_bytes   += tx_bytes;
205 	}
206 
207 	tot->multicast = dev->stats.multicast;
208 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
209 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
210 	tot->rx_length_errors = dev->stats.rx_length_errors;
211 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
212 	tot->rx_errors = dev->stats.rx_errors;
213 
214 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
215 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
216 	tot->tx_dropped = dev->stats.tx_dropped;
217 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
218 	tot->tx_errors = dev->stats.tx_errors;
219 
220 	return tot;
221 }
222 
223 /* Does key in tunnel parameters match packet */
224 static bool ipgre_key_match(const struct ip_tunnel_parm *p,
225 			    __be16 flags, __be32 key)
226 {
227 	if (p->i_flags & GRE_KEY) {
228 		if (flags & GRE_KEY)
229 			return key == p->i_key;
230 		else
231 			return false;	/* key expected, none present */
232 	} else
233 		return !(flags & GRE_KEY);
234 }
235 
236 /* Given src, dst and key, find appropriate for input tunnel. */
237 
238 static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
239 					     __be32 remote, __be32 local,
240 					     __be16 flags, __be32 key,
241 					     __be16 gre_proto)
242 {
243 	struct net *net = dev_net(dev);
244 	int link = dev->ifindex;
245 	unsigned int h0 = HASH(remote);
246 	unsigned int h1 = HASH(key);
247 	struct ip_tunnel *t, *cand = NULL;
248 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
249 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
250 		       ARPHRD_ETHER : ARPHRD_IPGRE;
251 	int score, cand_score = 4;
252 
253 	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
254 		if (local != t->parms.iph.saddr ||
255 		    remote != t->parms.iph.daddr ||
256 		    !(t->dev->flags & IFF_UP))
257 			continue;
258 
259 		if (!ipgre_key_match(&t->parms, flags, key))
260 			continue;
261 
262 		if (t->dev->type != ARPHRD_IPGRE &&
263 		    t->dev->type != dev_type)
264 			continue;
265 
266 		score = 0;
267 		if (t->parms.link != link)
268 			score |= 1;
269 		if (t->dev->type != dev_type)
270 			score |= 2;
271 		if (score == 0)
272 			return t;
273 
274 		if (score < cand_score) {
275 			cand = t;
276 			cand_score = score;
277 		}
278 	}
279 
280 	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
281 		if (remote != t->parms.iph.daddr ||
282 		    !(t->dev->flags & IFF_UP))
283 			continue;
284 
285 		if (!ipgre_key_match(&t->parms, flags, key))
286 			continue;
287 
288 		if (t->dev->type != ARPHRD_IPGRE &&
289 		    t->dev->type != dev_type)
290 			continue;
291 
292 		score = 0;
293 		if (t->parms.link != link)
294 			score |= 1;
295 		if (t->dev->type != dev_type)
296 			score |= 2;
297 		if (score == 0)
298 			return t;
299 
300 		if (score < cand_score) {
301 			cand = t;
302 			cand_score = score;
303 		}
304 	}
305 
306 	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
307 		if ((local != t->parms.iph.saddr &&
308 		     (local != t->parms.iph.daddr ||
309 		      !ipv4_is_multicast(local))) ||
310 		    !(t->dev->flags & IFF_UP))
311 			continue;
312 
313 		if (!ipgre_key_match(&t->parms, flags, key))
314 			continue;
315 
316 		if (t->dev->type != ARPHRD_IPGRE &&
317 		    t->dev->type != dev_type)
318 			continue;
319 
320 		score = 0;
321 		if (t->parms.link != link)
322 			score |= 1;
323 		if (t->dev->type != dev_type)
324 			score |= 2;
325 		if (score == 0)
326 			return t;
327 
328 		if (score < cand_score) {
329 			cand = t;
330 			cand_score = score;
331 		}
332 	}
333 
334 	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
335 		if (t->parms.i_key != key ||
336 		    !(t->dev->flags & IFF_UP))
337 			continue;
338 
339 		if (t->dev->type != ARPHRD_IPGRE &&
340 		    t->dev->type != dev_type)
341 			continue;
342 
343 		score = 0;
344 		if (t->parms.link != link)
345 			score |= 1;
346 		if (t->dev->type != dev_type)
347 			score |= 2;
348 		if (score == 0)
349 			return t;
350 
351 		if (score < cand_score) {
352 			cand = t;
353 			cand_score = score;
354 		}
355 	}
356 
357 	if (cand != NULL)
358 		return cand;
359 
360 	dev = ign->fb_tunnel_dev;
361 	if (dev->flags & IFF_UP)
362 		return netdev_priv(dev);
363 
364 	return NULL;
365 }
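
/*
 * For reference, the scoring used by the lookup above: bit 0 is set on
 * an underlay link mismatch and bit 1 on a device type mismatch
 * (ARPHRD_IPGRE vs. ARPHRD_ETHER), so candidates rank as
 *
 *	score 0: link and type both match	(returned immediately)
 *	score 1: wrong link, right type
 *	score 2: right link, wrong type
 *	score 3: wrong link and wrong type	(worst still accepted)
 *
 * and the lowest-scoring candidate across all four tables wins.
 */
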
366 
367 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
368 		struct ip_tunnel_parm *parms)
369 {
370 	__be32 remote = parms->iph.daddr;
371 	__be32 local = parms->iph.saddr;
372 	__be32 key = parms->i_key;
373 	unsigned int h = HASH(key);
374 	int prio = 0;
375 
376 	if (local)
377 		prio |= 1;
378 	if (remote && !ipv4_is_multicast(remote)) {
379 		prio |= 2;
380 		h ^= HASH(remote);
381 	}
382 
383 	return &ign->tunnels[prio][h];
384 }
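
/*
 * Example bucket selection (illustrative): a tunnel with both a local
 * address and a unicast remote address gets prio 3 (tunnels_r_l) and
 * hashes on HASH(key) ^ HASH(remote); a tunnel with no local and no
 * (or a multicast) remote address stays at prio 0 (tunnels_wc) and
 * hashes on the key alone.
 */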

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key into the third word
   of the GRE header. That makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder: RFC 1812 was written by a Cisco employee, so
   why the hell do these idiots break the standards established
   by themselves???
 */

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;
	__be32 key = 0;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes were returned, a keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	if (flags & GRE_KEY)
		key = *(((__be32 *)p) + (grehlen / 4) - 1);

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags, key, p[1]);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;
	int    err;

	if (!pskb_may_pull(skb, 16))
		goto drop;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}
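
	/*
	 * For reference, the GRE header being parsed here
	 * (RFC 1701/2784/2890):
	 *
	 *	bytes 0-1: flags (C, K, S, R and version bits)
	 *	bytes 2-3: protocol type (e.g. ETH_P_IP, ETH_P_TEB)
	 *	+4 bytes:  checksum + reserved,	if GRE_CSUM
	 *	+4 bytes:  key,			if GRE_KEY
	 *	+4 bytes:  sequence number,	if GRE_SEQ
	 *
	 * which is why "offset" starts at 4 and grows by 4 for each
	 * optional field present.
	 */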

	gre_proto = *(__be16 *)(h + 2);

	tunnel = ipgre_tunnel_lookup(skb->dev,
				     iph->saddr, iph->daddr, flags, key,
				     gre_proto);
	if (tunnel) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto drop;
			}
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		gro_cells_receive(&tunnel->gro_cells, skb);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    skb_checksum_help(skb))
		goto tx_error;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, old_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the new outer IP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess the output device to choose a reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate the GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
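
/*
 * Worked example of the precalculation above: a tunnel with both
 * GRE_KEY and GRE_CSUM set gets addend = sizeof(struct iphdr) + 4
 * (base GRE header) + 4 (key) + 4 (checksum) = 32 bytes.  For a plain
 * gre device (hard_header_len 0) over a 1500-byte underlay, that
 * yields an initial tunnel MTU of 1500 - 32 = 1468, clamped to a
 * minimum of 68, the smallest legal IPv4 MTU.
 */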

static int
ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* A nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing from IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int err;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
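
/*
 * Usage note (assuming a reasonably recent iproute2): the two
 * rtnl_link_ops above make the driver reachable via, e.g.:
 *
 *	ip link add gre1 type gre remote 198.51.100.2 local 198.51.100.1 \
 *		ttl 64 key 42
 *	ip link add tap1 type gretap remote 198.51.100.2 local 198.51.100.1
 *
 * "gre" creates an ARPHRD_IPGRE layer-3 tunnel, while "gretap" creates
 * an Ethernet (ARPHRD_ETHER) device that carries frames over GRE with
 * ETH_P_TEB as the inner protocol.
 */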

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");