/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it would require maintaining a new variable in EVERY
   skb, even when no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter; once we enter the first ndo_start_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion" in
   this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two
   problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least those in my
     neighbourhood) return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work and traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. To be short, ttl is not
   a solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could do. Even if it is your gated that injected
   the fatal route to the network, even if it was you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
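
/*
 * A minimal sketch (illustration only, not part of this file) of the
 * xmit_recursion scheme referred to in point 1 above. The pattern and
 * names follow net/core/dev.c of this era from memory, so treat the
 * helper name xmit_one() and the exact limit as assumptions, not the
 * authoritative implementation:
 *
 *	#define RECURSION_LIMIT 10
 *	static DEFINE_PER_CPU(int, xmit_recursion);
 *
 *	static int xmit_one(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		int rc;
 *
 *		// Preemption is disabled around ndo_start_xmit(), so the
 *		// per-cpu counter cannot be corrupted by cpu migration.
 *		if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) {
 *			kfree_skb(skb);		// dead loop detected: drop
 *			return -ELOOP;
 *		}
 *		__this_cpu_inc(xmit_recursion);
 *		rc = dev->netdev_ops->ndo_start_xmit(skb, dev);
 *		__this_cpu_dec(xmit_recursion);
 *		return rc;
 *	}
 */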

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
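
/*
 * Worked example of the bucket computation (the value is illustrative):
 * if (__force u32)addr == 0xc0a80101, then addr >> 4 == 0x0c0a8010,
 * the xor is 0xcca28111, and the bucket index is 0xcca28111 & 0xF == 1.
 * Only the low 8 bits of the address influence the result.
 */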

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

/* Frequently modified stats are per-cpu; others are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;
	u64	rx_bytes;
	u64	tx_packets;
	u64	tx_bytes;
	struct u64_stats_sync	syncp;
};

static struct rtnl_link_stats64 *ipgre_get_stats64(struct net_device *dev,
						   struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_errors = dev->stats.rx_errors;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
}

/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

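	/* Scoring used by the four lookup passes below: an exact match on
	 * both the underlay link and the device type scores 0 and returns
	 * immediately; a link mismatch costs 1, a type mismatch costs 2,
	 * and the lowest-scoring approximate match is remembered in cand.
	 * cand_score starts at 4, i.e. worse than any achievable score.
	 */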
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key into the third word
   of the GRE header. That makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksums enabled. Tell them
   "thank you".

   Well, I wonder: rfc1812 was written by a Cisco employee,
   so why the hell do these idiots break the standards they
   established themselves?
 */
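
/* For reference, the GRE header layout parsed here (RFC 1701; each
 * optional word is present only when the corresponding flag bit is set):
 *
 *	|C|R|K|S|s|Recur|  Flags  | Ver |        Protocol Type          |
 *	|     Checksum (optional)       |       Offset (optional)       |
 *	|                        Key (optional)                         |
 *	|                   Sequence Number (optional)                  |
 *
 * So with GRE_CSUM and GRE_KEY both set, the key sits in the third
 * 32-bit word of the GRE header, which is why grehlen below grows by 4
 * for the key and by another 4 when a checksum precedes it.
 */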

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16	     *p = (__be16 *)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	rcu_read_lock();
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL)
		goto out;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		goto out;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		goto out;
	}
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, old_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the new outer IP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;
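
	/* The optional GRE fields appear on the wire in the order checksum,
	 * key, sequence number (RFC 1701), so the block below fills them in
	 * back to front: ptr starts at the last 32-bit word of the GRE
	 * header and steps back toward the fixed header.
	 */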
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
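	/* e.g. o_flags = GRE_CSUM|GRE_KEY gives addend = 20 + 4 + 4 + 4 = 32:
	 * IP header, GRE flags+protocol word, checksum word and key word.
	 */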
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}

static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

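/* 68 is the minimum IPv4 datagram size every host must accept (RFC 791);
 * 0xFFF8 is the largest IP total length that is still a multiple of the
 * 8-byte fragment-offset unit, so it bounds what the outer packet can carry.
 */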
static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph+1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
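
/* These attributes correspond to the iproute2 options; e.g. an
 * illustrative invocation (not taken from this file) such as
 *
 *	ip link add gre1 type gre remote 203.0.113.2 local 203.0.113.1 \
 *		ttl 64 key 42
 *
 * sets IFLA_GRE_REMOTE, IFLA_GRE_LOCAL, IFLA_GRE_TTL and both
 * IFLA_GRE_IKEY and IFLA_GRE_OKEY (with GRE_KEY in both flag sets).
 */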

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats64	= ipgre_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");