xref: /openbmc/linux/net/ipv4/ip_gre.c (revision f2cedb63df14342ad40a8b5b324fc5d94a60b665)
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ipip.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two
   problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with RFC 1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. To be short, it is not
   a solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   quickly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulated packets have DF
   set. But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated that injected
   the fatal route to the network, even if it were you who configured
   the fatal static route: you are innocent. :-)



   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
   practically identical code. It would be good to glue them
   together, but it is not very evident how to make them modular.
   sit is an integral part of IPv6, while ipip and gre are naturally
   modular. We could extract the common parts (hash table, ioctl etc.)
   to a separate module (ip_tunnel.c).

   Alexey Kuznetsov.
 */
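
/*
 * Illustrative sketch (compiled out; not part of this file): the percpu
 * recursion guard described above, modelled on xmit_recursion /
 * RECURSION_LIMIT in net/core/dev.c. The names and the exact limit are
 * assumptions for the example only.
 */
#if 0
#define RECURSION_LIMIT 10
static DEFINE_PER_CPU(int, xmit_recursion);

static int example_guarded_xmit(struct sk_buff *skb)
{
	int ret;

	/* CPU migration is disabled inside ndo_start_xmit(), so a plain
	 * per-cpu counter is enough to detect re-entry on this CPU. */
	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) {
		kfree_skb(skb);	/* dead loop detected: drop silently */
		return -ELOOP;
	}

	__this_cpu_inc(xmit_recursion);
	ret = dev_queue_xmit(skb);
	__this_cpu_dec(xmit_recursion);
	return ret;
}
#endif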

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

static int ipgre_net_id __read_mostly;
struct ipgre_net {
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless
   tunnels, will match the fallback tunnel.
 */

#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
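
/*
 * Worked example (added as an aid): HASH() folds the two low nibbles of
 * the raw 32-bit value, so for the value 0x12345678:
 *   (0x12345678 ^ 0x01234567) & 0xF == 0x8 ^ 0x7 == 0xF,
 * i.e. the entry lands in bucket 15 of the HASH_SIZE (16) buckets.
 */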

#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
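
/* Note (added): iterations over these tables must run under
 * rcu_read_lock(); writers are serialized by RTNL, per the locking
 * comment above. */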

/* Often-modified stats are per-cpu, others are shared (netdev->stats) */
struct pcpu_tstats {
	unsigned long	rx_packets;
	unsigned long	rx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));

static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
{
	struct pcpu_tstats sum = { 0 };
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);

		sum.rx_packets += tstats->rx_packets;
		sum.rx_bytes   += tstats->rx_bytes;
		sum.tx_packets += tstats->tx_packets;
		sum.tx_bytes   += tstats->tx_bytes;
	}
	dev->stats.rx_packets = sum.rx_packets;
	dev->stats.rx_bytes   = sum.rx_bytes;
	dev->stats.tx_packets = sum.tx_packets;
	dev->stats.tx_bytes   = sum.tx_bytes;
	return &dev->stats;
}

/* Given src, dst and key, find the appropriate tunnel for input. */

static struct ip_tunnel *ipgre_tunnel_lookup(struct net_device *dev,
					     __be32 remote, __be32 local,
					     __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

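	/*
	 * Added note: candidates are scored by specificity. Bit 0 is set
	 * when the tunnel is bound to a different link, bit 1 when the
	 * device type (ARPHRD_IPGRE vs ARPHRD_ETHER) differs. A score of
	 * 0 is an exact match and wins immediately; otherwise the lowest
	 * score seen across the four tables is kept in cand/cand_score.
	 */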
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}

static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	unsigned int h = HASH(key);
	int prio = 0;

	if (local)
		prio |= 1;
	if (remote && !ipv4_is_multicast(remote)) {
		prio |= 2;
		h ^= HASH(remote);
	}

	return &ign->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}

static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}

static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}


static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put the GRE key in the third word
   of the GRE header. That makes it impossible to maintain even soft
   state for keyed GRE tunnels with checksumming enabled. Tell them
   "thank you".

   Well, I wonder: RFC 1812 was written by a Cisco employee, so why
   the hell do these guys break standards established by themselves???
 */
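
/* For reference (added; layout per RFC 1701/2784):
 *
 *     0                   1                   2                   3
 *     0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |C|R|K|S|s|Recur|  Flags  | Ver |         Protocol Type         |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |      Checksum (optional)      |       Offset (optional)       |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |                         Key (optional)                        |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *    |                 Sequence Number (optional)                    |
 *    +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 *
 * With both GRE_CSUM and GRE_KEY present, the key is the third 32-bit
 * word after the IP header, which is why grehlen below grows by 4 twice.
 */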

	const struct iphdr *iph = (const struct iphdr *)skb->data;
	__be16 *p = (__be16 *)(skb->data + (iph->ihl << 2));
	int grehlen = (iph->ihl << 2) + 4;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags & (GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags & (GRE_VERSION|GRE_ROUTING))
			return;
		if (flags & GRE_KEY) {
			grehlen += 4;
			if (flags & GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}

static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
{
	if (INET_ECN_is_ce(iph->tos)) {
		if (skb->protocol == htons(ETH_P_IP)) {
			IP_ECN_set_ce(ip_hdr(skb));
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			IP6_ECN_set_ce(ipv6_hdr(skb));
		}
	}
}

static inline u8
ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
{
	u8 inner = 0;
	if (skb->protocol == htons(ETH_P_IP))
		inner = old_iph->tos;
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	return INET_ECN_encapsulate(tos, inner);
}
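
/* Note (added): on encapsulation, INET_ECN_encapsulate() carries the
 * inner ECN field into the outer TOS (with inner CE mapped to ECT(0)),
 * and on decapsulation an outer CE mark is propagated to the inner
 * header, in the spirit of RFC 3168. */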

static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;
	__be16 gre_proto;

	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16 *)h;

	if (flags & (GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags & (GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags & GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags & GRE_KEY) {
			key = *(__be32 *)(h + offset);
			offset += 4;
		}
		if (flags & GRE_SEQ) {
			seqno = ntohl(*(__be32 *)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP.
		 * - When dealing with WCCPv2, skip the extra 4 bytes in
		 *   the GRE header.
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		if (((flags & GRE_CSUM) && csum) ||
		    (!(flags & GRE_CSUM) && tunnel->parms.i_flags & GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags & GRE_SEQ) {
			if (!(flags & GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}

static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt->rt_gateway;
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb), &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;
	}

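	/* Note (added): a configured TOS of 1 is the "inherit" magic value
	 * (iproute2's "tos inherit"): take the DSCP from the inner packet
	 * instead of using a fixed outer value. */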
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off & htons(IP_DF));

		if ((old_iph->frag_off & htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the new outer IP header.
	 */

	iph			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	if (tunnel->parms.o_flags & (GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags & GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags & GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags & GRE_CSUM) {
			*ptr = 0;
			*(__sum16 *)ptr = ip_compute_csum((void *)(iph + 1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags & (GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags & GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags & GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags & GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}

static int
ipgre_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags) & (GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags & GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags & GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) & (IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
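
/*
 * Usage sketch (added; addresses are illustrative): SIOCADDTUNNEL,
 * SIOCCHGTUNNEL and SIOCDELTUNNEL above are what iproute2 issues for e.g.
 *
 *   ip tunnel add gre1 mode gre remote 192.0.2.1 local 192.0.2.2 ttl 64
 *   ip tunnel change gre1 ttl 32
 *   ip tunnel del gre1
 *
 * each carrying a struct ip_tunnel_parm in ifr->ifr_ifru.ifru_data.
 */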

static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
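
/* Note (added): 0xFFF8 is the largest multiple of 8 that still fits the
 * 16-bit IP total-length field, so the bound above keeps the MTU plus
 * link and GRE headers inside a valid outer packet. */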

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play
   with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...

 */

static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16 *)(iph + 1);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -t->hlen;
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}

#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};

static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	dev_hold(dev);
}


static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
	int prio;

	for (prio = 0; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ign->tunnels[prio][h]);

			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					  ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ipgre_net *ign;
	LIST_HEAD(list);

	ign = net_generic(net, ipgre_net_id);
	rtnl_lock();
	ipgre_destroy_tunnels(ign, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
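
/*
 * Usage sketch (added; addresses and key values are illustrative): the
 * attributes above map onto the rtnetlink interface used by e.g.
 *
 *   ip link add gre1 type gre remote 192.0.2.1 local 192.0.2.2 \
 *           ikey 42 okey 42 ttl 64
 */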

static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor		= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

/*
 *	And now the module code and kernel interface.
 */

static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_net_ops);
	goto out;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");