xref: /openbmc/linux/net/ipv4/ip_gre.c (revision 7fe2f639)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33 
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48 
49 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54 
55 /*
56    Problems & solutions
57    --------------------
58 
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63 
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
69    skb, even if no tunneling is used.
70 
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74 
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79 
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88 
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91 
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
   taking into account fragmentation. To be short, it is not a solution at all.
95 
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    fastly degrades to value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108 
109 
110 
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117 
118    Alexey Kuznetsov.
119  */
120 
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125 
126 /* Fallback tunnel: no source, no destination, no key, no options */
127 
/* Buckets per tunnel hash table; must remain a power of two because
 * HASH() below masks with 0xF == HASH_SIZE - 1. */
#define HASH_SIZE  16

/* Slot in each struct net's generic storage holding our per-namespace state. */
static int ipgre_net_id __read_mostly;
/* Per network namespace GRE tunnel state. */
struct ipgre_net {
	/* Four RCU-protected hash tables, one per endpoint-specificity class;
	 * see the "Tunnel hash table" comment below for the layout. */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Fallback tunnel device: no source, no destination, no key. */
	struct net_device *fb_tunnel_dev;
};
136 
137 /* Tunnel hash table */
138 
139 /*
140    4 hash tables:
141 
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146 
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150 
   All keyless packets, if not matched against configured keyless tunnels,
152    will match fallback tunnel.
153  */
154 
/* Fold a 32-bit (network byte order) address or key into a 4-bit table index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases for the four specificity classes of struct ipgre_net::tunnels. */
#define tunnels_r_l	tunnels[3]
#define tunnels_r	tunnels[2]
#define tunnels_l	tunnels[1]
#define tunnels_wc	tunnels[0]
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one hash chain under rcu_read_lock(); a local "struct ip_tunnel *t"
 * must be in scope at the expansion site. */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167 
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	unsigned long	rx_packets;	/* packets received by the tunnel */
	unsigned long	rx_bytes;	/* bytes received by the tunnel */
	unsigned long	tx_packets;	/* packets transmitted by the tunnel */
	unsigned long	tx_bytes;	/* bytes transmitted by the tunnel */
};
175 
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178 	struct pcpu_tstats sum = { 0 };
179 	int i;
180 
181 	for_each_possible_cpu(i) {
182 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183 
184 		sum.rx_packets += tstats->rx_packets;
185 		sum.rx_bytes   += tstats->rx_bytes;
186 		sum.tx_packets += tstats->tx_packets;
187 		sum.tx_bytes   += tstats->tx_bytes;
188 	}
189 	dev->stats.rx_packets = sum.rx_packets;
190 	dev->stats.rx_bytes   = sum.rx_bytes;
191 	dev->stats.tx_packets = sum.tx_packets;
192 	dev->stats.tx_bytes   = sum.tx_bytes;
193 	return &dev->stats;
194 }
195 
196 /* Given src, dst and key, find appropriate for input tunnel. */
197 
/*
 * Find the receiving tunnel for a packet with the given outer (remote,
 * local) addresses, GRE key and payload protocol.  Called under
 * rcu_read_lock().  The four hash tables are scanned from most to least
 * specific; within each table a tunnel scores 0 for a perfect match,
 * bit 0 is set when the ingress link differs and bit 1 when the device
 * type differs, so a lower score is always a better candidate
 * (cand_score starts at 4 == "no candidate yet").
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payloads belong on Ethernet-over-GRE devices. */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	int score, cand_score = 4;

	/* Pass 1: tunnels with both remote and local addresses configured. */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		/* Plain IPGRE devices may accept either payload type;
		 * anything else must match the wanted type exactly. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 2: tunnels with only the remote address configured. */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 3: tunnels with only the local address configured; a
	 * multicast destination may also match as "local". */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 4: wildcard tunnels, matched by key only. */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Nothing matched: fall back to the namespace's catch-all device. */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
319 
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321 		struct ip_tunnel_parm *parms)
322 {
323 	__be32 remote = parms->iph.daddr;
324 	__be32 local = parms->iph.saddr;
325 	__be32 key = parms->i_key;
326 	unsigned int h = HASH(key);
327 	int prio = 0;
328 
329 	if (local)
330 		prio |= 1;
331 	if (remote && !ipv4_is_multicast(remote)) {
332 		prio |= 2;
333 		h ^= HASH(remote);
334 	}
335 
336 	return &ign->tunnels[prio][h];
337 }
338 
/* Hash-chain head for an existing tunnel, derived from its parameters. */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
344 
/* Insert @t at the head of its hash chain.  Caller holds RTNL (hence
 * rtnl_dereference); t->next is published before *tp so concurrent RCU
 * readers always see a consistent chain. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
352 
/* Remove @t from its hash chain.  Caller holds RTNL; readers may still
 * hold RCU references, so the caller is responsible for waiting a grace
 * period before freeing the tunnel. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	/* Walk the chain via the link pointers so we can splice @t out. */
	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
367 
/* Find a tunnel whose addresses, key, underlay link and device type all
 * match @parms/@type exactly, or NULL.  Caller holds RTNL.  Unlike
 * ipgre_tunnel_lookup() this is an exact configuration match, used when
 * creating or changing tunnels. */
static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t;
	struct ip_tunnel __rcu **tp;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	for (tp = __ipgre_bucket(ign, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next)
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;

	return t;
}
392 
/* Look up the tunnel matching @parms; when none exists and @create is
 * set, allocate, register and link a new ARPHRD_IPGRE device.  Returns
 * the tunnel, or NULL when not found (!create) or on allocation /
 * registration failure.  Caller holds RTNL. */
static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	/* Use the caller-supplied name, or let the core pick "greN". */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "gre%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Extra reference dropped in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
433 
/* ndo_uninit: take the tunnel out of the hash tables and drop the
 * reference taken in ipgre_tunnel_locate(). */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
442 
443 
444 static void ipgre_err(struct sk_buff *skb, u32 info)
445 {
446 
447 /* All the routers (except for Linux) return only
448    8 bytes of packet payload. It means, that precise relaying of
449    ICMP in the real Internet is absolutely infeasible.
450 
451    Moreover, Cisco "wise men" put GRE key to the third word
452    in GRE header. It makes impossible maintaining even soft state for keyed
453    GRE tunnels with enabled checksum. Tell them "thank you".
454 
455    Well, I wonder, rfc1812 was written by Cisco employee,
456    what the hell these idiots break standrads established
457    by themself???
458  */
459 
460 	const struct iphdr *iph = (const struct iphdr *)skb->data;
461 	__be16	     *p = (__be16*)(skb->data+(iph->ihl<<2));
462 	int grehlen = (iph->ihl<<2) + 4;
463 	const int type = icmp_hdr(skb)->type;
464 	const int code = icmp_hdr(skb)->code;
465 	struct ip_tunnel *t;
466 	__be16 flags;
467 
468 	flags = p[0];
469 	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
470 		if (flags&(GRE_VERSION|GRE_ROUTING))
471 			return;
472 		if (flags&GRE_KEY) {
473 			grehlen += 4;
474 			if (flags&GRE_CSUM)
475 				grehlen += 4;
476 		}
477 	}
478 
479 	/* If only 8 bytes returned, keyed message will be dropped here */
480 	if (skb_headlen(skb) < grehlen)
481 		return;
482 
483 	switch (type) {
484 	default:
485 	case ICMP_PARAMETERPROB:
486 		return;
487 
488 	case ICMP_DEST_UNREACH:
489 		switch (code) {
490 		case ICMP_SR_FAILED:
491 		case ICMP_PORT_UNREACH:
492 			/* Impossible event. */
493 			return;
494 		case ICMP_FRAG_NEEDED:
495 			/* Soft state for pmtu is maintained by IP core. */
496 			return;
497 		default:
498 			/* All others are translated to HOST_UNREACH.
499 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
500 			   I believe they are just ether pollution. --ANK
501 			 */
502 			break;
503 		}
504 		break;
505 	case ICMP_TIME_EXCEEDED:
506 		if (code != ICMP_EXC_TTL)
507 			return;
508 		break;
509 	}
510 
511 	rcu_read_lock();
512 	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
513 				flags & GRE_KEY ?
514 				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
515 				p[1]);
516 	if (t == NULL || t->parms.iph.daddr == 0 ||
517 	    ipv4_is_multicast(t->parms.iph.daddr))
518 		goto out;
519 
520 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
521 		goto out;
522 
523 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
524 		t->err_count++;
525 	else
526 		t->err_count = 1;
527 	t->err_time = jiffies;
528 out:
529 	rcu_read_unlock();
530 }
531 
532 static inline void ipgre_ecn_decapsulate(const struct iphdr *iph, struct sk_buff *skb)
533 {
534 	if (INET_ECN_is_ce(iph->tos)) {
535 		if (skb->protocol == htons(ETH_P_IP)) {
536 			IP_ECN_set_ce(ip_hdr(skb));
537 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
538 			IP6_ECN_set_ce(ipv6_hdr(skb));
539 		}
540 	}
541 }
542 
543 static inline u8
544 ipgre_ecn_encapsulate(u8 tos, const struct iphdr *old_iph, struct sk_buff *skb)
545 {
546 	u8 inner = 0;
547 	if (skb->protocol == htons(ETH_P_IP))
548 		inner = old_iph->tos;
549 	else if (skb->protocol == htons(ETH_P_IPV6))
550 		inner = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
551 	return INET_ECN_encapsulate(tos, inner);
552 }
553 
/*
 * GRE receive handler.  Parses the GRE header (flags, optional checksum,
 * key and sequence number), finds the matching tunnel, validates
 * checksum/sequencing against the tunnel's configuration and delivers
 * the decapsulated packet.  Unmatched packets trigger PORT_UNREACH.
 * Always consumes the skb and returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	const struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* bytes consumed so far: flags + protocol */
	__be16 gre_proto;

	/* 16 bytes covers the base GRE header plus all optional fields. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			/* Verify the GRE checksum over the whole packet;
			 * csum != 0 afterwards means verification failed. */
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* Not an IPv4 header right after GRE -> WCCPv2
			 * redirect header present, skip it. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the outer IP + GRE headers. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (rt_is_output_route(skb_rtable(skb)))
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Checksum failed, or the tunnel requires one and the
		 * packet did not carry it. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		/* Enforce in-order delivery when the tunnel uses sequencing;
		 * the (s32) cast handles sequence-number wraparound. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	/* No tunnel wanted this packet: tell the sender. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
690 
/*
 * Transmit path: encapsulate @skb in an outer IP + GRE header and send
 * it towards the tunnel's remote endpoint.  Handles NBMA destination
 * resolution, TOS/TTL inheritance, path-MTU enforcement, headroom
 * expansion and the optional GRE checksum/key/sequence fields.
 * Always returns NETDEV_TX_OK; errored skbs are freed and counted.
 */
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *old_iph = ip_hdr(skb);
	const struct iphdr  *tiph;	/* template for the outer header */
	struct flowi4 fl4;
	u8     tos;
	__be16 df;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* With header_ops on a plain GRE device the caller already built
	 * the outer header at skb->data (see ipgre_header()); use it as
	 * the template instead of the configured parameters. */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (const struct iphdr *)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel: no fixed remote, derive the outer
		 * destination from the inner packet's route/neighbour. */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			/* For IPv6 payloads only IPv4-compatible next hops
			 * can be mapped onto an IPv4 outer destination. */
			const struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb_dst(skb)->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* tos == 1 means "inherit TOS from the inner packet". */
	tos = tiph->tos;
	if (tos == 1) {
		tos = 0;
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)old_iph);
	}

	rt = ip_route_output_gre(dev_net(dev), &fl4, dst, tiph->saddr,
				 tunnel->parms.o_key, RT_TOS(tos),
				 tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	tdev = rt->dst.dev;

	/* Routing back to ourselves would recurse; see comment #1 above. */
	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		/* Inner packet has DF set and does not fit: report
		 * FRAG_NEEDED back to the sender. */
		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on host routes so the IPv6 stack
		 * picks it up for subsequent packets. */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Replay recent errors recorded by ipgre_err() to local senders,
	 * one per queued error, within the error timeout window. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;

	/* Reallocate when there is not enough headroom for the new headers
	 * or when the buffer cannot be written to in place. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (max_headroom > dev->needed_headroom)
			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb_reset_transport_header(skb);
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	/* TTL 0 means "inherit from the inner packet" (route default for
	 * non-IP payloads). */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((const struct ipv6hdr *)old_iph)->hop_limit;
#endif
		else
			iph->ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* GRE header: flags, then payload protocol. */
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Optional fields are written back-to-front from the end of the
	 * options area: seq is last, then key, then checksum. */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
921 
/*
 * Bind the tunnel device to its underlay: guess the output device from
 * the configured remote (or the explicit link), size needed_headroom
 * accordingly, precompute the GRE header length (tunnel->hlen) and
 * return the resulting MTU (clamped to a minimum of 68).
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 iph->daddr, iph->saddr,
					 tunnel->parms.o_key,
					 RT_TOS(iph->tos),
					 tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;

	return mtu;
}
982 
/*
 * ndo_do_ioctl: handle the classic tunnel configuration interface
 * (SIOCGETTUNNEL / SIOCADDTUNNEL / SIOCCHGTUNNEL / SIOCDELTUNNEL).
 * Runs under RTNL.  On the fallback device the user-supplied parameters
 * select which tunnel to operate on; on a real tunnel device the device
 * itself is the target.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the requested outer header and GRE flags. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		/* A fixed TTL forces DF; see comment #2 at the top. */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Parameters already belong to another tunnel. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Endpoints change: rehash the tunnel and keep
				 * the link-layer mode (p2p/broadcast) intact. */
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				ipgre_tunnel_unlink(ign, t);
				/* Wait for RCU readers of the old hash chain. */
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			/* The fallback device itself may not be deleted. */
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1113 
1114 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1115 {
1116 	struct ip_tunnel *tunnel = netdev_priv(dev);
1117 	if (new_mtu < 68 ||
1118 	    new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1119 		return -EINVAL;
1120 	dev->mtu = new_mtu;
1121 	return 0;
1122 }
1123 
1124 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1126    over the Internet, provided multicast routing is tuned.
1127 
1128 
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
1131    I have an impression, that Cisco could make something similar,
1132    but this feature is apparently missing in IOS<=11.2(8).
1133 
1134    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1135    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1136 
1137    ping -t 255 224.66.66.66
1138 
1139    If nobody answers, mbone does not work.
1140 
1141    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1142    ip addr add 10.66.66.<somewhat>/24 dev Universe
1143    ifconfig Universe up
1144    ifconfig Universe add fe80::<Your_real_addr>/10
1145    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1146    ftp 10.66.66.66
1147    ...
1148    ftp fec0:6666:6666::193.233.7.65
1149    ...
1150 
1151  */
1152 
1153 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1154 			unsigned short type,
1155 			const void *daddr, const void *saddr, unsigned int len)
1156 {
1157 	struct ip_tunnel *t = netdev_priv(dev);
1158 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1159 	__be16 *p = (__be16*)(iph+1);
1160 
1161 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1162 	p[0]		= t->parms.o_flags;
1163 	p[1]		= htons(type);
1164 
1165 	/*
1166 	 *	Set the source hardware address.
1167 	 */
1168 
1169 	if (saddr)
1170 		memcpy(&iph->saddr, saddr, 4);
1171 	if (daddr)
1172 		memcpy(&iph->daddr, daddr, 4);
1173 	if (iph->daddr)
1174 		return t->hlen;
1175 
1176 	return -t->hlen;
1177 }
1178 
1179 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1180 {
1181 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
1182 	memcpy(haddr, &iph->saddr, 4);
1183 	return 4;
1184 }
1185 
/* "Link-layer" header operations used by broadcast/NBMA GRE devices. */
static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};
1190 
1191 #ifdef CONFIG_NET_IPGRE_BROADCAST
/* ndo_open: when the tunnel remote is an IPv4 multicast group, resolve
 * the route towards it and join the group on the underlying device so
 * the encapsulated "broadcast LAN" traffic can be received.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		/* Find which device routes towards the multicast group. */
		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		/* Remember the underlying ifindex so ipgre_close() can
		 * leave the group again.
		 */
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1217 
1218 static int ipgre_close(struct net_device *dev)
1219 {
1220 	struct ip_tunnel *t = netdev_priv(dev);
1221 
1222 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1223 		struct in_device *in_dev;
1224 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1225 		if (in_dev)
1226 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1227 	}
1228 	return 0;
1229 }
1230 
1231 #endif
1232 
/* Device operations for layer-3 (ARPHRD_IPGRE) "gre" tunnels. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1245 
/* Device destructor: release the per-cpu stats allocated by the init
 * hooks, then the device itself.  free_percpu(NULL) is a no-op, so
 * this is safe even if stats were never allocated.
 */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1251 
/* netdev setup for ARPHRD_IPGRE devices; also the rtnl_link ->setup
 * hook for the "gre" kind.  The constant 4 below is the size of the
 * basic GRE header (flags + protocol 16-bit words).
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->type		= ARPHRD_IPGRE;
	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* IPv4 endpoint acts as hw address */
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
1266 
/* ndo_init: finish setting up a registering GRE device from its tunnel
 * parameters and allocate per-cpu statistics.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* The tunnel endpoints double as dev/broadcast "hw" addresses. */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		/* A multicast remote needs an explicit local address. */
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		/* NBMA mode: header built/parsed via header_ops. */
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1299 
/* Initialize the per-namespace fallback device "gre0"; hlen covers the
 * outer IP header plus the 4-byte basic GRE header.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;

	/* Reference held for the lifetime of the fallback device. */
	dev_hold(dev);
}
1315 
1316 
/* Receive/error hooks registered with the shared GRE demux
 * (GREPROTO_CISCO slot).
 */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1321 
1322 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1323 {
1324 	int prio;
1325 
1326 	for (prio = 0; prio < 4; prio++) {
1327 		int h;
1328 		for (h = 0; h < HASH_SIZE; h++) {
1329 			struct ip_tunnel *t;
1330 
1331 			t = rtnl_dereference(ign->tunnels[prio][h]);
1332 
1333 			while (t != NULL) {
1334 				unregister_netdevice_queue(t->dev, head);
1335 				t = rtnl_dereference(t->next);
1336 			}
1337 		}
1338 	}
1339 }
1340 
/* Per-namespace init: create and register the fallback "gre0" device
 * and publish it in ign->tunnels_wc[0].
 */
static int __net_init ipgre_init_net(struct net *net)
{
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int err;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ign->fb_tunnel_dev, net);

	ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	rcu_assign_pointer(ign->tunnels_wc[0],
			   netdev_priv(ign->fb_tunnel_dev));
	return 0;

err_reg_dev:
	/* Never registered, so free stats and the device directly. */
	ipgre_dev_free(ign->fb_tunnel_dev);
err_alloc_dev:
	return err;
}
1369 
1370 static void __net_exit ipgre_exit_net(struct net *net)
1371 {
1372 	struct ipgre_net *ign;
1373 	LIST_HEAD(list);
1374 
1375 	ign = net_generic(net, ipgre_net_id);
1376 	rtnl_lock();
1377 	ipgre_destroy_tunnels(ign, &list);
1378 	unregister_netdevice_many(&list);
1379 	rtnl_unlock();
1380 }
1381 
/* Per-network-namespace lifecycle of the GRE tunnel tables. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1388 
1389 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1390 {
1391 	__be16 flags;
1392 
1393 	if (!data)
1394 		return 0;
1395 
1396 	flags = 0;
1397 	if (data[IFLA_GRE_IFLAGS])
1398 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1399 	if (data[IFLA_GRE_OFLAGS])
1400 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1401 	if (flags & (GRE_VERSION|GRE_ROUTING))
1402 		return -EINVAL;
1403 
1404 	return 0;
1405 }
1406 
1407 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1408 {
1409 	__be32 daddr;
1410 
1411 	if (tb[IFLA_ADDRESS]) {
1412 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1413 			return -EINVAL;
1414 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1415 			return -EADDRNOTAVAIL;
1416 	}
1417 
1418 	if (!data)
1419 		goto out;
1420 
1421 	if (data[IFLA_GRE_REMOTE]) {
1422 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1423 		if (!daddr)
1424 			return -EINVAL;
1425 	}
1426 
1427 out:
1428 	return ipgre_tunnel_validate(tb, data);
1429 }
1430 
/* Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm,
 * starting from all-zero defaults with protocol fixed to GRE.
 */
static void ipgre_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	/* DF is set (PMTU discovery on) unless explicitly disabled. */
	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
1471 
/* ndo_init for gretap devices: record the name, bind to the underlying
 * device and allocate per-cpu statistics.
 */
static int ipgre_tap_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	ipgre_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1489 
/* Device operations for Ethernet-over-GRE ("gretap") devices. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init		= ipgre_tap_init,
	.ndo_uninit		= ipgre_tunnel_uninit,
	.ndo_start_xmit		= ipgre_tunnel_xmit,
	.ndo_set_mac_address 	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
	.ndo_get_stats		= ipgre_get_stats,
};
1499 
/* netdev setup for gretap: a regular Ethernet device whose frames are
 * carried inside GRE.
 */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops		= &ipgre_tap_netdev_ops;
	dev->destructor 	= ipgre_dev_free;

	dev->iflink		= 0;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1511 
/* rtnl_link ->newlink: create a gre/gretap device from netlink
 * attributes, bind it to the lower device, register it and hash it
 * into the per-namespace tunnel tables.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* Refuse to duplicate an existing tunnel of the same type. */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	/* gretap without an explicit MAC gets a random one. */
	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1548 
/* rtnl_link ->changelink: update an existing tunnel's parameters.
 * Endpoint/input-key changes require re-hashing the tunnel; the
 * fallback device itself may not be reconfigured.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* New parameters already belong to another device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			/* The point-to-point/broadcast nature of the
			 * device is fixed at creation time; refuse
			 * changes that would alter it.
			 */
			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Re-hash under the new addresses and input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* These do not affect hashing and can be updated in place. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1612 
1613 static size_t ipgre_get_size(const struct net_device *dev)
1614 {
1615 	return
1616 		/* IFLA_GRE_LINK */
1617 		nla_total_size(4) +
1618 		/* IFLA_GRE_IFLAGS */
1619 		nla_total_size(2) +
1620 		/* IFLA_GRE_OFLAGS */
1621 		nla_total_size(2) +
1622 		/* IFLA_GRE_IKEY */
1623 		nla_total_size(4) +
1624 		/* IFLA_GRE_OKEY */
1625 		nla_total_size(4) +
1626 		/* IFLA_GRE_LOCAL */
1627 		nla_total_size(4) +
1628 		/* IFLA_GRE_REMOTE */
1629 		nla_total_size(4) +
1630 		/* IFLA_GRE_TTL */
1631 		nla_total_size(1) +
1632 		/* IFLA_GRE_TOS */
1633 		nla_total_size(1) +
1634 		/* IFLA_GRE_PMTUDISC */
1635 		nla_total_size(1) +
1636 		0;
1637 }
1638 
/* rtnl_link ->fill_info: dump the tunnel parameters to netlink.  The
 * NLA_PUT* macros jump to nla_put_failure when the skb runs out of
 * room.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTU discovery is reported as a boolean derived from DF. */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1660 
/* Netlink attribute policy for IFLA_GRE_*, shared by gre and gretap. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};
1673 
/* rtnl_link registration for plain layer-3 "gre" tunnels. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1686 
/* rtnl_link registration for Ethernet-over-GRE "gretap" devices. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1699 
1700 /*
1701  *	And now the modules code and kernel interface.
1702  */
1703 
1704 static int __init ipgre_init(void)
1705 {
1706 	int err;
1707 
1708 	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1709 
1710 	err = register_pernet_device(&ipgre_net_ops);
1711 	if (err < 0)
1712 		return err;
1713 
1714 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1715 	if (err < 0) {
1716 		printk(KERN_INFO "ipgre init: can't add protocol\n");
1717 		goto add_proto_failed;
1718 	}
1719 
1720 	err = rtnl_link_register(&ipgre_link_ops);
1721 	if (err < 0)
1722 		goto rtnl_link_failed;
1723 
1724 	err = rtnl_link_register(&ipgre_tap_ops);
1725 	if (err < 0)
1726 		goto tap_ops_failed;
1727 
1728 out:
1729 	return err;
1730 
1731 tap_ops_failed:
1732 	rtnl_link_unregister(&ipgre_link_ops);
1733 rtnl_link_failed:
1734 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1735 add_proto_failed:
1736 	unregister_pernet_device(&ipgre_net_ops);
1737 	goto out;
1738 }
1739 
/* Module unload: tear everything down in reverse order of ipgre_init(). */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
	unregister_pernet_device(&ipgre_net_ops);
}
1748 
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Allow module auto-load for "ip link add type gre|gretap" and gre0. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
1755