/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in EVERY
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least in my neighbourhood)
     return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. This is difficult or even impossible,
   especially taking fragmentation into account. To be short, ttl is not
   a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but the exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it was your gated that injected the
   fatal route to the network, even if it was you who configured the
   fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
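
/* For illustration only: the xmit_recursion guard mentioned above lives in
 * net/core/dev.c, not in this file. Its core is roughly the sketch below
 * (names and the limit are as of this kernel; other versions may differ):
 *
 *	#define RECURSION_LIMIT 10
 *	static DEFINE_PER_CPU(int, xmit_recursion);
 *
 *	if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
 *		goto recursion_alert;	(dead loop detected: drop the skb)
 *	__this_cpu_inc(xmit_recursion);
 *	rc = dev_hard_start_xmit(skb, dev, txq);
 *	__this_cpu_dec(xmit_recursion);
 */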

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);

static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;

static __sum16 check_checksum(struct sk_buff *skb)
{
	__sum16 csum = 0;

	switch (skb->ip_summed) {
	case CHECKSUM_COMPLETE:
		csum = csum_fold(skb->csum);

		if (!csum)
			break;
		/* Fall through. */

	case CHECKSUM_NONE:
		skb->csum = 0;
		csum = __skb_checksum_complete(skb);
		skb->ip_summed = CHECKSUM_COMPLETE;
		break;
	}

	return csum;
}
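
/* A worked example of the fold above (illustrative arithmetic only):
 * an accumulated 32-bit sum of 0x0001fffe folds as
 * 0x0001 + 0xfffe = 0xffff, whose one's complement is 0. A packet whose
 * checksum field was filled in correctly always folds to 0, which is why
 * a zero return from this helper means "checksum OK" and any non-zero
 * value is treated as a checksum error by parse_gre_header() below.
 */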

static int ip_gre_calc_hlen(__be16 o_flags)
{
	int addend = 4;

	if (o_flags&TUNNEL_CSUM)
		addend += 4;
	if (o_flags&TUNNEL_KEY)
		addend += 4;
	if (o_flags&TUNNEL_SEQ)
		addend += 4;
	return addend;
}
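
/* Example: o_flags = TUNNEL_CSUM | TUNNEL_KEY yields 4 (base header) +
 * 4 (checksum word) + 4 (key) = 12 bytes; with all three option flags set
 * the GRE header reaches the 16-byte maximum built by this driver.
 */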

static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
			    bool *csum_err, int *hdr_len)
{
	unsigned int ip_hlen = ip_hdrlen(skb);
	const struct gre_base_hdr *greh;
	__be32 *options;

	if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
		return -EINVAL;

	greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
	if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
		return -EINVAL;

	tpi->flags = gre_flags_to_tnl_flags(greh->flags);
	*hdr_len = ip_gre_calc_hlen(tpi->flags);

	if (!pskb_may_pull(skb, *hdr_len))
		return -EINVAL;

	greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);

	tpi->proto = greh->protocol;

	options = (__be32 *)(greh + 1);
	if (greh->flags & GRE_CSUM) {
		if (check_checksum(skb)) {
			*csum_err = true;
			return -EINVAL;
		}
		options++;
	}

	if (greh->flags & GRE_KEY) {
		tpi->key = *options;
		options++;
	} else
		tpi->key = 0;

	if (unlikely(greh->flags & GRE_SEQ)) {
		tpi->seq = *options;
		options++;
	} else
		tpi->seq = 0;

	/* WCCP version 1 and 2 protocol decoding.
	 * - Change protocol to IP
	 * - When dealing with WCCPv2, skip the extra 4 bytes in the GRE header
	 */
	if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
		tpi->proto = htons(ETH_P_IP);
		if ((*(u8 *)options & 0xF0) != 0x40) {
			*hdr_len += 4;
			if (!pskb_may_pull(skb, *hdr_len))
				return -EINVAL;
		}
	}

	return 0;
}
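
/* Wire layout handled by parse_gre_header() (RFC 2784/2890):
 *
 *	|C| |K|S| Reserved0 |Ver|   Protocol Type    |
 *	|   Checksum (opt)  |   Reserved1 (opt)      |
 *	|                 Key (opt)                  |
 *	|           Sequence Number (opt)            |
 *
 * The optional words always appear in this order on the wire, which is
 * why the options pointer above is advanced past each flagged field in
 * turn (checksum first, then key, then sequence number).
 */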

static void ipgre_err(struct sk_buff *skb, u32 info)
{

	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. This means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco's "wise men" put the GRE key in the third word
	   of the GRE header. This makes it impossible to maintain even soft
	   state for keyed GRE tunnels with checksumming enabled. Tell
	   them "thank you".

	   Well, I wonder: rfc1812 was written by a Cisco employee.
	   Why the hell do these idiots break the standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	struct tnl_ptk_info tpi;
	int hdr_len;
	bool csum_err = false;

	if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
		if (!csum_err)	/* a bad csum is tolerated; bail on any other parse error */
			return;
	}

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;

	case ICMP_REDIRECT:
		break;
	}

	if (tpi.proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = (const struct iphdr *)skb->data;
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
			     iph->daddr, iph->saddr, tpi.key);

	if (t == NULL)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      IPPROTO_GRE, 0);
		return;
	}
	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

static int ipgre_rcv(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;
	struct tnl_ptk_info tpi;
	int hdr_len;
	bool csum_err = false;

	if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
		goto drop;

	if (tpi.proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
				  iph->saddr, iph->daddr, tpi.key);

	if (tunnel) {
		ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
		return 0;
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}

static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb)
{
	int err;

	if (skb_is_gso(skb)) {
		err = skb_unclone(skb, GFP_ATOMIC);
		if (unlikely(err))
			goto error;
		skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
		return skb;
	} else if (skb->ip_summed == CHECKSUM_PARTIAL &&
		   tunnel->parms.o_flags&TUNNEL_CSUM) {
		err = skb_checksum_help(skb);
		if (unlikely(err))
			goto error;
	} else if (skb->ip_summed != CHECKSUM_PARTIAL)
		skb->ip_summed = CHECKSUM_NONE;

	return skb;

error:
	kfree_skb(skb);
	return ERR_PTR(err);
}
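
/* Note on the GSO branch above: a GSO skb is only marked SKB_GSO_GRE and
 * otherwise left alone; segmentation happens later in the GRE GSO code,
 * and gre_build_header() below correspondingly skips the GRE checksum for
 * such skbs, since one checksum computed over the still-unsegmented
 * payload would be wrong for every resulting segment.
 */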

static struct sk_buff *gre_build_header(struct sk_buff *skb,
					const struct tnl_ptk_info *tpi,
					int hdr_len)
{
	struct gre_base_hdr *greh;

	skb_push(skb, hdr_len);

	greh = (struct gre_base_hdr *)skb->data;
	greh->flags = tnl_flags_to_gre_flags(tpi->flags);
	greh->protocol = tpi->proto;

	if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);

		if (tpi->flags&TUNNEL_SEQ) {
			*ptr = tpi->seq;
			ptr--;
		}
		if (tpi->flags&TUNNEL_KEY) {
			*ptr = tpi->key;
			ptr--;
		}
		if (tpi->flags&TUNNEL_CSUM &&
		    !(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
			*(__sum16 *)ptr = 0;
			*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
								 skb->len, 0));
		}
	}

	return skb;
}
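
/* gre_build_header() fills the option words back to front: ptr starts at
 * the last 32-bit slot of the pushed header and walks backwards, which
 * reproduces the wire order checksum/key/sequence without recomputing
 * offsets. Worked example for TUNNEL_KEY|TUNNEL_SEQ (hdr_len = 12):
 *
 *	ptr = greh + 12 - 4	-> seq written at byte offset 8
 *	ptr--			-> key written at byte offset 4
 */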

static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct tnl_ptk_info tpi;

	if (likely(!skb->encapsulation)) {
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
	}

	tpi.flags = tunnel->parms.o_flags;
	tpi.proto = proto;
	tpi.key = tunnel->parms.o_key;
	if (tunnel->parms.o_flags & TUNNEL_SEQ)
		tunnel->o_seqno++;
	tpi.seq = htonl(tunnel->o_seqno);

	/* Push GRE header. */
	skb = gre_build_header(skb, &tpi, tunnel->hlen);
	if (unlikely(!skb)) {
		dev->stats.tx_dropped++;
		return;
	}

	ip_tunnel_xmit(skb, dev, tnl_params);
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	skb = handle_offloads(tunnel, skb);
	if (IS_ERR(skb))
		goto out;

	if (dev->header_ops) {
		/* Need space for new headers */
		if (skb_cow_head(skb, dev->needed_headroom -
				      (tunnel->hlen + sizeof(struct iphdr))))
			goto free_skb;

		tnl_params = (const struct iphdr *)skb->data;

		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
		 * to the GRE header.
		 */
		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	__gre_xmit(skb, dev, tnl_params, skb->protocol);

	return NETDEV_TX_OK;

free_skb:
	dev_kfree_skb(skb);
out:
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	skb = handle_offloads(tunnel, skb);
	if (IS_ERR(skb))
		goto out;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));

	return NETDEV_TX_OK;

free_skb:
	dev_kfree_skb(skb);
out:
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_ioctl(struct net_device *dev,
			      struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
	    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
	    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) {
		return -EINVAL;
	}
	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
	p.o_flags = tnl_flags_to_gre_flags(p.o_flags);

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return 0;
}
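
/* For reference, userspace reaches this handler through the generic
 * tunnel ioctls; a minimal (hypothetical) sketch of a caller:
 *
 *	struct ip_tunnel_parm p = { ... };
 *	struct ifreq ifr;
 *
 *	strncpy(ifr.ifr_name, "gre1", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCGETTUNNEL, &ifr);	(fd: any AF_INET socket)
 *
 * Note that i_flags/o_flags cross this boundary in wire (GRE_*) form and
 * are converted to TUNNEL_* flags above before ip_tunnel_ioctl() runs.
 */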

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco may have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph+1);
	greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen;

	return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(dev_net(dev), &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;
		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
};

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	tunnel->hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

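	/* The "+ 4"/"- 4" below account only for the mandatory GRE base
	 * header; the option words are already included in tunnel->hlen
	 * computed above. Worked example for a plain tunnel over Ethernet:
	 * mtu = 1500 (ETH_DATA_LEN) - 20 (outer IP) - 4 (GRE) = 1476.
	 */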
	dev->needed_headroom	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;

	dev->features		|= NETIF_F_NETNS_LOCAL | GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported. */
		dev->features    |= NETIF_F_GSO_SOFTWARE;
		dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		/* Can use a lockless transmit, unless we generate
		 * output sequences
		 */
		dev->features |= NETIF_F_LLTX;
	}
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->type		= ARPHRD_IPGRE;
	dev->flags		= IFF_NOARP;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
	dev->addr_len		= 4;

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);
	ip_tunnel_delete_net(itn);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data);
}

static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[],
			       struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
		parms->iph.frag_off = htons(IP_DF);
}
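
/* These attributes are what "ip link" emits; e.g. (illustrative command):
 *
 *	ip link add gre1 type gre local 10.0.0.1 remote 10.0.0.2 ttl 64 key 42
 *
 * maps to IFLA_GRE_LOCAL/REMOTE/TTL, and "key" sets both IFLA_GRE_IKEY
 * and IFLA_GRE_OKEY. PMTU discovery defaults to on (frag_off = IP_DF)
 * unless IFLA_GRE_PMTUDISC is sent as 0, matching the final test above.
 */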

static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops		= &gre_tap_netdev_ops;
	ip_tunnel_setup(dev, gre_tap_net_id);
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct ip_tunnel_parm p;

	ipgre_netlink_parms(data, tb, &p);
	return ip_tunnel_newlink(dev, tb, &p);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel_parm p;

	ipgre_netlink_parms(data, tb, &p);
	return ip_tunnel_changelink(dev, tb, &p);
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_be32(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_be32(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))))
		goto nla_put_failure;
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};

static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL);
}

static void __net_exit ipgre_tap_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
	ip_tunnel_delete_net(itn);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit = ipgre_tap_exit_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	return 0;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
		pr_info("%s: can't remove protocol\n", __func__);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");