/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by a stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since cpu migration is forbidden once we enter the first
   ndo_xmit(). We force an exit if this counter reaches RECURSION_LIMIT
   (see the sketch below).

   2. Networking dead loops would not kill routers, but they would
   really kill the network. The IP hop limit plays the role of
   "t->recursion" in this case, if we copy it from the packet being
   encapsulated to the upper header. It is a very good solution, but
   it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and the traceroute output
     would be even more informative. This idea turned out to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we must search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. That is difficult or even
   impossible, especially taking fragmentation into account. In short,
   ttl is no solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit;
   that is ALL. :-) Well, it does not remove the problem completely,
   but the exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when the encapsulated packets have
   DF set. But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it was your gated that injected the
   fatal route into the network, even if it was you who configured the
   fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
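
/*
 * A minimal sketch of the xmit_recursion pattern described above, kept
 * out of the build on purpose.  The identifiers (xmit_recursion,
 * RECURSION_LIMIT, example_xmit) are illustrative only; the real
 * counter lives in the core stack, not in this driver.
 */
#if 0
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 4

static int example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	/* CPU migration is disabled while we are inside ndo_start_xmit(),
	 * so a plain per-cpu counter is enough to catch a local loop.
	 */
	if (unlikely(__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)) {
		kfree_skb(skb);		/* break the dead loop */
		return NET_XMIT_DROP;
	}
	__this_cpu_inc(xmit_recursion);
	/* ... encapsulate and hand the skb to the lower device ... */
	__this_cpu_dec(xmit_recursion);
	return NET_XMIT_SUCCESS;
}
#endif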

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
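
/* The flag can also be flipped at run time (assuming the usual module
 * sysfs layout): /sys/module/ip_gre/parameters/log_ecn_error
 */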

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
				u32 id, u32 index,
				bool truncate, bool is_ipv4);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

static void ipgre_err(struct sk_buff *skb, u32 info,
		      const struct tnl_ptk_info *tpi)
{

	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put the GRE key in the third word
	   of the GRE header. That makes it impossible to maintain even
	   soft state for keyed GRE tunnels with checksums enabled. Tell
	   them "thank you".

	   Well, I wonder: rfc1812 was written by a Cisco employee;
	   why the hell do these idiots break the standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH;
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		/* RFC 4884 4.1: original datagram length, in 32-bit words */
		data_len = icmp_hdr(skb)->un.reserved[1] * 4;
		break;

	case ICMP_REDIRECT:
		break;
	}

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
		 tpi->proto == htons(ETH_P_ERSPAN2))
		itn = net_generic(net, erspan_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (!t)
		return;

#if IS_ENABLED(CONFIG_IPV6)
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return;
#endif

	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put the GRE key in the third word
	 * of the GRE header. That makes it impossible to maintain even
	 * soft state for keyed GRE tunnels with checksums enabled.
	 * Tell them "thank you".
	 *
	 * Well, I wonder: rfc1812 was written by a Cisco employee;
	 * why the hell do these idiots break the standards established
	 * by themselves???
	 */

	const struct iphdr *iph = (struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;

	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
			     iph->ihl * 4) < 0)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, IPPROTO_GRE);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
			      IPPROTO_GRE);
		return;
	}

	ipgre_err(skb, info, &tpi);
}

static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		      int gre_hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct metadata_dst *tun_dst = NULL;
	struct erspan_base_hdr *ershdr;
	struct erspan_metadata *pkt_md;
	struct ip_tunnel_net *itn;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	struct erspan_md2 *md2;
	int ver;
	int len;

	itn = net_generic(net, erspan_net_id);
	len = gre_hdr_len + sizeof(*ershdr);

	/* Check the base header length */
	if (unlikely(!pskb_may_pull(skb, len)))
		return PACKET_REJECT;

	iph = ip_hdr(skb);
	ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
	ver = ershdr->ver;

	/* The original GRE header does not have a key field,
	 * so use the ERSPAN 10-bit session ID as the key.
	 */
	tpi->key = cpu_to_be32(get_session_id(ershdr));
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
				  tpi->flags | TUNNEL_KEY,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		len = gre_hdr_len + erspan_hdr_len(ver);
		if (unlikely(!pskb_may_pull(skb, len)))
			return PACKET_REJECT;

		ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
		pkt_md = (struct erspan_metadata *)(ershdr + 1);

		if (__iptunnel_pull_header(skb,
					   len,
					   htons(ETH_P_TEB),
					   false, false) < 0)
			goto drop;

		if (tunnel->collect_md) {
			struct ip_tunnel_info *info;
			struct erspan_metadata *md;
			__be64 tun_id;
			__be16 flags;

			tpi->flags |= TUNNEL_KEY;
			flags = tpi->flags;
			tun_id = key32_to_tunnel_id(tpi->key);

			tun_dst = ip_tun_rx_dst(skb, flags,
						tun_id, sizeof(*md));
			if (!tun_dst)
				return PACKET_REJECT;

			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
			md->version = ver;
			md2 = &md->u.md2;
			memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
						       ERSPAN_V2_MDSIZE);

			info = &tun_dst->u.tun_info;
			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
			info->options_len = sizeof(*md);
		}

		skb_reset_mac_header(skb);
		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_REJECT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

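/* Look up a tunnel for the packet in @itn and deliver it there.
 * Returns PACKET_RCVD if the skb was consumed (delivered or dropped),
 * PACKET_REJECT if metadata collection failed, or PACKET_NEXT if no
 * tunnel matched and the caller may try another tunnel table.
 */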
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
	struct metadata_dst *tun_dst = NULL;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
					   raw_proto, false) < 0)
			goto drop;

		if (tunnel->dev->type != ARPHRD_NONE)
			skb_pop_mac_header(skb);
		else
			skb_reset_mac_header(skb);
		if (tunnel->collect_md) {
			__be16 flags;
			__be64 tun_id;

			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
			tun_id = key32_to_tunnel_id(tpi->key);
			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
			if (!tun_dst)
				return PACKET_REJECT;
		}

		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_NEXT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		     int hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	int res;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
		/* ipgre tunnels in collect-metadata mode should also
		 * receive ETH_P_TEB traffic.
		 */
		itn = net_generic(net, ipgre_net_id);
		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
	}
	return res;
}

static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
	if (hdr_len < 0)
		goto drop;

	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
		     tpi.proto == htons(ETH_P_ERSPAN2))) {
		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
			return 0;
		goto out;
	}

	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
		return 0;

out:
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}

static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->parms.o_flags & TUNNEL_SEQ)
		tunnel->o_seqno++;

	/* Push GRE header. */
	gre_build_header(skb, tunnel->tun_hlen,
			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
			 htonl(tunnel->o_seqno));

	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static struct rtable *gre_get_rt(struct sk_buff *skb,
				 struct net_device *dev,
				 struct flowi4 *fl,
				 const struct ip_tunnel_key *key)
{
	struct net *net = dev_net(dev);

	memset(fl, 0, sizeof(*fl));
	fl->daddr = key->u.ipv4.dst;
	fl->saddr = key->u.ipv4.src;
	fl->flowi4_tos = RT_TOS(key->tos);
	fl->flowi4_mark = skb->mark;
	fl->flowi4_proto = IPPROTO_GRE;

	return ip_route_output_key(net, fl);
}

static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
				      struct net_device *dev,
				      struct flowi4 *fl,
				      int tunnel_hlen)
{
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct rtable *rt = NULL;
	int min_headroom;
	bool use_cache;
	int err;

	tun_info = skb_tunnel_info(skb);
	key = &tun_info->key;
	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);

	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
	if (!rt) {
		rt = gre_get_rt(skb, dev, fl, key);
		if (IS_ERR(rt))
			goto err_free_skb;
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl->saddr);
	}

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ tunnel_hlen + sizeof(struct iphdr);
	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) +
						16);

		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
				       0, GFP_ATOMIC);
		if (unlikely(err))
			goto err_free_rt;
	}
	return rt;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NULL;
}

static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			__be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct rtable *rt = NULL;
	struct flowi4 fl;
	int tunnel_hlen;
	__be16 df, flags;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	tunnel_hlen = gre_calc_hlen(key->tun_flags);

	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
	if (!rt)
		return;

	/* Push tunnel header. */
	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
		goto err_free_rt;

	flags = tun_info->key.tun_flags &
		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
	gre_build_header(skb, tunnel_hlen, flags, proto,
			 tunnel_id_to_key32(tun_info->key.tun_id),
			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
		      key->tos, key->ttl, df, false);
	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			   __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct erspan_metadata *md;
	struct rtable *rt = NULL;
	bool truncate = false;
	struct flowi4 fl;
	int tunnel_hlen;
	int version;
	__be16 df;
	int nhoff;
	int thoff;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
		goto err_free_rt;
	md = ip_tunnel_info_opts(tun_info);
	if (!md)
		goto err_free_rt;

	/* ERSPAN has a fixed 8-byte GRE header */
	version = md->version;
	tunnel_hlen = 8 + erspan_hdr_len(version);

	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
	if (!rt)
		return;

	if (gre_handle_offloads(skb, false))
		goto err_free_rt;

	if (skb->len > dev->mtu + dev->hard_header_len) {
		pskb_trim(skb, dev->mtu + dev->hard_header_len);
		truncate = true;
	}

	nhoff = skb_network_header(skb) - skb_mac_header(skb);
	if (skb->protocol == htons(ETH_P_IP) &&
	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
		truncate = true;

	thoff = skb_transport_header(skb) - skb_mac_header(skb);
	if (skb->protocol == htons(ETH_P_IPV6) &&
	    (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
		truncate = true;

	if (version == 1) {
		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
				    ntohl(md->u.index), truncate, true);
	} else if (version == 2) {
		erspan_build_header_v2(skb,
				       ntohl(tunnel_id_to_key32(key->tun_id)),
				       md->u.md2.dir,
				       get_hwid(&md->u.md2),
				       truncate, true);
	} else {
		goto err_free_rt;
	}

	gre_build_header(skb, 8, TUNNEL_SEQ,
			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
		      key->tos, key->ttl, df, false);
	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct rtable *rt;
	struct flowi4 fl4;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	rt = gre_get_rt(skb, dev, &fl4, &info->key);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);
	info->key.u.ipv4.src = fl4.saddr;
	return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (dev->header_ops) {
		/* Need space for new headers */
		if (skb_cow_head(skb, dev->needed_headroom -
				      (tunnel->hlen + sizeof(struct iphdr))))
			goto free_skb;

		tnl_params = (const struct iphdr *)skb->data;

		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
		 * to the GRE header.
		 */
		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
		skb_reset_mac_header(skb);
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	__gre_xmit(skb, dev, tnl_params, skb->protocol);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
			       struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	bool truncate = false;

	if (tunnel->collect_md) {
		erspan_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, false))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	if (skb->len > dev->mtu + dev->hard_header_len) {
		pskb_trim(skb, dev->mtu + dev->hard_header_len);
		truncate = true;
	}

	/* Push ERSPAN header */
	if (tunnel->erspan_ver == 1)
		erspan_build_header(skb, ntohl(tunnel->parms.o_key),
				    tunnel->index,
				    truncate, true);
	else if (tunnel->erspan_ver == 2)
		erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
				       tunnel->dir, tunnel->hwid,
				       truncate, true);
	else
		goto free_skb;

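	/* The session ID already travels in the ERSPAN header built
	 * above; clear TUNNEL_KEY so that __gre_xmit() does not emit
	 * a GRE key field on top of it.
	 */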
	tunnel->parms.o_flags &= ~TUNNEL_KEY;
	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int len;

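	/* Recompute the GRE header length from the output flags and
	 * propagate the delta to the total header length, the needed
	 * headroom and (optionally) the MTU.
	 */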
	len = tunnel->tun_hlen;
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	len = tunnel->tun_hlen - len;
	tunnel->hlen = tunnel->hlen + len;

	dev->needed_headroom = dev->needed_headroom + len;
	if (set_mtu)
		dev->mtu = max_t(int, dev->mtu - len, 68);

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
			dev->features |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		} else {
			dev->features &= ~NETIF_F_GSO_SOFTWARE;
			dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		}
		dev->features |= NETIF_F_LLTX;
	} else {
		dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
	}
}

static int ipgre_tunnel_ioctl(struct net_device *dev,
			      struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
		    ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
			return -EINVAL;
	}

	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	if (cmd == SIOCCHGTUNNEL) {
		struct ip_tunnel *t = netdev_priv(dev);

		t->parms.i_flags = p.i_flags;
		t->parms.o_flags = p.o_flags;

		if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
			ipgre_link_update(dev, true);
	}

	p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
	p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;

	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
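/* For reference, modern iproute2 equivalents of the legacy ifconfig
 * lines above (assumed syntax, untested here):
 *
 *   ip link set Universe up
 *   ip addr add 10.66.66.<somewhat>/24 dev Universe
 *   ip -6 addr add fe80::<Your_real_addr>/10 dev Universe
 *   ip -6 addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
 */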
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph + 1);
	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen + sizeof(*iph);

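	/* No destination yet: following the header_ops->create()
	 * convention, report a negative length for a partially built
	 * header whose address must be filled in later.
	 */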
	return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);

	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(t->net, &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (!__in_dev_get_rtnl(dev))
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;

		in_dev = inetdev_by_index(t->net, t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported, nor
		 * can we support 2 levels of outer headers requiring
		 * an update.
		 */
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
			dev->features    |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		}

		/* Can use a lockless transmit, unless we generate
		 * output sequences.
		 */
		dev->features |= NETIF_F_LLTX;
	}
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else if (!tunnel->collect_md) {
		dev->header_ops = &ipgre_header_ops;
	}

	return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit_batch = ipgre_exit_batch_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
				 struct netlink_ext_ack *extack)
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	if (data[IFLA_GRE_COLLECT_METADATA] &&
	    data[IFLA_GRE_ENCAP_TYPE] &&
	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
			      struct netlink_ext_ack *extack)
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
			   struct netlink_ext_ack *extack)
{
	__be16 flags = 0;
	int ret;

	if (!data)
		return 0;

	ret = ipgre_tap_validate(tb, data, extack);
	if (ret)
		return ret;

	/* ERSPAN should only have the GRE sequence and key flags */
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (!data[IFLA_GRE_COLLECT_METADATA] &&
	    flags != (GRE_SEQ | GRE_KEY))
		return -EINVAL;

	/* The ERSPAN session ID is only 10 bits. Since we reuse the
	 * 32-bit key field as the ID, check its range.
	 */
	if (data[IFLA_GRE_IKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
		return -EINVAL;

	if (data[IFLA_GRE_OKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
		return -EINVAL;

	return 0;
}

static int ipgre_netlink_parms(struct net_device *dev,
				struct nlattr *data[],
				struct nlattr *tb[],
				struct ip_tunnel_parm *parms,
				__u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);

	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return 0;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
		if (t->ignore_df)
			return -EINVAL;
		parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_GRE_COLLECT_METADATA]) {
		t->collect_md = true;
		if (dev->type == ARPHRD_IPGRE)
			dev->type = ARPHRD_NONE;
	}

	if (data[IFLA_GRE_IGNORE_DF]) {
		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) &&
		    (parms->iph.frag_off & htons(IP_DF)))
			return -EINVAL;
		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
	}

	if (data[IFLA_GRE_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

	if (data[IFLA_GRE_ERSPAN_VER]) {
		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);

		if (t->erspan_ver != 1 && t->erspan_ver != 2)
			return -EINVAL;
	}

	if (t->erspan_ver == 1) {
		if (data[IFLA_GRE_ERSPAN_INDEX]) {
			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
			if (t->index & ~INDEX_MASK)
				return -EINVAL;
		}
	} else if (t->erspan_ver == 2) {
		if (data[IFLA_GRE_ERSPAN_DIR]) {
			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
				return -EINVAL;
		}
		if (data[IFLA_GRE_ERSPAN_HWID]) {
			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
				return -EINVAL;
		}
	}

	return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_GRE_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
	}

	if (data[IFLA_GRE_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
	}

	if (data[IFLA_GRE_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
	}

	if (data[IFLA_GRE_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
	}

	return ret;
}

static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->tun_hlen = 8;
	tunnel->parms.iph.protocol = IPPROTO_GRE;
	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
		       erspan_hdr_len(tunnel->erspan_ver);

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
	.ndo_init		= erspan_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= erspan_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->max_mtu = 0;
	dev->netdev_ops	= &gre_tap_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}

bool is_gretap_dev(const struct net_device *dev)
{
	return dev->netdev_ops == &gre_tap_netdev_ops;
}
EXPORT_SYMBOL_GPL(is_gretap_dev);

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = 0;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);

		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = t->fwmark;
	struct ip_tunnel_parm p;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;

	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
	if (err < 0)
		return err;

	t->parms.i_flags = p.i_flags;
	t->parms.o_flags = p.o_flags;

	if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
		ipgre_link_update(dev, !tb[IFLA_MTU]);

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		/* IFLA_GRE_COLLECT_METADATA */
		nla_total_size(0) +
		/* IFLA_GRE_IGNORE_DF */
		nla_total_size(1) +
		/* IFLA_GRE_FWMARK */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_INDEX */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_VER */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_DIR */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_HWID */
		nla_total_size(2) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))) ||
	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
		goto nla_put_failure;

	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
			t->encap.type) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
			 t->encap.sport) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
			 t->encap.dport) ||
	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
			t->encap.flags))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
		goto nla_put_failure;

	if (t->collect_md) {
		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
			goto nla_put_failure;
	}

	if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
		goto nla_put_failure;

	if (t->erspan_ver == 1) {
		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
			goto nla_put_failure;
	} else if (t->erspan_ver == 2) {
		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
			goto nla_put_failure;
		if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	ether_setup(dev);
	dev->netdev_ops = &erspan_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, erspan_net_id);
	t->erspan_ver = 1;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
	.kind		= "erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= erspan_setup,
	.validate	= erspan_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	struct ip_tunnel *t;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb);
	if (IS_ERR(dev))
		return dev;

	/* Configure the flow-based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out;

	return dev;
out:
	ip_tunnel_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);

static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit_batch = ipgre_tap_exit_batch_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, erspan_net_id,
				  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
{
	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
	.init = erspan_init_net,
	.exit_batch = erspan_exit_batch_net,
	.id   = &erspan_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = register_pernet_device(&erspan_net_ops);
	if (err < 0)
		goto pnet_erspan_failed;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&erspan_link_ops);
	if (err < 0)
		goto erspan_link_failed;

	return 0;

erspan_link_failed:
	rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");