xref: /openbmc/linux/net/ipv4/ip_gre.c (revision 6aa7de05)
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion" in
   this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols using packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse the packet, trying to detect an inner
   encapsulation made by our node. It is difficult or even impossible,
   especially taking fragmentation into account. In short, ttl is not a
   solution at all.

   Current solution: the solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us, we did
   all that we could. Even if it was your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
				__be32 id, u32 index, bool truncate);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

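/* Process an ICMP error that quotes a GRE packet we sent earlier:
 * filter out ICMP types we cannot act on, look the tunnel up from the
 * inner IP header carried in the ICMP payload, and record the event in
 * t->err_count/t->err_time so the transmit path can throttle and relay
 * the error.
 */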
static void ipgre_err(struct sk_buff *skb, u32 info,
		      const struct tnl_ptk_info *tpi)
{
	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put the GRE key in the third word
	   of the GRE header. It makes it impossible to maintain even soft
	   state for keyed GRE tunnels with checksums enabled. Tell
	   them "thank you".

	   Well, I wonder, rfc1812 was written by a Cisco employee;
	   why the hell do these idiots break standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
		break;

	case ICMP_REDIRECT:
		break;
	}

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (!t)
		return;

#if IS_ENABLED(CONFIG_IPV6)
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return;
#endif

	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

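/* ICMP error handler registered for IPPROTO_GRE: parse the GRE header
 * quoted inside the ICMP payload, handle path MTU updates and redirects
 * directly, and defer everything else to ipgre_err().
 */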
static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put the GRE key in the third word
	 * of the GRE header. It makes it impossible to maintain even
	 * soft state for keyed GRE tunnels with checksums enabled.
	 * Tell them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by a Cisco employee;
	 * why the hell do these idiots break standards established
	 * by themselves???
	 */

	const struct iphdr *iph = (struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;
	bool csum_err = false;

	if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
			     iph->ihl * 4) < 0) {
		if (!csum_err)		/* ignore csum errors. */
			return;
	}

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
			      IPPROTO_GRE, 0);
		return;
	}

	ipgre_err(skb, info, &tpi);
}

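/* Receive path for ERSPAN-in-GRE.  The 10-bit ERSPAN session ID doubles
 * as the tunnel key for the lookup.  For collect_md tunnels the session
 * metadata travels with the skb as a metadata dst; otherwise the index
 * is stored in the tunnel itself.
 */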
static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		      int gre_hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct metadata_dst *tun_dst = NULL;
	struct ip_tunnel_net *itn;
	struct ip_tunnel *tunnel;
	struct erspanhdr *ershdr;
	const struct iphdr *iph;
	__be32 index;
	int len;

	itn = net_generic(net, erspan_net_id);
	len = gre_hdr_len + sizeof(*ershdr);

	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	iph = ip_hdr(skb);
	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);

	/* The original GRE header does not have a key field,
	 * so use the ERSPAN 10-bit session ID as the key.
	 */
	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
	index = ershdr->md.index;
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
				  tpi->flags | TUNNEL_KEY,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		if (__iptunnel_pull_header(skb,
					   gre_hdr_len + sizeof(*ershdr),
					   htons(ETH_P_TEB),
					   false, false) < 0)
			goto drop;

		if (tunnel->collect_md) {
			struct ip_tunnel_info *info;
			struct erspan_metadata *md;
			__be64 tun_id;
			__be16 flags;

			tpi->flags |= TUNNEL_KEY;
			flags = tpi->flags;
			tun_id = key32_to_tunnel_id(tpi->key);

			tun_dst = ip_tun_rx_dst(skb, flags,
						tun_id, sizeof(*md));
			if (!tun_dst)
				return PACKET_REJECT;

			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
			if (!md)
				return PACKET_REJECT;

			md->index = index;
			info = &tun_dst->u.tun_info;
			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
			info->options_len = sizeof(*md);
		} else {
			tunnel->index = ntohl(index);
		}

		skb_reset_mac_header(skb);
		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

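/* Common GRE receive: find a matching tunnel, strip the outer headers
 * and hand the packet to ip_tunnel_rcv().  Returns PACKET_RCVD when the
 * skb was consumed (delivered or dropped), PACKET_REJECT when metadata
 * allocation failed, and PACKET_NEXT when no tunnel matched.
 */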
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
	struct metadata_dst *tun_dst = NULL;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
					   raw_proto, false) < 0)
			goto drop;

		if (tunnel->dev->type != ARPHRD_NONE)
			skb_pop_mac_header(skb);
		else
			skb_reset_mac_header(skb);
		if (tunnel->collect_md) {
			__be16 flags;
			__be64 tun_id;

			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
			tun_id = key32_to_tunnel_id(tpi->key);
			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
			if (!tun_dst)
				return PACKET_REJECT;
		}

		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_NEXT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

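/* Look the packet up in the gretap table first; plain ipgre devices in
 * collect metadata mode get a second chance at ETH_P_TEB traffic that
 * no gretap device claimed.
 */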
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		     int hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	int res;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
		/* ipgre tunnels in collect metadata mode should also
		 * receive ETH_P_TEB traffic.
		 */
		itn = net_generic(net, ipgre_net_id);
		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
	}
	return res;
}

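/* Entry point for all received GRE packets (ipgre_protocol.handler):
 * drop looped-back multicast, parse the base GRE header, and dispatch
 * to the ERSPAN or plain GRE receive paths.  Unclaimed packets are
 * answered with an ICMP port-unreachable and dropped.
 */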
static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
	if (hdr_len < 0)
		goto drop;

	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
			return 0;
	}

	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
		return 0;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}

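/* Push the GRE header (bumping the output sequence number when
 * TUNNEL_SEQ is set) and hand the packet to the generic IP tunnel
 * transmit path.
 */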
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->parms.o_flags & TUNNEL_SEQ)
		tunnel->o_seqno++;

	/* Push GRE header. */
	gre_build_header(skb, tunnel->tun_hlen,
			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
			 htonl(tunnel->o_seqno));

	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static struct rtable *gre_get_rt(struct sk_buff *skb,
				 struct net_device *dev,
				 struct flowi4 *fl,
				 const struct ip_tunnel_key *key)
{
	struct net *net = dev_net(dev);

	memset(fl, 0, sizeof(*fl));
	fl->daddr = key->u.ipv4.dst;
	fl->saddr = key->u.ipv4.src;
	fl->flowi4_tos = RT_TOS(key->tos);
	fl->flowi4_mark = skb->mark;
	fl->flowi4_proto = IPPROTO_GRE;

	return ip_route_output_key(net, fl);
}

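/* Route lookup and headroom preparation shared by the collect_md
 * ("flow based") transmit paths: reuse the cached dst when possible,
 * fall back to a fresh route lookup, and expand the skb head if the
 * outer headers will not fit.  On failure the skb is freed and NULL
 * is returned.
 */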
static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
				      struct net_device *dev,
				      struct flowi4 *fl,
				      int tunnel_hlen)
{
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct rtable *rt = NULL;
	int min_headroom;
	bool use_cache;
	int err;

	tun_info = skb_tunnel_info(skb);
	key = &tun_info->key;
	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);

	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
	if (!rt) {
		rt = gre_get_rt(skb, dev, fl, key);
		if (IS_ERR(rt))
			goto err_free_skb;
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl->saddr);
	}

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ tunnel_hlen + sizeof(struct iphdr);
	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) +
						16);
		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
				       0, GFP_ATOMIC);
		if (unlikely(err))
			goto err_free_rt;
	}
	return rt;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NULL;
}

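/* Transmit for collect_md GRE tunnels: addressing, keying and checksum
 * decisions all come from the per-packet tunnel metadata instead of
 * the device configuration.
 */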
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			__be16 proto)
{
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct rtable *rt = NULL;
	struct flowi4 fl;
	int tunnel_hlen;
	__be16 df, flags;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	tunnel_hlen = gre_calc_hlen(key->tun_flags);

	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
	if (!rt)
		return;

	/* Push Tunnel header. */
	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
		goto err_free_rt;

	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
	gre_build_header(skb, tunnel_hlen, flags, proto,
			 tunnel_id_to_key32(tun_info->key.tun_id), 0);

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
		      key->tos, key->ttl, df, false);
	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

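/* collect_md transmit for ERSPAN: like gre_fb_xmit(), but frames longer
 * than the device MTU are truncated (and flagged as such in the ERSPAN
 * header) rather than fragmented, and a sequence number is always sent.
 */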
static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			   __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct erspan_metadata *md;
	struct rtable *rt = NULL;
	bool truncate = false;
	struct flowi4 fl;
	int tunnel_hlen;
	__be16 df;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;

	/* ERSPAN has a fixed 8-byte GRE header */
	tunnel_hlen = 8 + sizeof(struct erspanhdr);

	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
	if (!rt)
		return;

	if (gre_handle_offloads(skb, false))
		goto err_free_rt;

	if (skb->len > dev->mtu) {
		pskb_trim(skb, dev->mtu);
		truncate = true;
	}

	md = ip_tunnel_info_opts(tun_info);
	if (!md)
		goto err_free_rt;

	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
			    ntohl(md->index), truncate);

	gre_build_header(skb, 8, TUNNEL_SEQ,
			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
		      key->tos, key->ttl, df, false);
	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct rtable *rt;
	struct flowi4 fl4;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	rt = gre_get_rt(skb, dev, &fl4, &info->key);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);
	info->key.u.ipv4.src = fl4.saddr;
	return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (dev->header_ops) {
		/* Need space for new headers */
		if (skb_cow_head(skb, dev->needed_headroom -
				      (tunnel->hlen + sizeof(struct iphdr))))
			goto free_skb;

		tnl_params = (const struct iphdr *)skb->data;

		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
		 * to gre header.
		 */
		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
		skb_reset_mac_header(skb);
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	__gre_xmit(skb, dev, tnl_params, skb->protocol);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

/* Map the IPv4 TOS byte to a 3-bit CoS: drop the two ECN bits to get
 * the DSCP, then keep its top three (class selector) bits.
 */
static inline u8 tos_to_cos(u8 tos)
{
	u8 dscp, cos;

	dscp = tos >> 2;
	cos = dscp >> 3;
	return cos;
}

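/* Build the ERSPAN header in front of the mirrored frame: version and
 * VLAN tci, the 10-bit session ID combined with CoS, encapsulation type
 * and truncation flag, and the index field.
 */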
static void erspan_build_header(struct sk_buff *skb,
				__be32 id, u32 index, bool truncate)
{
	struct iphdr *iphdr = ip_hdr(skb);
	struct ethhdr *eth = eth_hdr(skb);
	enum erspan_encap_type enc_type;
	struct erspanhdr *ershdr;
	struct qtag_prefix {
		__be16 eth_type;
		__be16 tci;
	} *qp;
	u16 vlan_tci = 0;

	enc_type = ERSPAN_ENCAP_NOVLAN;

	/* If the mirrored packet has a vlan tag, extract the tci and
	 * preserve the vlan header in the mirrored frame.
	 */
	if (eth->h_proto == htons(ETH_P_8021Q)) {
		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
		vlan_tci = ntohs(qp->tci);
		enc_type = ERSPAN_ENCAP_INFRAME;
	}

	skb_push(skb, sizeof(*ershdr));
	ershdr = (struct erspanhdr *)skb->data;
	memset(ershdr, 0, sizeof(*ershdr));

	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
				 (ERSPAN_VERSION << VER_OFFSET));
	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
			   (enc_type << EN_OFFSET & EN_MASK) |
			   ((truncate << T_OFFSET) & T_MASK));
	ershdr->md.index = htonl(index & INDEX_MASK);
}

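/* Transmit for configured (non-collect_md) ERSPAN tunnels: oversized
 * frames are truncated and flagged instead of fragmented, and the GRE
 * key flag is cleared because the session ID already travels in the
 * ERSPAN header.
 */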
static netdev_tx_t erspan_xmit(struct sk_buff *skb,
			       struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	bool truncate = false;

	if (tunnel->collect_md) {
		erspan_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, false))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	if (skb->len - dev->hard_header_len > dev->mtu) {
		pskb_trim(skb, dev->mtu);
		truncate = true;
	}

	/* Push ERSPAN header */
	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
	tunnel->parms.o_flags &= ~TUNNEL_KEY;
	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

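/* Legacy ioctl configuration (SIOCADDTUNNEL and friends): validate the
 * user-supplied parameters, translate between the on-wire GRE flag
 * format and the internal TUNNEL_* representation, and defer to the
 * generic ip_tunnel_ioctl().
 */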
static int ipgre_tunnel_ioctl(struct net_device *dev,
			      struct ifreq *ifr, int cmd)
{
	int err;
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			return -EINVAL;
	}
	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
	p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could do something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph+1);
	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen + sizeof(*iph);

	return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);

	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
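/* Multicast ("broadcast LAN") tunnels must join the multicast group on
 * the underlying device before traffic can flow; open/close manage that
 * membership.
 */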
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(t->net, &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (!__in_dev_get_rtnl(dev))
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;

		in_dev = inetdev_by_index(t->net, t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}

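/* Shared device init for gre and gretap: size the headroom and MTU from
 * the configured GRE options and pick offload features.  TUNNEL_SEQ
 * disables lockless transmit and GSO (the header must be updated per
 * packet), and GSO is also unavailable when GRE checksums are combined
 * with an outer encapsulation.
 */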
static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	int t_hlen;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

	t_hlen = tunnel->hlen + sizeof(struct iphdr);

	dev->needed_headroom	= LL_MAX_HEADER + t_hlen + 4;
	dev->mtu		= ETH_DATA_LEN - t_hlen - 4;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported, nor
		 * can we support 2 levels of outer headers requiring
		 * an update.
		 */
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
			dev->features    |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		}

		/* Can use a lockless transmit, unless we generate
		 * output sequences
		 */
		dev->features |= NETIF_F_LLTX;
	}
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else if (!tunnel->collect_md) {
		dev->header_ops = &ipgre_header_ops;
	}

	return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);

	ip_tunnel_delete_net(itn, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
				 struct netlink_ext_ack *extack)
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	if (data[IFLA_GRE_COLLECT_METADATA] &&
	    data[IFLA_GRE_ENCAP_TYPE] &&
	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
			      struct netlink_ext_ack *extack)
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
			   struct netlink_ext_ack *extack)
{
	__be16 flags = 0;
	int ret;

	if (!data)
		return 0;

	ret = ipgre_tap_validate(tb, data, extack);
	if (ret)
		return ret;

	/* ERSPAN should only have the GRE sequence and key flags */
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (!data[IFLA_GRE_COLLECT_METADATA] &&
	    flags != (GRE_SEQ | GRE_KEY))
		return -EINVAL;

	/* The ERSPAN session ID is only 10 bits. Since we reuse the
	 * 32-bit key field as the ID, check its range.
	 */
	if (data[IFLA_GRE_IKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
		return -EINVAL;

	if (data[IFLA_GRE_OKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
		return -EINVAL;

	return 0;
}

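/* Translate IFLA_GRE_* netlink attributes into ip_tunnel_parm.  Path
 * MTU discovery (the DF bit) defaults to on unless IFLA_GRE_PMTUDISC
 * disables it, and it is mutually exclusive with IFLA_GRE_IGNORE_DF.
 */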
static int ipgre_netlink_parms(struct net_device *dev,
				struct nlattr *data[],
				struct nlattr *tb[],
				struct ip_tunnel_parm *parms,
				__u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);

	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return 0;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
		if (t->ignore_df)
			return -EINVAL;
		parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_GRE_COLLECT_METADATA]) {
		t->collect_md = true;
		if (dev->type == ARPHRD_IPGRE)
			dev->type = ARPHRD_NONE;
	}

	if (data[IFLA_GRE_IGNORE_DF]) {
		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
		  && (parms->iph.frag_off & htons(IP_DF)))
			return -EINVAL;
		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
	}

	if (data[IFLA_GRE_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

	if (data[IFLA_GRE_ERSPAN_INDEX]) {
		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);

		if (t->index & ~INDEX_MASK)
			return -EINVAL;
	}

	return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_GRE_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
	}

	if (data[IFLA_GRE_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
	}

	if (data[IFLA_GRE_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
	}

	if (data[IFLA_GRE_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
	}

	return ret;
}

static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen;

	tunnel->tun_hlen = 8;
	tunnel->parms.iph.protocol = IPPROTO_GRE;
	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
		       sizeof(struct erspanhdr);
	t_hlen = tunnel->hlen + sizeof(struct iphdr);

	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
	dev->mtu = ETH_DATA_LEN - t_hlen - 4;
	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
	.ndo_init		= erspan_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= erspan_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops	= &gre_tap_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = 0;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);

		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = t->fwmark;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		/* IFLA_GRE_COLLECT_METADATA */
		nla_total_size(0) +
		/* IFLA_GRE_IGNORE_DF */
		nla_total_size(1) +
		/* IFLA_GRE_FWMARK */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_INDEX */
		nla_total_size(4) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))) ||
	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
		goto nla_put_failure;

	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
			t->encap.type) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
			 t->encap.sport) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
			 t->encap.dport) ||
	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
			t->encap.flags))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
		goto nla_put_failure;

	if (t->collect_md) {
		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
			goto nla_put_failure;
	}

	if (t->index)
		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
			goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops = &erspan_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, erspan_net_id);
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
	.kind		= "erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= erspan_setup,
	.validate	= erspan_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

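/* Create a flow-based (collect_md) gretap device, e.g. for openvswitch:
 * build the link from scratch, lift the MTU cap so packet sizes are
 * unrestricted, and fully register the device before returning it.
 */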
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	struct ip_tunnel *t;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb);
	if (IS_ERR(dev))
		return dev;

	/* Configure flow based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out;

	return dev;
out:
	ip_tunnel_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);

static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);

	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit = ipgre_tap_exit_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, erspan_net_id,
				  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);

	ip_tunnel_delete_net(itn, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
	.init = erspan_init_net,
	.exit = erspan_exit_net,
	.id   = &erspan_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = register_pernet_device(&erspan_net_ops);
	if (err < 0)
		goto pnet_erspan_failed;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&erspan_link_ops);
	if (err < 0)
		goto erspan_link_failed;

	return 0;

erspan_link_failed:
	rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");