/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   with infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since when we enter the first ndo_xmit(), cpu migration is
   forbidden. We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but they would really
   kill the network. The IP hop limit plays the role of "t->recursion" in
   this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all other routers (at least, in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something reasonable,
   we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially
   taking fragmentation into account. To be short, ttl is not a solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it is your gated that injected
   the fatal route to the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
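/* For illustration only (hypothetical addresses): with iproute2, a tunnel
 * created with a preconfigured ttl has pmtudisc (and hence DF) enabled by
 * default, which is the behaviour described above; "ttl inherit" leaves
 * pmtu discovery to the inner packet:
 *
 *	ip tunnel add gre1 mode gre remote 203.0.113.2 local 203.0.113.1 ttl 64
 *	ip link set gre1 up
 *	ip addr add 192.0.2.1/30 dev gre1
 */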

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
				__be32 id, u32 index, bool truncate);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

static void ipgre_err(struct sk_buff *skb, u32 info,
		      const struct tnl_ptk_info *tpi)
{
	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. This means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put the GRE key in the third word
	   of the GRE header. That makes it impossible to maintain even
	   soft state for keyed GRE tunnels with checksums enabled. Tell
	   them "thank you".

	   Well, I wonder: rfc1812 was written by a Cisco employee,
	   so why the hell do these idiots break standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
		break;

	case ICMP_REDIRECT:
		break;
	}

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (!t)
		return;

#if IS_ENABLED(CONFIG_IPV6)
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return;
#endif

	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
}

static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. This means that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put the GRE key in the third word
	 * of the GRE header. That makes it impossible to maintain even
	 * soft state for keyed GRE tunnels with checksums enabled. Tell
	 * them "thank you".
	 *
	 * Well, I wonder: rfc1812 was written by a Cisco employee,
	 * so why the hell do these idiots break standards established
	 * by themselves???
	 */

	const struct iphdr *iph = (struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;
	bool csum_err = false;

	if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
			     iph->ihl * 4) < 0) {
		if (!csum_err)		/* give up unless the failure was a csum error */
			return;
	}

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
			      IPPROTO_GRE, 0);
		return;
	}

	ipgre_err(skb, info, &tpi);
}

static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		      int gre_hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct metadata_dst *tun_dst = NULL;
	struct ip_tunnel_net *itn;
	struct ip_tunnel *tunnel;
	struct erspanhdr *ershdr;
	const struct iphdr *iph;
	__be32 session_id;
	__be32 index;
	int len;

	itn = net_generic(net, erspan_net_id);
	len = gre_hdr_len + sizeof(*ershdr);

	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	iph = ip_hdr(skb);
	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);

	/* The original GRE header does not have a key field;
	 * use the ERSPAN 10-bit session ID as the key.
	 */
	session_id = cpu_to_be32(ntohs(ershdr->session_id));
	tpi->key = session_id;
	index = ershdr->md.index;
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
				  tpi->flags | TUNNEL_KEY,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		if (__iptunnel_pull_header(skb,
					   gre_hdr_len + sizeof(*ershdr),
					   htons(ETH_P_TEB),
					   false, false) < 0)
			goto drop;

		if (tunnel->collect_md) {
			struct ip_tunnel_info *info;
			struct erspan_metadata *md;
			__be64 tun_id;
			__be16 flags;

			tpi->flags |= TUNNEL_KEY;
			flags = tpi->flags;
			tun_id = key32_to_tunnel_id(tpi->key);

			tun_dst = ip_tun_rx_dst(skb, flags,
						tun_id, sizeof(*md));
			if (!tun_dst)
				return PACKET_REJECT;

			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
			if (!md)
				return PACKET_REJECT;

			md->index = index;
			info = &tun_dst->u.tun_info;
			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
			info->options_len = sizeof(*md);
		} else {
			tunnel->index = ntohl(index);
		}

		skb_reset_mac_header(skb);
		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

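/* Demultiplex a parsed GRE packet to a matching tunnel.  Returns
 * PACKET_RCVD when the skb has been consumed (delivered or dropped),
 * PACKET_REJECT on resource errors, and PACKET_NEXT when no tunnel
 * matched, so that the caller may retry the lookup in another table.
 */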
static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
	struct metadata_dst *tun_dst = NULL;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
					   raw_proto, false) < 0)
			goto drop;

		if (tunnel->dev->type != ARPHRD_NONE)
			skb_pop_mac_header(skb);
		else
			skb_reset_mac_header(skb);
		if (tunnel->collect_md) {
			__be16 flags;
			__be64 tun_id;

			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
			tun_id = key32_to_tunnel_id(tpi->key);
			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
			if (!tun_dst)
				return PACKET_REJECT;
		}

		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_NEXT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		     int hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	int res;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
		/* ipgre tunnels in collect-metadata mode should also
		 * receive ETH_P_TEB traffic.
		 */
		itn = net_generic(net, ipgre_net_id);
		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
	}
	return res;
}

static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
	if (hdr_len < 0)
		goto drop;

	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
			return 0;
	}

	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
		return 0;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}

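/* On transmit, gre_build_header() pushes the base GRE header (RFC 2784)
 * plus the optional fields selected by o_flags (RFC 2890):
 *
 *	|C| |K|S|   Reserved0   | Ver |     Protocol Type      |
 *	|    Checksum (optional)      |  Reserved1 (optional)  |
 *	|                    Key (optional)                    |
 *	|              Sequence Number (optional)              |
 */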
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->parms.o_flags & TUNNEL_SEQ)
		tunnel->o_seqno++;

	/* Push GRE header. */
	gre_build_header(skb, tunnel->tun_hlen,
			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
			 htonl(tunnel->o_seqno));

	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static struct rtable *gre_get_rt(struct sk_buff *skb,
				 struct net_device *dev,
				 struct flowi4 *fl,
				 const struct ip_tunnel_key *key)
{
	struct net *net = dev_net(dev);

	memset(fl, 0, sizeof(*fl));
	fl->daddr = key->u.ipv4.dst;
	fl->saddr = key->u.ipv4.src;
	fl->flowi4_tos = RT_TOS(key->tos);
	fl->flowi4_mark = skb->mark;
	fl->flowi4_proto = IPPROTO_GRE;

	return ip_route_output_key(net, fl);
}

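/* Resolve the route for a flow-based (collect_md) transmit and make sure
 * the skb has headroom for the outer IP and tunnel headers, expanding the
 * head if necessary.  On failure the skb is freed and NULL is returned.
 */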
static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
				      struct net_device *dev,
				      struct flowi4 *fl,
				      int tunnel_hlen)
{
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct rtable *rt = NULL;
	int min_headroom;
	bool use_cache;
	int err;

	tun_info = skb_tunnel_info(skb);
	key = &tun_info->key;
	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);

	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
	if (!rt) {
		rt = gre_get_rt(skb, dev, fl, key);
		if (IS_ERR(rt))
			goto err_free_skb;
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl->saddr);
	}

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ tunnel_hlen + sizeof(struct iphdr);
	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
		int head_delta = SKB_DATA_ALIGN(min_headroom -
						skb_headroom(skb) +
						16);
		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
				       0, GFP_ATOMIC);
		if (unlikely(err))
			goto err_free_rt;
	}
	return rt;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NULL;
}

static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			__be16 proto)
{
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct rtable *rt = NULL;
	struct flowi4 fl;
	int tunnel_hlen;
	__be16 df, flags;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	tunnel_hlen = gre_calc_hlen(key->tun_flags);

	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
	if (!rt)
		return;

	/* Push Tunnel header. */
	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
		goto err_free_rt;

	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
	gre_build_header(skb, tunnel_hlen, flags, proto,
			 tunnel_id_to_key32(tun_info->key.tun_id), 0);

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
		      key->tos, key->ttl, df, false);
	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			   __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct erspan_metadata *md;
	struct rtable *rt = NULL;
	bool truncate = false;
	struct flowi4 fl;
	int tunnel_hlen;
	__be16 df;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;

	/* ERSPAN uses a fixed 8-byte GRE header. */
	tunnel_hlen = 8 + sizeof(struct erspanhdr);

	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
	if (!rt)
		return;

	if (gre_handle_offloads(skb, false))
		goto err_free_rt;

	if (skb->len > dev->mtu) {
		pskb_trim(skb, dev->mtu);
		truncate = true;
	}

	md = ip_tunnel_info_opts(tun_info);
	if (!md)
		goto err_free_rt;

	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
			    ntohl(md->index), truncate);

	gre_build_header(skb, 8, TUNNEL_SEQ,
			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));

	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
		      key->tos, key->ttl, df, false);
	return;

err_free_rt:
	ip_rt_put(rt);
err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct rtable *rt;
	struct flowi4 fl4;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	rt = gre_get_rt(skb, dev, &fl4, &info->key);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);
	info->key.u.ipv4.src = fl4.saddr;
	return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (dev->header_ops) {
		/* Need space for new headers */
		if (skb_cow_head(skb, dev->needed_headroom -
				      (tunnel->hlen + sizeof(struct iphdr))))
			goto free_skb;

		tnl_params = (const struct iphdr *)skb->data;

		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
		 * to gre header.
		 */
		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
		skb_reset_mac_header(skb);
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	__gre_xmit(skb, dev, tnl_params, skb->protocol);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static inline u8 tos_to_cos(u8 tos)
{
	u8 dscp, cos;

	dscp = tos >> 2;
	cos = dscp >> 3;
	return cos;
}
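/* Worked example: for tos 0xb8 (DSCP EF), dscp = 0xb8 >> 2 = 46 and
 * cos = 46 >> 3 = 5, i.e. the CoS is simply the top three DSCP bits.
 */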
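/* ERSPAN type II header, as assembled by erspan_build_header() below
 * (field widths per the masks and offsets defined in <net/erspan.h>):
 *
 *	ver(4) | vlan(12)
 *	cos(3) | en(2) | t(1) | session_id(10)
 *	reserved(12) | index(20)
 */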
static void erspan_build_header(struct sk_buff *skb,
				__be32 id, u32 index, bool truncate)
{
	struct iphdr *iphdr = ip_hdr(skb);
	struct ethhdr *eth = eth_hdr(skb);
	enum erspan_encap_type enc_type;
	struct erspanhdr *ershdr;
	struct qtag_prefix {
		__be16 eth_type;
		__be16 tci;
	} *qp;
	u16 vlan_tci = 0;

	enc_type = ERSPAN_ENCAP_NOVLAN;

	/* If the mirrored packet has a vlan tag, extract the tci and
	 * preserve the vlan header in the mirrored frame.
	 */
	if (eth->h_proto == htons(ETH_P_8021Q)) {
		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
		vlan_tci = ntohs(qp->tci);
		enc_type = ERSPAN_ENCAP_INFRAME;
	}

	skb_push(skb, sizeof(*ershdr));
	ershdr = (struct erspanhdr *)skb->data;
	memset(ershdr, 0, sizeof(*ershdr));

	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
				 (ERSPAN_VERSION << VER_OFFSET));
	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
			   (enc_type << EN_OFFSET & EN_MASK) |
			   ((truncate << T_OFFSET) & T_MASK));
	ershdr->md.index = htonl(index & INDEX_MASK);
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
			       struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	bool truncate = false;

	if (tunnel->collect_md) {
		erspan_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, false))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	if (skb->len > dev->mtu) {
		pskb_trim(skb, dev->mtu);
		truncate = true;
	}

	/* Push ERSPAN header */
	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
	tunnel->parms.o_flags &= ~TUNNEL_KEY;
	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static int ipgre_tunnel_ioctl(struct net_device *dev,
			      struct ifreq *ifr, int cmd)
{
	int err;
	struct ip_tunnel_parm p;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;
	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
		    ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
			return -EINVAL;
	}
	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
	p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;
	return 0;
}
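/* A minimal userspace sketch of driving this ioctl (hypothetical code,
 * error handling omitted); the request is issued against a socket fd
 * with ifr_name naming the control device:
 *
 *	struct ip_tunnel_parm p = { 0 };
 *	struct ifreq ifr;
 *
 *	strcpy(p.name, "gre1");
 *	p.iph.version = 4;
 *	p.iph.ihl = 5;
 *	p.iph.protocol = IPPROTO_GRE;
 *	p.iph.daddr = remote_addr;
 *	strcpy(ifr.ifr_name, "gre0");
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(sockfd, SIOCADDTUNNEL, &ifr);
 */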
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph + 1);
	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen + sizeof(*iph);

	return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *)skb_mac_header(skb);

	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(t->net, &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (!__in_dev_get_rtnl(dev))
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;

		in_dev = inetdev_by_index(t->net, t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	int t_hlen;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

	t_hlen = tunnel->hlen + sizeof(struct iphdr);

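	/* Reserve room for the outer IP + GRE headers plus 4 bytes of slack;
	 * e.g. with no optional GRE fields gre_calc_hlen() returns 4, so
	 * t_hlen = 4 + 20 = 24 and the default mtu is 1500 - 24 - 4 = 1472.
	 */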
	dev->needed_headroom	= LL_MAX_HEADER + t_hlen + 4;
	dev->mtu		= ETH_DATA_LEN - t_hlen - 4;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported, nor
		 * can we support 2 levels of outer headers requiring
		 * an update.
		 */
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
			dev->features    |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		}

		/* Can use a lockless transmit, unless we generate
		 * output sequences
		 */
		dev->features |= NETIF_F_LLTX;
	}
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else if (!tunnel->collect_md) {
		dev->header_ops = &ipgre_header_ops;
	}

	return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, ipgre_net_id);

	ip_tunnel_delete_net(itn, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
				 struct netlink_ext_ack *extack)
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION | GRE_ROUTING))
		return -EINVAL;

	if (data[IFLA_GRE_COLLECT_METADATA] &&
	    data[IFLA_GRE_ENCAP_TYPE] &&
	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
			      struct netlink_ext_ack *extack)
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
			   struct netlink_ext_ack *extack)
{
	__be16 flags = 0;
	int ret;

	if (!data)
		return 0;

	ret = ipgre_tap_validate(tb, data, extack);
	if (ret)
		return ret;

	/* ERSPAN should only have the GRE sequence and key flags */
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (!data[IFLA_GRE_COLLECT_METADATA] &&
	    flags != (GRE_SEQ | GRE_KEY))
		return -EINVAL;

	/* The ERSPAN session ID is only 10 bits wide. Since we reuse the
	 * 32-bit key field as the ID, check its range.
	 */
	if (data[IFLA_GRE_IKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
		return -EINVAL;

	if (data[IFLA_GRE_OKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
		return -EINVAL;

	return 0;
}

static int ipgre_netlink_parms(struct net_device *dev,
			       struct nlattr *data[],
			       struct nlattr *tb[],
			       struct ip_tunnel_parm *parms,
			       __u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);

	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return 0;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
		if (t->ignore_df)
			return -EINVAL;
		parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_GRE_COLLECT_METADATA]) {
		t->collect_md = true;
		if (dev->type == ARPHRD_IPGRE)
			dev->type = ARPHRD_NONE;
	}

	if (data[IFLA_GRE_IGNORE_DF]) {
		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) &&
		    (parms->iph.frag_off & htons(IP_DF)))
			return -EINVAL;
		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
	}

	if (data[IFLA_GRE_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

	if (data[IFLA_GRE_ERSPAN_INDEX]) {
		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);

		if (t->index & ~INDEX_MASK)
			return -EINVAL;
	}

	return 0;
}
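/* These attributes map onto the iproute2 gre parameters; an illustrative
 * invocation (hypothetical addresses) exercising several of them:
 *
 *	ip link add gre1 type gre local 192.0.2.1 remote 192.0.2.2 \
 *		ttl 64 tos inherit key 42
 */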

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_GRE_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
	}

	if (data[IFLA_GRE_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
	}

	if (data[IFLA_GRE_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
	}

	if (data[IFLA_GRE_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
	}

	return ret;
}

static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

	return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen;

	tunnel->tun_hlen = 8;
	tunnel->parms.iph.protocol = IPPROTO_GRE;
	t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr);

	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
	dev->mtu = ETH_DATA_LEN - t_hlen - 4;
	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;

	return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
	.ndo_init		= erspan_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= erspan_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops	= &gre_tap_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = 0;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);

		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = t->fwmark;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_changelink(dev, tb, &p, fwmark);
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		/* IFLA_GRE_COLLECT_METADATA */
		nla_total_size(0) +
		/* IFLA_GRE_IGNORE_DF */
		nla_total_size(1) +
		/* IFLA_GRE_FWMARK */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_INDEX */
		nla_total_size(4) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))) ||
	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
		goto nla_put_failure;

	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
			t->encap.type) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
			 t->encap.sport) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
			 t->encap.dport) ||
	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
			t->encap.flags))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
		goto nla_put_failure;

	if (t->collect_md) {
		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
			goto nla_put_failure;
	}

	if (t->index)
		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
			goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->netdev_ops = &erspan_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, erspan_net_id);
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
	.kind		= "erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= erspan_setup,
	.validate	= erspan_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

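/**
 * gretap_fb_dev_create - create and register a flow-based (collect_md) gretap
 * @net: namespace to create the device in
 * @name: device name or format string (e.g. "gretap%d")
 * @name_assign_type: NET_NAME_* origin of the name
 *
 * Used by callers such as openvswitch to create an rtnl-managed gretap
 * device whose encapsulation parameters come from per-skb tunnel metadata.
 * Returns the device or an ERR_PTR().  Must be called with the rtnl lock
 * held, as rtnl_create_link() and rtnl_configure_link() require.
 */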
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	struct ip_tunnel *t;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb);
	if (IS_ERR(dev))
		return dev;

	/* Configure a flow-based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out;

	return dev;
out:
	ip_tunnel_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);

static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);

	ip_tunnel_delete_net(itn, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit = ipgre_tap_exit_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, erspan_net_id,
				  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, erspan_net_id);

	ip_tunnel_delete_net(itn, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
	.init = erspan_init_net,
	.exit = erspan_exit_net,
	.id   = &erspan_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = register_pernet_device(&erspan_net_ops);
	if (err < 0)
		goto pnet_erspan_failed;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&erspan_link_ops);
	if (err < 0)
		goto erspan_link_failed;

	return 0;

erspan_link_failed:
	rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");