// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	Linux NET3:	GRE over IP protocol decoder.
 *
 *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>

/*
   Problems & solutions
   --------------------

   1. The most important issue is detecting local dead loops.
   They would cause a complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it requires maintaining a new variable in EVERY
   skb, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since cpu migration is forbidden once we enter the first
   ndo_xmit(). We force an exit if this counter reaches RECURSION_LIMIT.

   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least, in my neighbourhood)
     return only 8 bytes of payload. That is the end of it.

   Hence, if we want OSPF to work, or traceroute to say something
   reasonable, we must search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially
   taking fragmentation into account. In short, ttl is no solution at all.

   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us; we did
   all that we could. Even if it was your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)

   Alexey Kuznetsov.
 */
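
/*
 * A back-of-the-envelope illustration of the "mtu rapidly degrades to a
 * value <68" claim above (the numbers are the usual protocol minimums,
 * not taken from this file): each self-encapsulation level adds at least
 * a 20-byte outer IPv4 header plus a 4-byte base GRE header, i.e. >=24
 * bytes. With DF forced, a loop that starts at a 1500-byte pmtu therefore
 * survives at most roughly (1500 - 68) / 24 ~= 59 nesting levels before
 * the effective tunnel mtu drops below 68 and the loop stops growing.
 */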

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
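/*
 * With permissions 0644 the parameter is also writable at runtime;
 * assuming the usual sysfs layout for module parameters, something like
 *	echo 0 > /sys/module/ip_gre/parameters/log_ecn_error
 * turns the warning off.
 */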

static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
				u32 id, u32 index,
				bool truncate, bool is_ipv4);

static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;

static int ipgre_err(struct sk_buff *skb, u32 info,
		     const struct tnl_ptk_info *tpi)
{
	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put the GRE key in the third word
	   of the GRE header. It makes it impossible to maintain even soft
	   state for keyed GRE tunnels with checksums enabled. Tell
	   them "thank you".

	   Well, I wonder, rfc1812 was written by a Cisco employee;
	   why the hell do these idiots break the standards established
	   by themselves???
	   */
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	const struct iphdr *iph;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else if (tpi->proto == htons(ETH_P_ERSPAN) ||
		 tpi->proto == htons(ETH_P_ERSPAN2))
		itn = net_generic(net, erspan_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
			     iph->daddr, iph->saddr, tpi->key);

	if (!t)
		return -ENOENT;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return 0;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return 0;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;

	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return 0;
		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
		break;

	case ICMP_REDIRECT:
		break;
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return 0;
#endif

	if (t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		return 0;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return 0;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;

	return 0;
}

static void gre_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put the GRE key in the third word
	 * of the GRE header. It makes it impossible to maintain even soft
	 * state for keyed GRE tunnels with checksums enabled. Tell
	 * them "thank you".
	 *
	 * Well, I wonder, rfc1812 was written by a Cisco employee;
	 * why the hell do these idiots break the standards established
	 * by themselves???
	 */

	const struct iphdr *iph = (struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct tnl_ptk_info tpi;

	if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
			     iph->ihl * 4) < 0)
		return;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 skb->dev->ifindex, IPPROTO_GRE);
		return;
	}
	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
			      IPPROTO_GRE);
		return;
	}

	ipgre_err(skb, info, &tpi);
}

static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		      int gre_hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct metadata_dst *tun_dst = NULL;
	struct erspan_base_hdr *ershdr;
	struct ip_tunnel_net *itn;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;
	struct erspan_md2 *md2;
	int ver;
	int len;

	itn = net_generic(net, erspan_net_id);

	iph = ip_hdr(skb);
	ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
	ver = ershdr->ver;

	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
				  tpi->flags | TUNNEL_KEY,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		len = gre_hdr_len + erspan_hdr_len(ver);
		if (unlikely(!pskb_may_pull(skb, len)))
			return PACKET_REJECT;

		if (__iptunnel_pull_header(skb,
					   len,
					   htons(ETH_P_TEB),
					   false, false) < 0)
			goto drop;

		if (tunnel->collect_md) {
			struct erspan_metadata *pkt_md, *md;
			struct ip_tunnel_info *info;
			unsigned char *gh;
			__be64 tun_id;
			__be16 flags;

			tpi->flags |= TUNNEL_KEY;
			flags = tpi->flags;
			tun_id = key32_to_tunnel_id(tpi->key);

			tun_dst = ip_tun_rx_dst(skb, flags,
						tun_id, sizeof(*md));
			if (!tun_dst)
				return PACKET_REJECT;

			/* skb can be uncloned in __iptunnel_pull_header, so
			 * old pkt_md is no longer valid and we need to reset
			 * it
			 */
			gh = skb_network_header(skb) +
			     skb_network_header_len(skb);
			pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
							    sizeof(*ershdr));
			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
			md->version = ver;
			md2 = &md->u.md2;
			memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
						       ERSPAN_V2_MDSIZE);

			info = &tun_dst->u.tun_info;
			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
			info->options_len = sizeof(*md);
		}

		skb_reset_mac_header(skb);
		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_REJECT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
	struct metadata_dst *tun_dst = NULL;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;

	iph = ip_hdr(skb);
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);

	if (tunnel) {
		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
					   raw_proto, false) < 0)
			goto drop;

		if (tunnel->dev->type != ARPHRD_NONE)
			skb_pop_mac_header(skb);
		else
			skb_reset_mac_header(skb);
		if (tunnel->collect_md) {
			__be16 flags;
			__be64 tun_id;

			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
			tun_id = key32_to_tunnel_id(tpi->key);
			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
			if (!tun_dst)
				return PACKET_REJECT;
		}

		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
		return PACKET_RCVD;
	}
	return PACKET_NEXT;

drop:
	kfree_skb(skb);
	return PACKET_RCVD;
}

static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
		     int hdr_len)
{
	struct net *net = dev_net(skb->dev);
	struct ip_tunnel_net *itn;
	int res;

	if (tpi->proto == htons(ETH_P_TEB))
		itn = net_generic(net, gre_tap_net_id);
	else
		itn = net_generic(net, ipgre_net_id);

	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
		/* ipgre tunnels in collect metadata mode should also
		 * receive ETH_P_TEB traffic.
		 */
		itn = net_generic(net, ipgre_net_id);
		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
	}
	return res;
}

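/* Entry point registered below for GREPROTO_CISCO: parse the base GRE
 * header, then dispatch to the ERSPAN or plain GRE receive path.
 */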
static int gre_rcv(struct sk_buff *skb)
{
	struct tnl_ptk_info tpi;
	bool csum_err = false;
	int hdr_len;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
	}
#endif

	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
	if (hdr_len < 0)
		goto drop;

	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
		     tpi.proto == htons(ETH_P_ERSPAN2))) {
		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
			return 0;
		goto out;
	}

	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
		return 0;

out:
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
	kfree_skb(skb);
	return 0;
}

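/* Push the GRE header (bumping the output sequence number first when
 * TUNNEL_SEQ is set) and hand the packet to the generic IP tunnel
 * transmit path.
 */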
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
		       const struct iphdr *tnl_params,
		       __be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (tunnel->parms.o_flags & TUNNEL_SEQ)
		tunnel->o_seqno++;

	/* Push GRE header. */
	gre_build_header(skb, tunnel->tun_hlen,
			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
			 htonl(tunnel->o_seqno));

	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}

static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}

static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
			__be16 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	int tunnel_hlen;
	__be16 flags;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	tunnel_hlen = gre_calc_hlen(key->tun_flags);

	if (skb_cow_head(skb, dev->needed_headroom))
		goto err_free_skb;

	/* Push Tunnel header. */
	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
		goto err_free_skb;

	flags = tun_info->key.tun_flags &
		(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
	gre_build_header(skb, tunnel_hlen, flags, proto,
			 tunnel_id_to_key32(tun_info->key.tun_id),
			 (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);

	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);

	return;

err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	struct erspan_metadata *md;
	bool truncate = false;
	__be16 proto;
	int tunnel_hlen;
	int version;
	int nhoff;
	int thoff;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto err_free_skb;

	key = &tun_info->key;
	if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
		goto err_free_skb;
	md = ip_tunnel_info_opts(tun_info);
	if (!md)
		goto err_free_skb;

	/* ERSPAN has a fixed 8-byte GRE header */
	version = md->version;
	tunnel_hlen = 8 + erspan_hdr_len(version);

	if (skb_cow_head(skb, dev->needed_headroom))
		goto err_free_skb;

	if (gre_handle_offloads(skb, false))
		goto err_free_skb;

	if (skb->len > dev->mtu + dev->hard_header_len) {
		pskb_trim(skb, dev->mtu + dev->hard_header_len);
		truncate = true;
	}

	nhoff = skb_network_header(skb) - skb_mac_header(skb);
	if (skb->protocol == htons(ETH_P_IP) &&
	    (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
		truncate = true;

	thoff = skb_transport_header(skb) - skb_mac_header(skb);
	if (skb->protocol == htons(ETH_P_IPV6) &&
	    (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
		truncate = true;

	if (version == 1) {
		erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
				    ntohl(md->u.index), truncate, true);
		proto = htons(ETH_P_ERSPAN);
	} else if (version == 2) {
		erspan_build_header_v2(skb,
				       ntohl(tunnel_id_to_key32(key->tun_id)),
				       md->u.md2.dir,
				       get_hwid(&md->u.md2),
				       truncate, true);
		proto = htons(ETH_P_ERSPAN2);
	} else {
		goto err_free_skb;
	}

	gre_build_header(skb, 8, TUNNEL_SEQ,
			 proto, 0, htonl(tunnel->o_seqno++));

	ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);

	return;

err_free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
}

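/* ndo_fill_metadata_dst: run a route lookup for the tunnel key so that
 * collect_md users (such as openvswitch) learn the local source address
 * before transmitting.
 */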
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	const struct ip_tunnel_key *key;
	struct rtable *rt;
	struct flowi4 fl4;

	if (ip_tunnel_info_af(info) != AF_INET)
		return -EINVAL;

	key = &info->key;
	ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), key->tos, 0,
			    skb->mark, skb_get_hash(skb));
	rt = ip_route_output_key(dev_net(dev), &fl4);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	ip_rt_put(rt);
	info->key.u.ipv4.src = fl4.saddr;
	return 0;
}

static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
			      struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *tnl_params;

	if (!pskb_inet_may_pull(skb))
		goto free_skb;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, skb->protocol);
		return NETDEV_TX_OK;
	}

	if (dev->header_ops) {
		/* Need space for new headers */
		if (skb_cow_head(skb, dev->needed_headroom -
				      (tunnel->hlen + sizeof(struct iphdr))))
			goto free_skb;

		tnl_params = (const struct iphdr *)skb->data;

		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
		 * to gre header.
		 */
		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
		skb_reset_mac_header(skb);
	} else {
		if (skb_cow_head(skb, dev->needed_headroom))
			goto free_skb;

		tnl_params = &tunnel->parms.iph;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	__gre_xmit(skb, dev, tnl_params, skb->protocol);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t erspan_xmit(struct sk_buff *skb,
			       struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	bool truncate = false;
	__be16 proto;

	if (!pskb_inet_may_pull(skb))
		goto free_skb;

	if (tunnel->collect_md) {
		erspan_fb_xmit(skb, dev);
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, false))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	if (skb->len > dev->mtu + dev->hard_header_len) {
		pskb_trim(skb, dev->mtu + dev->hard_header_len);
		truncate = true;
	}

	/* Push ERSPAN header */
	if (tunnel->erspan_ver == 1) {
		erspan_build_header(skb, ntohl(tunnel->parms.o_key),
				    tunnel->index,
				    truncate, true);
		proto = htons(ETH_P_ERSPAN);
	} else if (tunnel->erspan_ver == 2) {
		erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
				       tunnel->dir, tunnel->hwid,
				       truncate, true);
		proto = htons(ETH_P_ERSPAN2);
	} else {
		goto free_skb;
	}

	tunnel->parms.o_flags &= ~TUNNEL_KEY;
	__gre_xmit(skb, dev, &tunnel->parms.iph, proto);
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
				struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (!pskb_inet_may_pull(skb))
		goto free_skb;

	if (tunnel->collect_md) {
		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
		return NETDEV_TX_OK;
	}

	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
		goto free_skb;

	if (skb_cow_head(skb, dev->needed_headroom))
		goto free_skb;

	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
	return NETDEV_TX_OK;

free_skb:
	kfree_skb(skb);
	dev->stats.tx_dropped++;
	return NETDEV_TX_OK;
}

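/* Recompute the GRE header length after o_flags changed, propagate the
 * delta to the total tunnel header length and the needed headroom, and
 * optionally shrink the MTU accordingly (never below 68).
 */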
static void ipgre_link_update(struct net_device *dev, bool set_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int len;

	len = tunnel->tun_hlen;
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	len = tunnel->tun_hlen - len;
	tunnel->hlen = tunnel->hlen + len;

	dev->needed_headroom = dev->needed_headroom + len;
	if (set_mtu)
		dev->mtu = max_t(int, dev->mtu - len, 68);

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
			dev->features |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		} else {
			dev->features &= ~NETIF_F_GSO_SOFTWARE;
			dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		}
		dev->features |= NETIF_F_LLTX;
	} else {
		dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
		dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
	}
}

static int ipgre_tunnel_ioctl(struct net_device *dev,
			      struct ifreq *ifr, int cmd)
{
	struct ip_tunnel_parm p;
	int err;

	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
		return -EFAULT;

	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
		    ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
			return -EINVAL;
	}

	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);

	err = ip_tunnel_ioctl(dev, &p, cmd);
	if (err)
		return err;

	if (cmd == SIOCCHGTUNNEL) {
		struct ip_tunnel *t = netdev_priv(dev);

		t->parms.i_flags = p.i_flags;
		t->parms.o_flags = p.o_flags;

		if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
			ipgre_link_update(dev, true);
	}

	p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
	p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);

	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
		return -EFAULT;

	return 0;
}

/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.

   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).

   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
   with broadcast 224.66.66.66. If you have access to the mbone, play with me :-)

   ping -t 255 224.66.66.66

   If nobody answers, the mbone does not work.

   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
   ip addr add 10.66.66.<somewhat>/24 dev Universe
   ifconfig Universe up
   ifconfig Universe add fe80::<Your_real_addr>/10
   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
   ftp 10.66.66.66
   ...
   ftp fec0:6666:6666::193.233.7.65
   ...
 */
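
/* For reference, the deprecated ifconfig invocations in the recipe above
 * map onto iproute2 roughly as follows (a sketch; the addresses are the
 * same placeholders as above):
 *
 *	ip link set Universe up
 *	ip addr add fe80::<Your_real_addr>/10 dev Universe
 *	ip addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
 */
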
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned int len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph;
	struct gre_base_hdr *greh;

	iph = skb_push(skb, t->hlen + sizeof(*iph));
	greh = (struct gre_base_hdr *)(iph+1);
	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
	greh->protocol = htons(type);

	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));

	/* Set the source hardware address. */
	if (saddr)
		memcpy(&iph->saddr, saddr, 4);
	if (daddr)
		memcpy(&iph->daddr, daddr, 4);
	if (iph->daddr)
		return t->hlen + sizeof(*iph);

	return -(t->hlen + sizeof(*iph));
}

static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);

	memcpy(haddr, &iph->saddr, 4);
	return 4;
}

static const struct header_ops ipgre_header_ops = {
	.create	= ipgre_header,
	.parse	= ipgre_header_parse,
};

#ifdef CONFIG_NET_IPGRE_BROADCAST
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi4 fl4;
		struct rtable *rt;

		rt = ip_route_output_gre(t->net, &fl4,
					 t->parms.iph.daddr,
					 t->parms.iph.saddr,
					 t->parms.o_key,
					 RT_TOS(t->parms.iph.tos),
					 t->parms.link);
		if (IS_ERR(rt))
			return -EADDRNOTAVAIL;
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (!__in_dev_get_rtnl(dev))
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}

static int ipgre_close(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
		struct in_device *in_dev;

		in_dev = inetdev_by_index(t->net, t->mlink);
		if (in_dev)
			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
	}
	return 0;
}
#endif

static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init		= ipgre_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open		= ipgre_open,
	.ndo_stop		= ipgre_close,
#endif
	.ndo_start_xmit		= ipgre_xmit,
	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
};

#define GRE_FEATURES (NETIF_F_SG |		\
		      NETIF_F_FRAGLIST |	\
		      NETIF_F_HIGHDMA |		\
		      NETIF_F_HW_CSUM)

static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipgre_netdev_ops;
	dev->type		= ARPHRD_IPGRE;
	ip_tunnel_setup(dev, ipgre_net_id);
}

static void __gre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;

	tunnel = netdev_priv(dev);
	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
	tunnel->parms.iph.protocol = IPPROTO_GRE;

	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;

	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
		/* TCP offload with GRE SEQ is not supported, nor
		 * can we support 2 levels of outer headers requiring
		 * an update.
		 */
		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
			dev->features    |= NETIF_F_GSO_SOFTWARE;
			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
		}

		/* Can use a lockless transmit, unless we generate
		 * output sequences
		 */
		dev->features |= NETIF_F_LLTX;
	}
}

static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;

	__gre_tunnel_init(dev);

	memcpy(dev->dev_addr, &iph->saddr, 4);
	memcpy(dev->broadcast, &iph->daddr, 4);

	dev->flags		= IFF_NOARP;
	netif_keep_dst(dev);
	dev->addr_len		= 4;

	if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else if (!tunnel->collect_md) {
		dev->header_ops = &ipgre_header_ops;
	}

	return ip_tunnel_init(dev);
}

static const struct gre_protocol ipgre_protocol = {
	.handler     = gre_rcv,
	.err_handler = gre_err,
};

static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit_batch = ipgre_exit_batch_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
				 struct netlink_ext_ack *extack)
{
	__be16 flags;

	if (!data)
		return 0;

	flags = 0;
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (flags & (GRE_VERSION|GRE_ROUTING))
		return -EINVAL;

	if (data[IFLA_GRE_COLLECT_METADATA] &&
	    data[IFLA_GRE_ENCAP_TYPE] &&
	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
		return -EINVAL;

	return 0;
}

static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
			      struct netlink_ext_ack *extack)
{
	__be32 daddr;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		goto out;

	if (data[IFLA_GRE_REMOTE]) {
		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
		if (!daddr)
			return -EINVAL;
	}

out:
	return ipgre_tunnel_validate(tb, data, extack);
}

static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
			   struct netlink_ext_ack *extack)
{
	__be16 flags = 0;
	int ret;

	if (!data)
		return 0;

	ret = ipgre_tap_validate(tb, data, extack);
	if (ret)
		return ret;

	/* ERSPAN should only have the GRE sequence and key flags */
	if (data[IFLA_GRE_OFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
	if (data[IFLA_GRE_IFLAGS])
		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
	if (!data[IFLA_GRE_COLLECT_METADATA] &&
	    flags != (GRE_SEQ | GRE_KEY))
		return -EINVAL;

	/* The ERSPAN session ID is only 10 bits. Since we reuse the
	 * 32-bit key field as the ID, check its range.
	 */
	if (data[IFLA_GRE_IKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
		return -EINVAL;

	if (data[IFLA_GRE_OKEY] &&
	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
		return -EINVAL;

	return 0;
}
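
/* A valid non-metadata v1 configuration therefore needs exactly the GRE
 * sequence and key flags, a key (session ID) within the 10-bit ID_MASK
 * and an index within INDEX_MASK; with iproute2 that is roughly (a
 * sketch, values chosen for illustration):
 *
 *	ip link add dev erspan1 type erspan seq key 100 \
 *		local 10.0.0.1 remote 10.0.0.2 erspan_ver 1 erspan 123
 */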

static int ipgre_netlink_parms(struct net_device *dev,
				struct nlattr *data[],
				struct nlattr *tb[],
				struct ip_tunnel_parm *parms,
				__u32 *fwmark)
{
	struct ip_tunnel *t = netdev_priv(dev);

	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_GRE;

	if (!data)
		return 0;

	if (data[IFLA_GRE_LINK])
		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);

	if (data[IFLA_GRE_IFLAGS])
		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));

	if (data[IFLA_GRE_OFLAGS])
		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));

	if (data[IFLA_GRE_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);

	if (data[IFLA_GRE_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);

	if (data[IFLA_GRE_LOCAL])
		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);

	if (data[IFLA_GRE_REMOTE])
		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);

	if (data[IFLA_GRE_TTL])
		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);

	if (data[IFLA_GRE_TOS])
		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);

	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
		if (t->ignore_df)
			return -EINVAL;
		parms->iph.frag_off = htons(IP_DF);
	}

	if (data[IFLA_GRE_COLLECT_METADATA]) {
		t->collect_md = true;
		if (dev->type == ARPHRD_IPGRE)
			dev->type = ARPHRD_NONE;
	}

	if (data[IFLA_GRE_IGNORE_DF]) {
		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
		  && (parms->iph.frag_off & htons(IP_DF)))
			return -EINVAL;
		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
	}

	if (data[IFLA_GRE_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);

	if (data[IFLA_GRE_ERSPAN_VER]) {
		t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);

		if (t->erspan_ver != 1 && t->erspan_ver != 2)
			return -EINVAL;
	}

	if (t->erspan_ver == 1) {
		if (data[IFLA_GRE_ERSPAN_INDEX]) {
			t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
			if (t->index & ~INDEX_MASK)
				return -EINVAL;
		}
	} else if (t->erspan_ver == 2) {
		if (data[IFLA_GRE_ERSPAN_DIR]) {
			t->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
			if (t->dir & ~(DIR_MASK >> DIR_OFFSET))
				return -EINVAL;
		}
		if (data[IFLA_GRE_ERSPAN_HWID]) {
			t->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
			if (t->hwid & ~(HWID_MASK >> HWID_OFFSET))
				return -EINVAL;
		}
	}

	return 0;
}

/* This function returns true when ENCAP attributes are present in the nl msg */
static bool ipgre_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool ret = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return ret;

	if (data[IFLA_GRE_ENCAP_TYPE]) {
		ret = true;
		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
	}

	if (data[IFLA_GRE_ENCAP_FLAGS]) {
		ret = true;
		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
	}

	if (data[IFLA_GRE_ENCAP_SPORT]) {
		ret = true;
		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
	}

	if (data[IFLA_GRE_ENCAP_DPORT]) {
		ret = true;
		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
	}

	return ret;
}

static int gre_tap_init(struct net_device *dev)
{
	__gre_tunnel_init(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops gre_tap_netdev_ops = {
	.ndo_init		= gre_tap_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= gre_tap_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static int erspan_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->tun_hlen = 8;
	tunnel->parms.iph.protocol = IPPROTO_GRE;
	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
		       erspan_hdr_len(tunnel->erspan_ver);

	dev->features		|= GRE_FEATURES;
	dev->hw_features	|= GRE_FEATURES;
	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
	netif_keep_dst(dev);

	return ip_tunnel_init(dev);
}

static const struct net_device_ops erspan_netdev_ops = {
	.ndo_init		= erspan_tunnel_init,
	.ndo_uninit		= ip_tunnel_uninit,
	.ndo_start_xmit		= erspan_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_change_mtu		= ip_tunnel_change_mtu,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_get_iflink		= ip_tunnel_get_iflink,
	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
};

static void ipgre_tap_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->max_mtu = 0;
	dev->netdev_ops	= &gre_tap_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, gre_tap_net_id);
}

static int ipgre_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
{
	struct ip_tunnel_parm p;
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = 0;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		struct ip_tunnel *t = netdev_priv(dev);

		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;
	return ip_tunnel_newlink(dev, tb, &p, fwmark);
}

static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_encap ipencap;
	__u32 fwmark = t->fwmark;
	struct ip_tunnel_parm p;
	int err;

	if (ipgre_netlink_encap_parms(data, &ipencap)) {
		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
	if (err < 0)
		return err;

	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
	if (err < 0)
		return err;

	t->parms.i_flags = p.i_flags;
	t->parms.o_flags = p.o_flags;

	if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
		ipgre_link_update(dev, !tb[IFLA_MTU]);

	return 0;
}

static size_t ipgre_get_size(const struct net_device *dev)
{
	return
		/* IFLA_GRE_LINK */
		nla_total_size(4) +
		/* IFLA_GRE_IFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_OFLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_IKEY */
		nla_total_size(4) +
		/* IFLA_GRE_OKEY */
		nla_total_size(4) +
		/* IFLA_GRE_LOCAL */
		nla_total_size(4) +
		/* IFLA_GRE_REMOTE */
		nla_total_size(4) +
		/* IFLA_GRE_TTL */
		nla_total_size(1) +
		/* IFLA_GRE_TOS */
		nla_total_size(1) +
		/* IFLA_GRE_PMTUDISC */
		nla_total_size(1) +
		/* IFLA_GRE_ENCAP_TYPE */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_FLAGS */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_SPORT */
		nla_total_size(2) +
		/* IFLA_GRE_ENCAP_DPORT */
		nla_total_size(2) +
		/* IFLA_GRE_COLLECT_METADATA */
		nla_total_size(0) +
		/* IFLA_GRE_IGNORE_DF */
		nla_total_size(1) +
		/* IFLA_GRE_FWMARK */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_INDEX */
		nla_total_size(4) +
		/* IFLA_GRE_ERSPAN_VER */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_DIR */
		nla_total_size(1) +
		/* IFLA_GRE_ERSPAN_HWID */
		nla_total_size(2) +
		0;
}

static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;
	__be16 o_flags = p->o_flags;

	if (t->erspan_ver == 1 || t->erspan_ver == 2) {
		if (!t->collect_md)
			o_flags |= TUNNEL_KEY;

		if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
			goto nla_put_failure;

		if (t->erspan_ver == 1) {
			if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
				goto nla_put_failure;
		} else {
			if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
				goto nla_put_failure;
			if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
				goto nla_put_failure;
		}
	}

	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
			 gre_tnl_flags_to_gre_flags(o_flags)) ||
	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
		       !!(p->iph.frag_off & htons(IP_DF))) ||
	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
		goto nla_put_failure;

	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
			t->encap.type) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
			 t->encap.sport) ||
	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
			 t->encap.dport) ||
	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
			t->encap.flags))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
		goto nla_put_failure;

	if (t->collect_md) {
		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
			goto nla_put_failure;
	}

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static void erspan_setup(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	ether_setup(dev);
	dev->netdev_ops = &erspan_netdev_ops;
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	ip_tunnel_setup(dev, erspan_net_id);
	t->erspan_ver = 1;
}

static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
	[IFLA_GRE_ERSPAN_VER]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_DIR]	= { .type = NLA_U8 },
	[IFLA_GRE_ERSPAN_HWID]	= { .type = NLA_U16 },
};

static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

static struct rtnl_link_ops erspan_link_ops __read_mostly = {
	.kind		= "erspan",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= erspan_setup,
	.validate	= erspan_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
	.get_link_net	= ip_tunnel_get_link_net,
};

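/* Create a flow-based (collect_md) gretap device on behalf of callers
 * such as openvswitch; the MTU is lifted to IP_MAX_MTU below because
 * such users expect packet sizes to be unrestricted.
 */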
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *dev;
	LIST_HEAD(list_kill);
	struct ip_tunnel *t;
	int err;

	memset(&tb, 0, sizeof(tb));

	dev = rtnl_create_link(net, name, name_assign_type,
			       &ipgre_tap_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	/* Configure flow based GRE device. */
	t = netdev_priv(dev);
	t->collect_md = true;

	err = ipgre_newlink(net, dev, tb, NULL, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
	if (err)
		goto out;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out;

	return dev;
out:
	ip_tunnel_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);

static int __net_init ipgre_tap_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}

static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
{
	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
}

static struct pernet_operations ipgre_tap_net_ops = {
	.init = ipgre_tap_init_net,
	.exit_batch = ipgre_tap_exit_batch_net,
	.id   = &gre_tap_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __net_init erspan_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, erspan_net_id,
				  &erspan_link_ops, "erspan0");
}

static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
{
	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
}

static struct pernet_operations erspan_net_ops = {
	.init = erspan_init_net,
	.exit_batch = erspan_exit_batch_net,
	.id   = &erspan_net_id,
	.size = sizeof(struct ip_tunnel_net),
};

static int __init ipgre_init(void)
{
	int err;

	pr_info("GRE over IPv4 tunneling driver\n");

	err = register_pernet_device(&ipgre_net_ops);
	if (err < 0)
		return err;

	err = register_pernet_device(&ipgre_tap_net_ops);
	if (err < 0)
		goto pnet_tap_failed;

	err = register_pernet_device(&erspan_net_ops);
	if (err < 0)
		goto pnet_erspan_failed;

	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
	if (err < 0) {
		pr_info("%s: can't add protocol\n", __func__);
		goto add_proto_failed;
	}

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

	err = rtnl_link_register(&erspan_link_ops);
	if (err < 0)
		goto erspan_link_failed;

	return 0;

erspan_link_failed:
	rtnl_link_unregister(&ipgre_tap_ops);
tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
	unregister_pernet_device(&erspan_net_ops);
pnet_erspan_failed:
	unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_failed:
	unregister_pernet_device(&ipgre_net_ops);
	return err;
}

static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	rtnl_link_unregister(&erspan_link_ops);
	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
	unregister_pernet_device(&ipgre_tap_net_ops);
	unregister_pernet_device(&ipgre_net_ops);
	unregister_pernet_device(&erspan_net_ops);
}

module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
MODULE_ALIAS_RTNL_LINK("erspan");
MODULE_ALIAS_NETDEV("gre0");
MODULE_ALIAS_NETDEV("gretap0");
MODULE_ALIAS_NETDEV("erspan0");