xref: /openbmc/linux/net/ipv4/ip_gre.c (revision bf070bb0)
1 /*
2  *	Linux NET3:	GRE over IP protocol decoder.
3  *
4  *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *	This program is free software; you can redistribute it and/or
7  *	modify it under the terms of the GNU General Public License
8  *	as published by the Free Software Foundation; either version
9  *	2 of the License, or (at your option) any later version.
10  *
11  */
12 
13 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14 
15 #include <linux/capability.h>
16 #include <linux/module.h>
17 #include <linux/types.h>
18 #include <linux/kernel.h>
19 #include <linux/slab.h>
20 #include <linux/uaccess.h>
21 #include <linux/skbuff.h>
22 #include <linux/netdevice.h>
23 #include <linux/in.h>
24 #include <linux/tcp.h>
25 #include <linux/udp.h>
26 #include <linux/if_arp.h>
27 #include <linux/if_vlan.h>
28 #include <linux/init.h>
29 #include <linux/in6.h>
30 #include <linux/inetdevice.h>
31 #include <linux/igmp.h>
32 #include <linux/netfilter_ipv4.h>
33 #include <linux/etherdevice.h>
34 #include <linux/if_ether.h>
35 
36 #include <net/sock.h>
37 #include <net/ip.h>
38 #include <net/icmp.h>
39 #include <net/protocol.h>
40 #include <net/ip_tunnels.h>
41 #include <net/arp.h>
42 #include <net/checksum.h>
43 #include <net/dsfield.h>
44 #include <net/inet_ecn.h>
45 #include <net/xfrm.h>
46 #include <net/net_namespace.h>
47 #include <net/netns/generic.h>
48 #include <net/rtnetlink.h>
49 #include <net/gre.h>
50 #include <net/dst_metadata.h>
51 #include <net/erspan.h>
52 
53 /*
54    Problems & solutions
55    --------------------
56 
   1. The most important issue is detecting local dead loops.
   They would cause complete host lockup in transmit, which
   would be "resolved" by stack overflow or, if queueing is enabled,
   by infinite looping in net_bh.

   We cannot track such dead loops during route installation;
   it is an infeasible task. The most general solution would be
   to keep an skb->encapsulation counter (a sort of local ttl)
   and silently drop the packet when it expires. It is a good
   solution, but it supposes maintaining a new variable in ALL
   skbs, even if no tunneling is used.

   Current solution: xmit_recursion breaks dead loops. This is a percpu
   counter, since cpu migration is forbidden once we enter the first
   ndo_xmit(). We force an exit if this counter reaches RECURSION_LIMIT.
72 
   2. Networking dead loops would not kill routers, but would really
   kill the network. The IP hop limit plays the role of "t->recursion"
   in this case, if we copy it from the packet being encapsulated to the
   upper header. It is a very good solution, but it introduces two problems:

   - Routing protocols that use packets with ttl=1 (OSPF, RIP2)
     do not work over tunnels.
   - traceroute does not work. I planned to relay ICMP from the tunnel,
     so that this problem would be solved and traceroute output
     would be even more informative. This idea appeared to be wrong:
     only Linux complies with rfc1812 now (yes, guys, Linux is the only
     true router now :-)); all routers (at least in my neighbourhood)
     return only 8 bytes of payload. It is the end.

   Hence, if we want OSPF to work or traceroute to say something
   reasonable, we should search for another solution.

   One of them is to parse the packet, trying to detect inner encapsulation
   made by our node. It is difficult or even impossible, especially
   taking into account fragmentation. To be short, ttl is not a solution at all.
93 
   Current solution: The solution was UNEXPECTEDLY SIMPLE.
   We force the DF flag on tunnels with a preconfigured hop limit,
   that is ALL. :-) Well, it does not remove the problem completely,
   but exponential growth of network traffic is changed to linear
   (branches that exceed the pmtu are pruned) and the tunnel mtu
   rapidly degrades to a value <68, where looping stops.
   Yes, it is not good if there exists a router in the loop
   which does not force DF, even when encapsulated packets have DF set.
   But it is not our problem! Nobody could accuse us, we made
   all that we could make. Even if it is your gated that injected
   the fatal route into the network, even if it was you who configured
   the fatal static route: you are innocent. :-)
106 
107    Alexey Kuznetsov.
108  */
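
/* Editor's illustration of the DF trick above (not part of the original
 * commentary): each GRE-in-IPv4 nesting costs a 20-byte IPv4 header plus
 * at least a 4-byte GRE base header, i.e. >= 24 bytes per pass around the
 * loop. With DF forced, path MTU discovery shrinks the usable tunnel MTU
 * by that amount on every iteration, so starting from an Ethernet MTU:
 *
 *	1500 - 24 * k < 68   =>   k > 59
 *
 * after roughly 60 iterations the tunnel MTU falls below the IPv4 minimum
 * of 68 and the loop starves instead of growing exponentially.
 */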
109 
110 static bool log_ecn_error = true;
111 module_param(log_ecn_error, bool, 0644);
112 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
113 
114 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
115 static int ipgre_tunnel_init(struct net_device *dev);
116 static void erspan_build_header(struct sk_buff *skb,
117 				__be32 id, u32 index, bool truncate);
118 
119 static unsigned int ipgre_net_id __read_mostly;
120 static unsigned int gre_tap_net_id __read_mostly;
121 static unsigned int erspan_net_id __read_mostly;
122 
123 static void ipgre_err(struct sk_buff *skb, u32 info,
124 		      const struct tnl_ptk_info *tpi)
125 {
126 
	/* All the routers (except for Linux) return only
	   8 bytes of packet payload. It means that precise relaying of
	   ICMP in the real Internet is absolutely infeasible.

	   Moreover, Cisco "wise men" put the GRE key in the third word
	   of the GRE header. It makes it impossible to maintain even soft
	   state for keyed GRE tunnels with checksums enabled. Tell
	   them "thank you".

	   Well, I wonder: rfc1812 was written by a Cisco employee;
	   why the hell do these idiots break standards established
	   by themselves???
	   */
140 	struct net *net = dev_net(skb->dev);
141 	struct ip_tunnel_net *itn;
142 	const struct iphdr *iph;
143 	const int type = icmp_hdr(skb)->type;
144 	const int code = icmp_hdr(skb)->code;
145 	unsigned int data_len = 0;
146 	struct ip_tunnel *t;
147 
148 	switch (type) {
149 	default:
150 	case ICMP_PARAMETERPROB:
151 		return;
152 
153 	case ICMP_DEST_UNREACH:
154 		switch (code) {
155 		case ICMP_SR_FAILED:
156 		case ICMP_PORT_UNREACH:
157 			/* Impossible event. */
158 			return;
159 		default:
160 			/* All others are translated to HOST_UNREACH.
161 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
162 			   I believe they are just ether pollution. --ANK
163 			 */
164 			break;
165 		}
166 		break;
167 
168 	case ICMP_TIME_EXCEEDED:
169 		if (code != ICMP_EXC_TTL)
170 			return;
171 		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
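		/* Illustration: an RFC 4884 length field of 17 means the
		 * embedded datagram occupies 17 * 4 == 68 bytes of the
		 * ICMP payload.
		 */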
172 		break;
173 
174 	case ICMP_REDIRECT:
175 		break;
176 	}
177 
178 	if (tpi->proto == htons(ETH_P_TEB))
179 		itn = net_generic(net, gre_tap_net_id);
180 	else
181 		itn = net_generic(net, ipgre_net_id);
182 
183 	iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
184 	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
185 			     iph->daddr, iph->saddr, tpi->key);
186 
187 	if (!t)
188 		return;
189 
190 #if IS_ENABLED(CONFIG_IPV6)
	if (tpi->proto == htons(ETH_P_IPV6) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
					type, data_len))
		return;
195 #endif
196 
197 	if (t->parms.iph.daddr == 0 ||
198 	    ipv4_is_multicast(t->parms.iph.daddr))
199 		return;
200 
201 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
202 		return;
203 
204 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
205 		t->err_count++;
206 	else
207 		t->err_count = 1;
208 	t->err_time = jiffies;
209 }
210 
211 static void gre_err(struct sk_buff *skb, u32 info)
212 {
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload. It means that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 *
	 * Moreover, Cisco "wise men" put the GRE key in the third word
	 * of the GRE header. It makes it impossible to maintain even
	 * soft state for keyed GRE tunnels with checksums enabled.
	 * Tell them "thank you".
	 *
	 * Well, I wonder: rfc1812 was written by a Cisco employee;
	 * why the hell do these idiots break standards established
	 * by themselves???
	 */
226 
227 	const struct iphdr *iph = (struct iphdr *)skb->data;
228 	const int type = icmp_hdr(skb)->type;
229 	const int code = icmp_hdr(skb)->code;
230 	struct tnl_ptk_info tpi;
231 	bool csum_err = false;
232 
233 	if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
234 			     iph->ihl * 4) < 0) {
		if (!csum_err)	/* csum errors are tolerated here; bail on other parse failures. */
236 			return;
237 	}
238 
239 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
240 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
241 				 skb->dev->ifindex, 0, IPPROTO_GRE, 0);
242 		return;
243 	}
244 	if (type == ICMP_REDIRECT) {
245 		ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
246 			      IPPROTO_GRE, 0);
247 		return;
248 	}
249 
250 	ipgre_err(skb, info, &tpi);
251 }
252 
253 static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
254 		      int gre_hdr_len)
255 {
256 	struct net *net = dev_net(skb->dev);
257 	struct metadata_dst *tun_dst = NULL;
258 	struct ip_tunnel_net *itn;
259 	struct ip_tunnel *tunnel;
260 	struct erspanhdr *ershdr;
261 	const struct iphdr *iph;
262 	__be32 index;
263 	int len;
264 
265 	itn = net_generic(net, erspan_net_id);
266 	len = gre_hdr_len + sizeof(*ershdr);
267 
268 	if (unlikely(!pskb_may_pull(skb, len)))
269 		return -ENOMEM;
270 
271 	iph = ip_hdr(skb);
272 	ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len);
273 
	/* The original GRE header does not have a key field.
	 * Use the ERSPAN 10-bit session ID as the key.
	 */
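	/* Illustration (assuming ID_MASK is the 10-bit mask 0x03ff):
	 * a session_id field carrying 0x1234 yields
	 * tpi->key == cpu_to_be32(0x1234 & 0x03ff) == cpu_to_be32(0x0234).
	 */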
277 	tpi->key = cpu_to_be32(ntohs(ershdr->session_id) & ID_MASK);
278 	index = ershdr->md.index;
279 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
280 				  tpi->flags | TUNNEL_KEY,
281 				  iph->saddr, iph->daddr, tpi->key);
282 
283 	if (tunnel) {
284 		if (__iptunnel_pull_header(skb,
285 					   gre_hdr_len + sizeof(*ershdr),
286 					   htons(ETH_P_TEB),
287 					   false, false) < 0)
288 			goto drop;
289 
290 		if (tunnel->collect_md) {
291 			struct ip_tunnel_info *info;
292 			struct erspan_metadata *md;
293 			__be64 tun_id;
294 			__be16 flags;
295 
296 			tpi->flags |= TUNNEL_KEY;
297 			flags = tpi->flags;
298 			tun_id = key32_to_tunnel_id(tpi->key);
299 
300 			tun_dst = ip_tun_rx_dst(skb, flags,
301 						tun_id, sizeof(*md));
302 			if (!tun_dst)
303 				return PACKET_REJECT;
304 
305 			md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
306 			if (!md)
307 				return PACKET_REJECT;
308 
309 			md->index = index;
310 			info = &tun_dst->u.tun_info;
311 			info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
312 			info->options_len = sizeof(*md);
313 		} else {
314 			tunnel->index = ntohl(index);
315 		}
316 
317 		skb_reset_mac_header(skb);
318 		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
319 		return PACKET_RCVD;
320 	}
321 drop:
322 	kfree_skb(skb);
323 	return PACKET_RCVD;
324 }
325 
326 static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
327 		       struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
328 {
329 	struct metadata_dst *tun_dst = NULL;
330 	const struct iphdr *iph;
331 	struct ip_tunnel *tunnel;
332 
333 	iph = ip_hdr(skb);
334 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
335 				  iph->saddr, iph->daddr, tpi->key);
336 
337 	if (tunnel) {
338 		if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
339 					   raw_proto, false) < 0)
340 			goto drop;
341 
342 		if (tunnel->dev->type != ARPHRD_NONE)
343 			skb_pop_mac_header(skb);
344 		else
345 			skb_reset_mac_header(skb);
346 		if (tunnel->collect_md) {
347 			__be16 flags;
348 			__be64 tun_id;
349 
350 			flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
351 			tun_id = key32_to_tunnel_id(tpi->key);
352 			tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
353 			if (!tun_dst)
354 				return PACKET_REJECT;
355 		}
356 
357 		ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
358 		return PACKET_RCVD;
359 	}
360 	return PACKET_NEXT;
361 
362 drop:
363 	kfree_skb(skb);
364 	return PACKET_RCVD;
365 }
366 
367 static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
368 		     int hdr_len)
369 {
370 	struct net *net = dev_net(skb->dev);
371 	struct ip_tunnel_net *itn;
372 	int res;
373 
374 	if (tpi->proto == htons(ETH_P_TEB))
375 		itn = net_generic(net, gre_tap_net_id);
376 	else
377 		itn = net_generic(net, ipgre_net_id);
378 
379 	res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
380 	if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
		/* ipgre tunnels in collect metadata mode should also
		 * receive ETH_P_TEB traffic.
		 */
384 		itn = net_generic(net, ipgre_net_id);
385 		res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
386 	}
387 	return res;
388 }
389 
390 static int gre_rcv(struct sk_buff *skb)
391 {
392 	struct tnl_ptk_info tpi;
393 	bool csum_err = false;
394 	int hdr_len;
395 
396 #ifdef CONFIG_NET_IPGRE_BROADCAST
397 	if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
398 		/* Looped back packet, drop it! */
399 		if (rt_is_output_route(skb_rtable(skb)))
400 			goto drop;
401 	}
402 #endif
403 
404 	hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
405 	if (hdr_len < 0)
406 		goto drop;
407 
408 	if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) {
409 		if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
410 			return 0;
411 	}
412 
413 	if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
414 		return 0;
415 
416 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
417 drop:
418 	kfree_skb(skb);
419 	return 0;
420 }
421 
422 static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
423 		       const struct iphdr *tnl_params,
424 		       __be16 proto)
425 {
426 	struct ip_tunnel *tunnel = netdev_priv(dev);
427 
428 	if (tunnel->parms.o_flags & TUNNEL_SEQ)
429 		tunnel->o_seqno++;
430 
431 	/* Push GRE header. */
432 	gre_build_header(skb, tunnel->tun_hlen,
433 			 tunnel->parms.o_flags, proto, tunnel->parms.o_key,
434 			 htonl(tunnel->o_seqno));
435 
436 	ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
437 }
438 
439 static int gre_handle_offloads(struct sk_buff *skb, bool csum)
440 {
441 	return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
442 }
443 
444 static struct rtable *gre_get_rt(struct sk_buff *skb,
445 				 struct net_device *dev,
446 				 struct flowi4 *fl,
447 				 const struct ip_tunnel_key *key)
448 {
449 	struct net *net = dev_net(dev);
450 
451 	memset(fl, 0, sizeof(*fl));
452 	fl->daddr = key->u.ipv4.dst;
453 	fl->saddr = key->u.ipv4.src;
454 	fl->flowi4_tos = RT_TOS(key->tos);
455 	fl->flowi4_mark = skb->mark;
456 	fl->flowi4_proto = IPPROTO_GRE;
457 
458 	return ip_route_output_key(net, fl);
459 }
460 
461 static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
462 				      struct net_device *dev,
463 				      struct flowi4 *fl,
464 				      int tunnel_hlen)
465 {
466 	struct ip_tunnel_info *tun_info;
467 	const struct ip_tunnel_key *key;
468 	struct rtable *rt = NULL;
469 	int min_headroom;
470 	bool use_cache;
471 	int err;
472 
473 	tun_info = skb_tunnel_info(skb);
474 	key = &tun_info->key;
475 	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
476 
477 	if (use_cache)
478 		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
479 	if (!rt) {
480 		rt = gre_get_rt(skb, dev, fl, key);
481 		if (IS_ERR(rt))
482 			goto err_free_skb;
483 		if (use_cache)
484 			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
485 					  fl->saddr);
486 	}
487 
488 	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
489 			+ tunnel_hlen + sizeof(struct iphdr);
490 	if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
491 		int head_delta = SKB_DATA_ALIGN(min_headroom -
492 						skb_headroom(skb) +
493 						16);
494 		err = pskb_expand_head(skb, max_t(int, head_delta, 0),
495 				       0, GFP_ATOMIC);
496 		if (unlikely(err))
497 			goto err_free_rt;
498 	}
499 	return rt;
500 
501 err_free_rt:
502 	ip_rt_put(rt);
503 err_free_skb:
504 	kfree_skb(skb);
505 	dev->stats.tx_dropped++;
506 	return NULL;
507 }
508 
509 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
510 			__be16 proto)
511 {
512 	struct ip_tunnel_info *tun_info;
513 	const struct ip_tunnel_key *key;
514 	struct rtable *rt = NULL;
515 	struct flowi4 fl;
516 	int tunnel_hlen;
517 	__be16 df, flags;
518 
519 	tun_info = skb_tunnel_info(skb);
520 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
521 		     ip_tunnel_info_af(tun_info) != AF_INET))
522 		goto err_free_skb;
523 
524 	key = &tun_info->key;
525 	tunnel_hlen = gre_calc_hlen(key->tun_flags);
526 
527 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
528 	if (!rt)
529 		return;
530 
531 	/* Push Tunnel header. */
532 	if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
533 		goto err_free_rt;
534 
535 	flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
536 	gre_build_header(skb, tunnel_hlen, flags, proto,
537 			 tunnel_id_to_key32(tun_info->key.tun_id), 0);
538 
539 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
540 
541 	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
542 		      key->tos, key->ttl, df, false);
543 	return;
544 
545 err_free_rt:
546 	ip_rt_put(rt);
547 err_free_skb:
548 	kfree_skb(skb);
549 	dev->stats.tx_dropped++;
550 }
551 
552 static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
553 			   __be16 proto)
554 {
555 	struct ip_tunnel *tunnel = netdev_priv(dev);
556 	struct ip_tunnel_info *tun_info;
557 	const struct ip_tunnel_key *key;
558 	struct erspan_metadata *md;
559 	struct rtable *rt = NULL;
560 	bool truncate = false;
561 	struct flowi4 fl;
562 	int tunnel_hlen;
563 	__be16 df;
564 
565 	tun_info = skb_tunnel_info(skb);
566 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
567 		     ip_tunnel_info_af(tun_info) != AF_INET))
568 		goto err_free_skb;
569 
570 	key = &tun_info->key;
571 
	/* ERSPAN has a fixed 8-byte GRE header */
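	/* That is the 4-byte GRE base header plus the 4-byte sequence
	 * number field implied by the S flag; this path uses neither the
	 * key nor the checksum fields.
	 */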
573 	tunnel_hlen = 8 + sizeof(struct erspanhdr);
574 
575 	rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
576 	if (!rt)
577 		return;
578 
579 	if (gre_handle_offloads(skb, false))
580 		goto err_free_rt;
581 
582 	if (skb->len > dev->mtu + dev->hard_header_len) {
583 		pskb_trim(skb, dev->mtu + dev->hard_header_len);
584 		truncate = true;
585 	}
586 
587 	md = ip_tunnel_info_opts(tun_info);
588 	if (!md)
589 		goto err_free_rt;
590 
591 	erspan_build_header(skb, tunnel_id_to_key32(key->tun_id),
592 			    ntohl(md->index), truncate);
593 
594 	gre_build_header(skb, 8, TUNNEL_SEQ,
595 			 htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
596 
597 	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ?  htons(IP_DF) : 0;
598 
599 	iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
600 		      key->tos, key->ttl, df, false);
601 	return;
602 
603 err_free_rt:
604 	ip_rt_put(rt);
605 err_free_skb:
606 	kfree_skb(skb);
607 	dev->stats.tx_dropped++;
608 }
609 
610 static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
611 {
612 	struct ip_tunnel_info *info = skb_tunnel_info(skb);
613 	struct rtable *rt;
614 	struct flowi4 fl4;
615 
616 	if (ip_tunnel_info_af(info) != AF_INET)
617 		return -EINVAL;
618 
619 	rt = gre_get_rt(skb, dev, &fl4, &info->key);
620 	if (IS_ERR(rt))
621 		return PTR_ERR(rt);
622 
623 	ip_rt_put(rt);
624 	info->key.u.ipv4.src = fl4.saddr;
625 	return 0;
626 }
627 
628 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
629 			      struct net_device *dev)
630 {
631 	struct ip_tunnel *tunnel = netdev_priv(dev);
632 	const struct iphdr *tnl_params;
633 
634 	if (tunnel->collect_md) {
635 		gre_fb_xmit(skb, dev, skb->protocol);
636 		return NETDEV_TX_OK;
637 	}
638 
639 	if (dev->header_ops) {
640 		/* Need space for new headers */
641 		if (skb_cow_head(skb, dev->needed_headroom -
642 				      (tunnel->hlen + sizeof(struct iphdr))))
643 			goto free_skb;
644 
645 		tnl_params = (const struct iphdr *)skb->data;
646 
647 		/* Pull skb since ip_tunnel_xmit() needs skb->data pointing
648 		 * to gre header.
649 		 */
650 		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
651 		skb_reset_mac_header(skb);
652 	} else {
653 		if (skb_cow_head(skb, dev->needed_headroom))
654 			goto free_skb;
655 
656 		tnl_params = &tunnel->parms.iph;
657 	}
658 
659 	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
660 		goto free_skb;
661 
662 	__gre_xmit(skb, dev, tnl_params, skb->protocol);
663 	return NETDEV_TX_OK;
664 
665 free_skb:
666 	kfree_skb(skb);
667 	dev->stats.tx_dropped++;
668 	return NETDEV_TX_OK;
669 }
670 
671 static inline u8 tos_to_cos(u8 tos)
672 {
673 	u8 dscp, cos;
674 
675 	dscp = tos >> 2;
676 	cos = dscp >> 3;
677 	return cos;
678 }
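
/* Illustration: a TOS byte of 0xb8 is DSCP 46 (0xb8 >> 2), i.e. EF,
 * and maps to CoS 5 (46 >> 3).
 */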
679 
680 static void erspan_build_header(struct sk_buff *skb,
681 				__be32 id, u32 index, bool truncate)
682 {
683 	struct iphdr *iphdr = ip_hdr(skb);
684 	struct ethhdr *eth = eth_hdr(skb);
685 	enum erspan_encap_type enc_type;
686 	struct erspanhdr *ershdr;
687 	struct qtag_prefix {
688 		__be16 eth_type;
689 		__be16 tci;
690 	} *qp;
691 	u16 vlan_tci = 0;
692 
693 	enc_type = ERSPAN_ENCAP_NOVLAN;
694 
	/* If the mirrored packet has a vlan tag, extract the tci and
	 * preserve the vlan header in the mirrored frame.
	 */
698 	if (eth->h_proto == htons(ETH_P_8021Q)) {
699 		qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN);
700 		vlan_tci = ntohs(qp->tci);
701 		enc_type = ERSPAN_ENCAP_INFRAME;
702 	}
703 
704 	skb_push(skb, sizeof(*ershdr));
705 	ershdr = (struct erspanhdr *)skb->data;
706 	memset(ershdr, 0, sizeof(*ershdr));
707 
708 	ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) |
709 				 (ERSPAN_VERSION << VER_OFFSET));
710 	ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) |
711 			   ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) |
712 			   (enc_type << EN_OFFSET & EN_MASK) |
713 			   ((truncate << T_OFFSET) & T_MASK));
714 	ershdr->md.index = htonl(index & INDEX_MASK);
715 }
716 
717 static netdev_tx_t erspan_xmit(struct sk_buff *skb,
718 			       struct net_device *dev)
719 {
720 	struct ip_tunnel *tunnel = netdev_priv(dev);
721 	bool truncate = false;
722 
723 	if (tunnel->collect_md) {
724 		erspan_fb_xmit(skb, dev, skb->protocol);
725 		return NETDEV_TX_OK;
726 	}
727 
728 	if (gre_handle_offloads(skb, false))
729 		goto free_skb;
730 
731 	if (skb_cow_head(skb, dev->needed_headroom))
732 		goto free_skb;
733 
734 	if (skb->len > dev->mtu + dev->hard_header_len) {
735 		pskb_trim(skb, dev->mtu + dev->hard_header_len);
736 		truncate = true;
737 	}
738 
739 	/* Push ERSPAN header */
740 	erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate);
741 	tunnel->parms.o_flags &= ~TUNNEL_KEY;
742 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
743 	return NETDEV_TX_OK;
744 
745 free_skb:
746 	kfree_skb(skb);
747 	dev->stats.tx_dropped++;
748 	return NETDEV_TX_OK;
749 }
750 
751 static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
752 				struct net_device *dev)
753 {
754 	struct ip_tunnel *tunnel = netdev_priv(dev);
755 
756 	if (tunnel->collect_md) {
757 		gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
758 		return NETDEV_TX_OK;
759 	}
760 
761 	if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
762 		goto free_skb;
763 
764 	if (skb_cow_head(skb, dev->needed_headroom))
765 		goto free_skb;
766 
767 	__gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB));
768 	return NETDEV_TX_OK;
769 
770 free_skb:
771 	kfree_skb(skb);
772 	dev->stats.tx_dropped++;
773 	return NETDEV_TX_OK;
774 }
775 
776 static void ipgre_link_update(struct net_device *dev, bool set_mtu)
777 {
778 	struct ip_tunnel *tunnel = netdev_priv(dev);
779 	int len;
780 
781 	len = tunnel->tun_hlen;
782 	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
783 	len = tunnel->tun_hlen - len;
784 	tunnel->hlen = tunnel->hlen + len;
785 
786 	dev->needed_headroom = dev->needed_headroom + len;
787 	if (set_mtu)
788 		dev->mtu = max_t(int, dev->mtu - len, 68);
789 
790 	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
791 		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
792 		    tunnel->encap.type == TUNNEL_ENCAP_NONE) {
793 			dev->features |= NETIF_F_GSO_SOFTWARE;
794 			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
795 		}
796 		dev->features |= NETIF_F_LLTX;
797 	}
798 }
799 
800 static int ipgre_tunnel_ioctl(struct net_device *dev,
801 			      struct ifreq *ifr, int cmd)
802 {
803 	struct ip_tunnel_parm p;
804 	int err;
805 
806 	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
807 		return -EFAULT;
808 
809 	if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
810 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
811 		    p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
812 		    ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
813 			return -EINVAL;
814 	}
815 
816 	p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
817 	p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
818 
819 	err = ip_tunnel_ioctl(dev, &p, cmd);
820 	if (err)
821 		return err;
822 
823 	if (cmd == SIOCCHGTUNNEL) {
824 		struct ip_tunnel *t = netdev_priv(dev);
825 
826 		t->parms.i_flags = p.i_flags;
827 		t->parms.o_flags = p.o_flags;
828 
829 		if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
830 			ipgre_link_update(dev, true);
831 	}
832 
833 	p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
834 	p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
835 
836 	if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
837 		return -EFAULT;
838 
839 	return 0;
840 }
841 
842 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have an impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).
851 
852    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
853    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
854 
855    ping -t 255 224.66.66.66
856 
857    If nobody answers, mbone does not work.
858 
859    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
860    ip addr add 10.66.66.<somewhat>/24 dev Universe
861    ifconfig Universe up
862    ifconfig Universe add fe80::<Your_real_addr>/10
863    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
864    ftp 10.66.66.66
865    ...
866    ftp fec0:6666:6666::193.233.7.65
867    ...
868  */
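
/* A modern iproute2 equivalent of the ifconfig lines above would be
 * (illustrative; exact syntax depends on the iproute2 version):
 *
 *   ip link set Universe up
 *   ip addr add fe80::<Your_real_addr>/10 dev Universe
 *   ip addr add fec0:6666:6666::<Your_real_addr>/96 dev Universe
 */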
869 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
870 			unsigned short type,
871 			const void *daddr, const void *saddr, unsigned int len)
872 {
873 	struct ip_tunnel *t = netdev_priv(dev);
874 	struct iphdr *iph;
875 	struct gre_base_hdr *greh;
876 
877 	iph = skb_push(skb, t->hlen + sizeof(*iph));
878 	greh = (struct gre_base_hdr *)(iph+1);
879 	greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
880 	greh->protocol = htons(type);
881 
882 	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
883 
884 	/* Set the source hardware address. */
885 	if (saddr)
886 		memcpy(&iph->saddr, saddr, 4);
887 	if (daddr)
888 		memcpy(&iph->daddr, daddr, 4);
889 	if (iph->daddr)
890 		return t->hlen + sizeof(*iph);
891 
892 	return -(t->hlen + sizeof(*iph));
893 }
894 
895 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
896 {
897 	const struct iphdr *iph = (const struct iphdr *) skb_mac_header(skb);
898 	memcpy(haddr, &iph->saddr, 4);
899 	return 4;
900 }
901 
902 static const struct header_ops ipgre_header_ops = {
903 	.create	= ipgre_header,
904 	.parse	= ipgre_header_parse,
905 };
906 
907 #ifdef CONFIG_NET_IPGRE_BROADCAST
908 static int ipgre_open(struct net_device *dev)
909 {
910 	struct ip_tunnel *t = netdev_priv(dev);
911 
912 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
913 		struct flowi4 fl4;
914 		struct rtable *rt;
915 
916 		rt = ip_route_output_gre(t->net, &fl4,
917 					 t->parms.iph.daddr,
918 					 t->parms.iph.saddr,
919 					 t->parms.o_key,
920 					 RT_TOS(t->parms.iph.tos),
921 					 t->parms.link);
922 		if (IS_ERR(rt))
923 			return -EADDRNOTAVAIL;
924 		dev = rt->dst.dev;
925 		ip_rt_put(rt);
926 		if (!__in_dev_get_rtnl(dev))
927 			return -EADDRNOTAVAIL;
928 		t->mlink = dev->ifindex;
929 		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
930 	}
931 	return 0;
932 }
933 
934 static int ipgre_close(struct net_device *dev)
935 {
936 	struct ip_tunnel *t = netdev_priv(dev);
937 
938 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
939 		struct in_device *in_dev;
940 		in_dev = inetdev_by_index(t->net, t->mlink);
941 		if (in_dev)
942 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
943 	}
944 	return 0;
945 }
946 #endif
947 
948 static const struct net_device_ops ipgre_netdev_ops = {
949 	.ndo_init		= ipgre_tunnel_init,
950 	.ndo_uninit		= ip_tunnel_uninit,
951 #ifdef CONFIG_NET_IPGRE_BROADCAST
952 	.ndo_open		= ipgre_open,
953 	.ndo_stop		= ipgre_close,
954 #endif
955 	.ndo_start_xmit		= ipgre_xmit,
956 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
957 	.ndo_change_mtu		= ip_tunnel_change_mtu,
958 	.ndo_get_stats64	= ip_tunnel_get_stats64,
959 	.ndo_get_iflink		= ip_tunnel_get_iflink,
960 };
961 
962 #define GRE_FEATURES (NETIF_F_SG |		\
963 		      NETIF_F_FRAGLIST |	\
964 		      NETIF_F_HIGHDMA |		\
965 		      NETIF_F_HW_CSUM)
966 
967 static void ipgre_tunnel_setup(struct net_device *dev)
968 {
969 	dev->netdev_ops		= &ipgre_netdev_ops;
970 	dev->type		= ARPHRD_IPGRE;
971 	ip_tunnel_setup(dev, ipgre_net_id);
972 }
973 
974 static void __gre_tunnel_init(struct net_device *dev)
975 {
976 	struct ip_tunnel *tunnel;
977 	int t_hlen;
978 
979 	tunnel = netdev_priv(dev);
980 	tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
981 	tunnel->parms.iph.protocol = IPPROTO_GRE;
982 
983 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
984 
985 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
986 
987 	dev->needed_headroom	= LL_MAX_HEADER + t_hlen + 4;
988 	dev->mtu		= ETH_DATA_LEN - t_hlen - 4;
989 
990 	dev->features		|= GRE_FEATURES;
991 	dev->hw_features	|= GRE_FEATURES;
992 
993 	if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
994 		/* TCP offload with GRE SEQ is not supported, nor
995 		 * can we support 2 levels of outer headers requiring
996 		 * an update.
997 		 */
998 		if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
999 		    (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
1000 			dev->features    |= NETIF_F_GSO_SOFTWARE;
1001 			dev->hw_features |= NETIF_F_GSO_SOFTWARE;
1002 		}
1003 
1004 		/* Can use a lockless transmit, unless we generate
1005 		 * output sequences
1006 		 */
1007 		dev->features |= NETIF_F_LLTX;
1008 	}
1009 }
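
/* Worked example of the arithmetic above (illustrative): with no key,
 * csum or seq flags, gre_calc_hlen() returns the 4-byte base header and
 * encap_hlen defaults to 0, so t_hlen = 4 + 0 + 20 = 24 and the initial
 * dev->mtu becomes 1500 - 24 - 4 = 1472; binding the device to an
 * actual route may adjust this later.
 */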
1010 
1011 static int ipgre_tunnel_init(struct net_device *dev)
1012 {
1013 	struct ip_tunnel *tunnel = netdev_priv(dev);
1014 	struct iphdr *iph = &tunnel->parms.iph;
1015 
1016 	__gre_tunnel_init(dev);
1017 
1018 	memcpy(dev->dev_addr, &iph->saddr, 4);
1019 	memcpy(dev->broadcast, &iph->daddr, 4);
1020 
1021 	dev->flags		= IFF_NOARP;
1022 	netif_keep_dst(dev);
1023 	dev->addr_len		= 4;
1024 
1025 	if (iph->daddr && !tunnel->collect_md) {
1026 #ifdef CONFIG_NET_IPGRE_BROADCAST
1027 		if (ipv4_is_multicast(iph->daddr)) {
1028 			if (!iph->saddr)
1029 				return -EINVAL;
1030 			dev->flags = IFF_BROADCAST;
1031 			dev->header_ops = &ipgre_header_ops;
1032 		}
1033 #endif
1034 	} else if (!tunnel->collect_md) {
1035 		dev->header_ops = &ipgre_header_ops;
1036 	}
1037 
1038 	return ip_tunnel_init(dev);
1039 }
1040 
1041 static const struct gre_protocol ipgre_protocol = {
1042 	.handler     = gre_rcv,
1043 	.err_handler = gre_err,
1044 };
1045 
1046 static int __net_init ipgre_init_net(struct net *net)
1047 {
1048 	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
1049 }
1050 
1051 static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
1052 {
1053 	ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
1054 }
1055 
1056 static struct pernet_operations ipgre_net_ops = {
1057 	.init = ipgre_init_net,
1058 	.exit_batch = ipgre_exit_batch_net,
1059 	.id   = &ipgre_net_id,
1060 	.size = sizeof(struct ip_tunnel_net),
1061 };
1062 
1063 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
1064 				 struct netlink_ext_ack *extack)
1065 {
1066 	__be16 flags;
1067 
1068 	if (!data)
1069 		return 0;
1070 
1071 	flags = 0;
1072 	if (data[IFLA_GRE_IFLAGS])
1073 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1074 	if (data[IFLA_GRE_OFLAGS])
1075 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1076 	if (flags & (GRE_VERSION|GRE_ROUTING))
1077 		return -EINVAL;
1078 
1079 	if (data[IFLA_GRE_COLLECT_METADATA] &&
1080 	    data[IFLA_GRE_ENCAP_TYPE] &&
1081 	    nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
1082 		return -EINVAL;
1083 
1084 	return 0;
1085 }
1086 
1087 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[],
1088 			      struct netlink_ext_ack *extack)
1089 {
1090 	__be32 daddr;
1091 
1092 	if (tb[IFLA_ADDRESS]) {
1093 		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1094 			return -EINVAL;
1095 		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1096 			return -EADDRNOTAVAIL;
1097 	}
1098 
1099 	if (!data)
1100 		goto out;
1101 
1102 	if (data[IFLA_GRE_REMOTE]) {
1103 		memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1104 		if (!daddr)
1105 			return -EINVAL;
1106 	}
1107 
1108 out:
1109 	return ipgre_tunnel_validate(tb, data, extack);
1110 }
1111 
1112 static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
1113 			   struct netlink_ext_ack *extack)
1114 {
1115 	__be16 flags = 0;
1116 	int ret;
1117 
1118 	if (!data)
1119 		return 0;
1120 
1121 	ret = ipgre_tap_validate(tb, data, extack);
1122 	if (ret)
1123 		return ret;
1124 
	/* ERSPAN should only have the GRE sequence and key flags */
1126 	if (data[IFLA_GRE_OFLAGS])
1127 		flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1128 	if (data[IFLA_GRE_IFLAGS])
1129 		flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1130 	if (!data[IFLA_GRE_COLLECT_METADATA] &&
1131 	    flags != (GRE_SEQ | GRE_KEY))
1132 		return -EINVAL;
1133 
	/* The ERSPAN session ID is only 10 bits wide. Since we reuse
	 * the 32-bit key field as the ID, check its range.
	 */
1137 	if (data[IFLA_GRE_IKEY] &&
1138 	    (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK))
1139 		return -EINVAL;
1140 
1141 	if (data[IFLA_GRE_OKEY] &&
1142 	    (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK))
1143 		return -EINVAL;
1144 
1145 	return 0;
1146 }
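
/* A configuration that passes the checks above (illustrative; exact
 * iproute2 syntax may vary by version):
 *
 *   ip link add dev erspan1 type erspan seq key 100 \
 *           local 10.0.0.1 remote 10.0.0.2 erspan 123
 *
 * "seq key" supplies exactly GRE_SEQ | GRE_KEY, key 100 fits in the
 * 10-bit session ID, and 123 fits in the 20-bit ERSPAN index.
 */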
1147 
1148 static int ipgre_netlink_parms(struct net_device *dev,
1149 				struct nlattr *data[],
1150 				struct nlattr *tb[],
1151 				struct ip_tunnel_parm *parms,
1152 				__u32 *fwmark)
1153 {
1154 	struct ip_tunnel *t = netdev_priv(dev);
1155 
1156 	memset(parms, 0, sizeof(*parms));
1157 
1158 	parms->iph.protocol = IPPROTO_GRE;
1159 
1160 	if (!data)
1161 		return 0;
1162 
1163 	if (data[IFLA_GRE_LINK])
1164 		parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1165 
1166 	if (data[IFLA_GRE_IFLAGS])
1167 		parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
1168 
1169 	if (data[IFLA_GRE_OFLAGS])
1170 		parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
1171 
1172 	if (data[IFLA_GRE_IKEY])
1173 		parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1174 
1175 	if (data[IFLA_GRE_OKEY])
1176 		parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1177 
1178 	if (data[IFLA_GRE_LOCAL])
1179 		parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]);
1180 
1181 	if (data[IFLA_GRE_REMOTE])
1182 		parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]);
1183 
1184 	if (data[IFLA_GRE_TTL])
1185 		parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1186 
1187 	if (data[IFLA_GRE_TOS])
1188 		parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1189 
1190 	if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) {
1191 		if (t->ignore_df)
1192 			return -EINVAL;
1193 		parms->iph.frag_off = htons(IP_DF);
1194 	}
1195 
1196 	if (data[IFLA_GRE_COLLECT_METADATA]) {
1197 		t->collect_md = true;
1198 		if (dev->type == ARPHRD_IPGRE)
1199 			dev->type = ARPHRD_NONE;
1200 	}
1201 
1202 	if (data[IFLA_GRE_IGNORE_DF]) {
1203 		if (nla_get_u8(data[IFLA_GRE_IGNORE_DF])
1204 		  && (parms->iph.frag_off & htons(IP_DF)))
1205 			return -EINVAL;
1206 		t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]);
1207 	}
1208 
1209 	if (data[IFLA_GRE_FWMARK])
1210 		*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
1211 
1212 	if (data[IFLA_GRE_ERSPAN_INDEX]) {
1213 		t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
1214 
1215 		if (t->index & ~INDEX_MASK)
1216 			return -EINVAL;
1217 	}
1218 
1219 	return 0;
1220 }
1221 
1222 /* This function returns true when ENCAP attributes are present in the nl msg */
1223 static bool ipgre_netlink_encap_parms(struct nlattr *data[],
1224 				      struct ip_tunnel_encap *ipencap)
1225 {
1226 	bool ret = false;
1227 
1228 	memset(ipencap, 0, sizeof(*ipencap));
1229 
1230 	if (!data)
1231 		return ret;
1232 
1233 	if (data[IFLA_GRE_ENCAP_TYPE]) {
1234 		ret = true;
1235 		ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
1236 	}
1237 
1238 	if (data[IFLA_GRE_ENCAP_FLAGS]) {
1239 		ret = true;
1240 		ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
1241 	}
1242 
1243 	if (data[IFLA_GRE_ENCAP_SPORT]) {
1244 		ret = true;
1245 		ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
1246 	}
1247 
1248 	if (data[IFLA_GRE_ENCAP_DPORT]) {
1249 		ret = true;
1250 		ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
1251 	}
1252 
1253 	return ret;
1254 }
1255 
1256 static int gre_tap_init(struct net_device *dev)
1257 {
1258 	__gre_tunnel_init(dev);
1259 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1260 	netif_keep_dst(dev);
1261 
1262 	return ip_tunnel_init(dev);
1263 }
1264 
1265 static const struct net_device_ops gre_tap_netdev_ops = {
1266 	.ndo_init		= gre_tap_init,
1267 	.ndo_uninit		= ip_tunnel_uninit,
1268 	.ndo_start_xmit		= gre_tap_xmit,
1269 	.ndo_set_mac_address 	= eth_mac_addr,
1270 	.ndo_validate_addr	= eth_validate_addr,
1271 	.ndo_change_mtu		= ip_tunnel_change_mtu,
1272 	.ndo_get_stats64	= ip_tunnel_get_stats64,
1273 	.ndo_get_iflink		= ip_tunnel_get_iflink,
1274 	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
1275 };
1276 
1277 static int erspan_tunnel_init(struct net_device *dev)
1278 {
1279 	struct ip_tunnel *tunnel = netdev_priv(dev);
1280 	int t_hlen;
1281 
1282 	tunnel->tun_hlen = 8;
1283 	tunnel->parms.iph.protocol = IPPROTO_GRE;
1284 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
1285 		       sizeof(struct erspanhdr);
1286 	t_hlen = tunnel->hlen + sizeof(struct iphdr);
1287 
1288 	dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
1289 	dev->mtu = ETH_DATA_LEN - t_hlen - 4;
1290 	dev->features		|= GRE_FEATURES;
1291 	dev->hw_features	|= GRE_FEATURES;
1292 	dev->priv_flags		|= IFF_LIVE_ADDR_CHANGE;
1293 	netif_keep_dst(dev);
1294 
1295 	return ip_tunnel_init(dev);
1296 }
1297 
1298 static const struct net_device_ops erspan_netdev_ops = {
1299 	.ndo_init		= erspan_tunnel_init,
1300 	.ndo_uninit		= ip_tunnel_uninit,
1301 	.ndo_start_xmit		= erspan_xmit,
1302 	.ndo_set_mac_address	= eth_mac_addr,
1303 	.ndo_validate_addr	= eth_validate_addr,
1304 	.ndo_change_mtu		= ip_tunnel_change_mtu,
1305 	.ndo_get_stats64	= ip_tunnel_get_stats64,
1306 	.ndo_get_iflink		= ip_tunnel_get_iflink,
1307 	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
1308 };
1309 
1310 static void ipgre_tap_setup(struct net_device *dev)
1311 {
1312 	ether_setup(dev);
1313 	dev->netdev_ops	= &gre_tap_netdev_ops;
1314 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1315 	dev->priv_flags	|= IFF_LIVE_ADDR_CHANGE;
1316 	ip_tunnel_setup(dev, gre_tap_net_id);
1317 }
1318 
1319 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
1320 			 struct nlattr *tb[], struct nlattr *data[],
1321 			 struct netlink_ext_ack *extack)
1322 {
1323 	struct ip_tunnel_parm p;
1324 	struct ip_tunnel_encap ipencap;
1325 	__u32 fwmark = 0;
1326 	int err;
1327 
1328 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
1329 		struct ip_tunnel *t = netdev_priv(dev);
1330 		err = ip_tunnel_encap_setup(t, &ipencap);
1331 
1332 		if (err < 0)
1333 			return err;
1334 	}
1335 
1336 	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1337 	if (err < 0)
1338 		return err;
1339 	return ip_tunnel_newlink(dev, tb, &p, fwmark);
1340 }
1341 
1342 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1343 			    struct nlattr *data[],
1344 			    struct netlink_ext_ack *extack)
1345 {
1346 	struct ip_tunnel *t = netdev_priv(dev);
1347 	struct ip_tunnel_encap ipencap;
1348 	__u32 fwmark = t->fwmark;
1349 	struct ip_tunnel_parm p;
1350 	int err;
1351 
1352 	if (ipgre_netlink_encap_parms(data, &ipencap)) {
1353 		err = ip_tunnel_encap_setup(t, &ipencap);
1354 
1355 		if (err < 0)
1356 			return err;
1357 	}
1358 
1359 	err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
1360 	if (err < 0)
1361 		return err;
1362 
1363 	err = ip_tunnel_changelink(dev, tb, &p, fwmark);
1364 	if (err < 0)
1365 		return err;
1366 
1367 	t->parms.i_flags = p.i_flags;
1368 	t->parms.o_flags = p.o_flags;
1369 
1370 	if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
1371 		ipgre_link_update(dev, !tb[IFLA_MTU]);
1372 
1373 	return 0;
1374 }
1375 
1376 static size_t ipgre_get_size(const struct net_device *dev)
1377 {
1378 	return
1379 		/* IFLA_GRE_LINK */
1380 		nla_total_size(4) +
1381 		/* IFLA_GRE_IFLAGS */
1382 		nla_total_size(2) +
1383 		/* IFLA_GRE_OFLAGS */
1384 		nla_total_size(2) +
1385 		/* IFLA_GRE_IKEY */
1386 		nla_total_size(4) +
1387 		/* IFLA_GRE_OKEY */
1388 		nla_total_size(4) +
1389 		/* IFLA_GRE_LOCAL */
1390 		nla_total_size(4) +
1391 		/* IFLA_GRE_REMOTE */
1392 		nla_total_size(4) +
1393 		/* IFLA_GRE_TTL */
1394 		nla_total_size(1) +
1395 		/* IFLA_GRE_TOS */
1396 		nla_total_size(1) +
1397 		/* IFLA_GRE_PMTUDISC */
1398 		nla_total_size(1) +
1399 		/* IFLA_GRE_ENCAP_TYPE */
1400 		nla_total_size(2) +
1401 		/* IFLA_GRE_ENCAP_FLAGS */
1402 		nla_total_size(2) +
1403 		/* IFLA_GRE_ENCAP_SPORT */
1404 		nla_total_size(2) +
1405 		/* IFLA_GRE_ENCAP_DPORT */
1406 		nla_total_size(2) +
1407 		/* IFLA_GRE_COLLECT_METADATA */
1408 		nla_total_size(0) +
1409 		/* IFLA_GRE_IGNORE_DF */
1410 		nla_total_size(1) +
1411 		/* IFLA_GRE_FWMARK */
1412 		nla_total_size(4) +
1413 		/* IFLA_GRE_ERSPAN_INDEX */
1414 		nla_total_size(4) +
1415 		0;
1416 }
1417 
1418 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1419 {
1420 	struct ip_tunnel *t = netdev_priv(dev);
1421 	struct ip_tunnel_parm *p = &t->parms;
1422 
1423 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
1424 	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
1425 			 gre_tnl_flags_to_gre_flags(p->i_flags)) ||
1426 	    nla_put_be16(skb, IFLA_GRE_OFLAGS,
1427 			 gre_tnl_flags_to_gre_flags(p->o_flags)) ||
1428 	    nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
1429 	    nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
1430 	    nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
1431 	    nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
1432 	    nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
1433 	    nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
1434 	    nla_put_u8(skb, IFLA_GRE_PMTUDISC,
1435 		       !!(p->iph.frag_off & htons(IP_DF))) ||
1436 	    nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark))
1437 		goto nla_put_failure;
1438 
1439 	if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
1440 			t->encap.type) ||
1441 	    nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
1442 			 t->encap.sport) ||
1443 	    nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
1444 			 t->encap.dport) ||
1445 	    nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
1446 			t->encap.flags))
1447 		goto nla_put_failure;
1448 
1449 	if (nla_put_u8(skb, IFLA_GRE_IGNORE_DF, t->ignore_df))
1450 		goto nla_put_failure;
1451 
1452 	if (t->collect_md) {
1453 		if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA))
1454 			goto nla_put_failure;
1455 	}
1456 
1457 	if (t->index)
1458 		if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
1459 			goto nla_put_failure;
1460 
1461 	return 0;
1462 
1463 nla_put_failure:
1464 	return -EMSGSIZE;
1465 }
1466 
1467 static void erspan_setup(struct net_device *dev)
1468 {
1469 	ether_setup(dev);
1470 	dev->netdev_ops = &erspan_netdev_ops;
1471 	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1472 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1473 	ip_tunnel_setup(dev, erspan_net_id);
1474 }
1475 
1476 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1477 	[IFLA_GRE_LINK]		= { .type = NLA_U32 },
1478 	[IFLA_GRE_IFLAGS]	= { .type = NLA_U16 },
1479 	[IFLA_GRE_OFLAGS]	= { .type = NLA_U16 },
1480 	[IFLA_GRE_IKEY]		= { .type = NLA_U32 },
1481 	[IFLA_GRE_OKEY]		= { .type = NLA_U32 },
1482 	[IFLA_GRE_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1483 	[IFLA_GRE_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1484 	[IFLA_GRE_TTL]		= { .type = NLA_U8 },
1485 	[IFLA_GRE_TOS]		= { .type = NLA_U8 },
1486 	[IFLA_GRE_PMTUDISC]	= { .type = NLA_U8 },
1487 	[IFLA_GRE_ENCAP_TYPE]	= { .type = NLA_U16 },
1488 	[IFLA_GRE_ENCAP_FLAGS]	= { .type = NLA_U16 },
1489 	[IFLA_GRE_ENCAP_SPORT]	= { .type = NLA_U16 },
1490 	[IFLA_GRE_ENCAP_DPORT]	= { .type = NLA_U16 },
1491 	[IFLA_GRE_COLLECT_METADATA]	= { .type = NLA_FLAG },
1492 	[IFLA_GRE_IGNORE_DF]	= { .type = NLA_U8 },
1493 	[IFLA_GRE_FWMARK]	= { .type = NLA_U32 },
1494 	[IFLA_GRE_ERSPAN_INDEX]	= { .type = NLA_U32 },
1495 };
1496 
1497 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1498 	.kind		= "gre",
1499 	.maxtype	= IFLA_GRE_MAX,
1500 	.policy		= ipgre_policy,
1501 	.priv_size	= sizeof(struct ip_tunnel),
1502 	.setup		= ipgre_tunnel_setup,
1503 	.validate	= ipgre_tunnel_validate,
1504 	.newlink	= ipgre_newlink,
1505 	.changelink	= ipgre_changelink,
1506 	.dellink	= ip_tunnel_dellink,
1507 	.get_size	= ipgre_get_size,
1508 	.fill_info	= ipgre_fill_info,
1509 	.get_link_net	= ip_tunnel_get_link_net,
1510 };
1511 
1512 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1513 	.kind		= "gretap",
1514 	.maxtype	= IFLA_GRE_MAX,
1515 	.policy		= ipgre_policy,
1516 	.priv_size	= sizeof(struct ip_tunnel),
1517 	.setup		= ipgre_tap_setup,
1518 	.validate	= ipgre_tap_validate,
1519 	.newlink	= ipgre_newlink,
1520 	.changelink	= ipgre_changelink,
1521 	.dellink	= ip_tunnel_dellink,
1522 	.get_size	= ipgre_get_size,
1523 	.fill_info	= ipgre_fill_info,
1524 	.get_link_net	= ip_tunnel_get_link_net,
1525 };
1526 
1527 static struct rtnl_link_ops erspan_link_ops __read_mostly = {
1528 	.kind		= "erspan",
1529 	.maxtype	= IFLA_GRE_MAX,
1530 	.policy		= ipgre_policy,
1531 	.priv_size	= sizeof(struct ip_tunnel),
1532 	.setup		= erspan_setup,
1533 	.validate	= erspan_validate,
1534 	.newlink	= ipgre_newlink,
1535 	.changelink	= ipgre_changelink,
1536 	.dellink	= ip_tunnel_dellink,
1537 	.get_size	= ipgre_get_size,
1538 	.fill_info	= ipgre_fill_info,
1539 	.get_link_net	= ip_tunnel_get_link_net,
1540 };
1541 
1542 struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
1543 					u8 name_assign_type)
1544 {
1545 	struct nlattr *tb[IFLA_MAX + 1];
1546 	struct net_device *dev;
1547 	LIST_HEAD(list_kill);
1548 	struct ip_tunnel *t;
1549 	int err;
1550 
1551 	memset(&tb, 0, sizeof(tb));
1552 
1553 	dev = rtnl_create_link(net, name, name_assign_type,
1554 			       &ipgre_tap_ops, tb);
1555 	if (IS_ERR(dev))
1556 		return dev;
1557 
1558 	/* Configure flow based GRE device. */
1559 	t = netdev_priv(dev);
1560 	t->collect_md = true;
1561 
1562 	err = ipgre_newlink(net, dev, tb, NULL, NULL);
1563 	if (err < 0) {
1564 		free_netdev(dev);
1565 		return ERR_PTR(err);
1566 	}
1567 
1568 	/* openvswitch users expect packet sizes to be unrestricted,
1569 	 * so set the largest MTU we can.
1570 	 */
1571 	err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
1572 	if (err)
1573 		goto out;
1574 
1575 	err = rtnl_configure_link(dev, NULL);
1576 	if (err < 0)
1577 		goto out;
1578 
1579 	return dev;
1580 out:
1581 	ip_tunnel_dellink(dev, &list_kill);
1582 	unregister_netdevice_many(&list_kill);
1583 	return ERR_PTR(err);
1584 }
1585 EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
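
/* Userspace can create an equivalent flow-based device directly, e.g.
 * (illustrative):
 *
 *   ip link add gretap1 type gretap external
 *
 * where "external" selects collect_md mode, so per-packet tunnel
 * metadata comes from the skb's tunnel dst rather than fixed parms.
 */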
1586 
1587 static int __net_init ipgre_tap_init_net(struct net *net)
1588 {
1589 	return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
1590 }
1591 
1592 static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
1593 {
1594 	ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
1595 }
1596 
1597 static struct pernet_operations ipgre_tap_net_ops = {
1598 	.init = ipgre_tap_init_net,
1599 	.exit_batch = ipgre_tap_exit_batch_net,
1600 	.id   = &gre_tap_net_id,
1601 	.size = sizeof(struct ip_tunnel_net),
1602 };
1603 
1604 static int __net_init erspan_init_net(struct net *net)
1605 {
1606 	return ip_tunnel_init_net(net, erspan_net_id,
1607 				  &erspan_link_ops, "erspan0");
1608 }
1609 
1610 static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
1611 {
1612 	ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
1613 }
1614 
1615 static struct pernet_operations erspan_net_ops = {
1616 	.init = erspan_init_net,
1617 	.exit_batch = erspan_exit_batch_net,
1618 	.id   = &erspan_net_id,
1619 	.size = sizeof(struct ip_tunnel_net),
1620 };
1621 
1622 static int __init ipgre_init(void)
1623 {
1624 	int err;
1625 
1626 	pr_info("GRE over IPv4 tunneling driver\n");
1627 
1628 	err = register_pernet_device(&ipgre_net_ops);
1629 	if (err < 0)
1630 		return err;
1631 
1632 	err = register_pernet_device(&ipgre_tap_net_ops);
1633 	if (err < 0)
1634 		goto pnet_tap_failed;
1635 
1636 	err = register_pernet_device(&erspan_net_ops);
1637 	if (err < 0)
1638 		goto pnet_erspan_failed;
1639 
1640 	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1641 	if (err < 0) {
1642 		pr_info("%s: can't add protocol\n", __func__);
1643 		goto add_proto_failed;
1644 	}
1645 
1646 	err = rtnl_link_register(&ipgre_link_ops);
1647 	if (err < 0)
1648 		goto rtnl_link_failed;
1649 
1650 	err = rtnl_link_register(&ipgre_tap_ops);
1651 	if (err < 0)
1652 		goto tap_ops_failed;
1653 
1654 	err = rtnl_link_register(&erspan_link_ops);
1655 	if (err < 0)
1656 		goto erspan_link_failed;
1657 
1658 	return 0;
1659 
1660 erspan_link_failed:
1661 	rtnl_link_unregister(&ipgre_tap_ops);
1662 tap_ops_failed:
1663 	rtnl_link_unregister(&ipgre_link_ops);
1664 rtnl_link_failed:
1665 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1666 add_proto_failed:
1667 	unregister_pernet_device(&erspan_net_ops);
1668 pnet_erspan_failed:
1669 	unregister_pernet_device(&ipgre_tap_net_ops);
1670 pnet_tap_failed:
1671 	unregister_pernet_device(&ipgre_net_ops);
1672 	return err;
1673 }
1674 
1675 static void __exit ipgre_fini(void)
1676 {
1677 	rtnl_link_unregister(&ipgre_tap_ops);
1678 	rtnl_link_unregister(&ipgre_link_ops);
1679 	rtnl_link_unregister(&erspan_link_ops);
1680 	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1681 	unregister_pernet_device(&ipgre_tap_net_ops);
1682 	unregister_pernet_device(&ipgre_net_ops);
1683 	unregister_pernet_device(&erspan_net_ops);
1684 }
1685 
1686 module_init(ipgre_init);
1687 module_exit(ipgre_fini);
1688 MODULE_LICENSE("GPL");
1689 MODULE_ALIAS_RTNL_LINK("gre");
1690 MODULE_ALIAS_RTNL_LINK("gretap");
1691 MODULE_ALIAS_RTNL_LINK("erspan");
1692 MODULE_ALIAS_NETDEV("gre0");
1693 MODULE_ALIAS_NETDEV("gretap0");
1694 MODULE_ALIAS_NETDEV("erspan0");
1695