xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 609e478b)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/gue.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}
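
/* The bucket is derived from (key ^ remote) only; the local address and
 * the link are intentionally left out so one chain can be rescanned with
 * progressively looser criteria in ip_tunnel_lookup() below. A rough
 * illustration (assuming the usual IP_TNL_HASH_BITS == 7, i.e. a
 * 128-bucket table): a keyless tunnel hashes as ip_tunnel_hash(0, daddr),
 * and a keyed wildcard-remote tunnel as ip_tunnel_hash(key, 0).
 */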

static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
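
/* Cache-update sketch (descriptive, mirroring __tunnel_dst_set() above):
 *
 *	dst_clone(dst);				// +1 ref held by the cache slot
 *	old_dst = xchg(&idst->dst, dst);	// atomically publish new entry
 *	dst_release(old_dst);			// drop the displaced entry's ref
 *
 * The xchg() means a concurrent reader either sees the old dst (which
 * still holds a reference) or the new one, never a half-updated slot.
 */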

static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
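
/* Read-side pairing for the cache above: under rcu_read_lock() the dst
 * is only kept if atomic_inc_not_zero() beats a concurrent final
 * dst_release(); an obsolete dst is revalidated via its ops->check()
 * hook, and on failure the per-cpu entry is reset so the next transmit
 * re-routes from scratch.
 */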

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
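
/* Resulting match matrix (for reference):
 *
 *	tunnel keyed?	packet keyed?	match
 *	yes		yes		only if key == p->i_key
 *	yes		no		never
 *	no		yes		never
 *	no		no		always
 */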

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the incoming
   packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
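
/* Lookup order, as implemented above: (1) exact saddr/daddr match,
 * (2) daddr with wildcard saddr, (3) local or multicast address with
 * wildcard remote, (4) key-only wildcard, and finally the fallback
 * device. A link mismatch never beats a later exact-link match; it is
 * only remembered as a candidate. E.g. (hypothetical) a keyed GRE
 * packet arriving on eth1 prefers a tunnel bound to eth1 over an
 * otherwise identical tunnel bound to eth0.
 */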

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
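
/* Worked example for the MTU arithmetic above (illustrative): for a
 * plain GRE tunnel over a 1500-byte Ethernet underlay, t_hlen is
 * sizeof(struct iphdr) + 4 bytes of basic GRE header = 24, so the
 * tunnel device ends up with an MTU of 1500 - 24 = 1476. The 68-byte
 * floor is the minimum IPv4 MTU mandated by RFC 791.
 */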

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
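
/* The TUNNEL_SEQ test above is wraparound-safe: the difference is taken
 * as a signed 32-bit value, so e.g. a received seq of 1 with an expected
 * i_seqno of 0xffffffff gives (s32)2 >= 0 and is accepted as in order,
 * while a stale replay yields a negative difference and is dropped as
 * an rx_fifo_error.
 */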

static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	switch (e->type) {
	case TUNNEL_ENCAP_NONE:
		return 0;
	case TUNNEL_ENCAP_FOU:
		return sizeof(struct udphdr);
	case TUNNEL_ENCAP_GUE:
		return sizeof(struct udphdr) + sizeof(struct guehdr);
	default:
		return -EINVAL;
	}
}

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
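
/* Per-packet overhead added by each encap type, assuming the 8-byte UDP
 * header and (an assumption about this revision's layout) a 4-byte
 * struct guehdr:
 *
 *	TUNNEL_ENCAP_NONE:	 0 bytes
 *	TUNNEL_ENCAP_FOU:	 8 bytes (outer UDP only)
 *	TUNNEL_ENCAP_GUE:	12 bytes (outer UDP + GUE header)
 *
 * t->hlen then feeds both headroom reservation and MTU calculations.
 */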

static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
			    size_t hdr_len, u8 *protocol, struct flowi4 *fl4)
{
	struct udphdr *uh;
	__be16 sport;
	bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
	int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;

	skb = iptunnel_handle_offloads(skb, csum, type);

	if (IS_ERR(skb))
		return PTR_ERR(skb);

	/* Get length and hash before making space in skb */

	sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
					       skb, 0, 0, false);

	skb_push(skb, hdr_len);

	skb_reset_transport_header(skb);
	uh = udp_hdr(skb);

	if (e->type == TUNNEL_ENCAP_GUE) {
		struct guehdr *guehdr = (struct guehdr *)&uh[1];

		guehdr->version = 0;
		guehdr->hlen = 0;
		guehdr->flags = 0;
		guehdr->next_hdr = *protocol;
	}

	uh->dest = e->dport;
	uh->source = sport;
	uh->len = htons(skb->len);
	uh->check = 0;
	udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
		     fl4->saddr, fl4->daddr, skb->len);

	*protocol = IPPROTO_UDP;

	return 0;
}

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	switch (t->encap.type) {
	case TUNNEL_ENCAP_NONE:
		return 0;
	case TUNNEL_ENCAP_FOU:
	case TUNNEL_ENCAP_GUE:
		return fou_build_header(skb, &t->encap, t->encap_hlen,
					protocol, fl4);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(ip_tunnel_encap);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
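
/* Illustrative numbers for the DF branch above: with a 1500-byte route
 * MTU, a zero-length link header and a 4-byte tunnel header, mtu works
 * out to 1500 - 20 - 4 = 1476, so a DF-marked 1500-byte inner IPv4
 * packet is bounced with ICMP_FRAG_NEEDED advertising 1476.
 */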

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
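
/* Transmit path in brief: resolve the outer destination (fixed for a
 * configured tunnel, derived from the inner headers for an NBMA one),
 * try the per-cpu dst cache for connected tunnels before routing,
 * enforce PMTU, pick tos/ttl/df from the tunnel parameters or the inner
 * packet, then hand off to iptunnel_xmit().
 */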

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
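
/* Bounds used above: 68 is the minimum IPv4 MTU from RFC 791, and
 * 0xFFF8 is the largest 16-bit IP total length rounded down to an
 * 8-byte fragment boundary; the link and tunnel headers are subtracted
 * so a maximally sized inner packet still fits after encapsulation.
 */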

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Moving it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * ip_tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");