xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision df3305156f989339529b3d6744b898d498fb1f7b)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44 
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59 
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65 
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68 	return hash_32((__force u32)key ^ (__force u32)remote,
69 			 IP_TNL_HASH_BITS);
70 }
71 
72 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
73 			     struct dst_entry *dst, __be32 saddr)
74 {
75 	struct dst_entry *old_dst;
76 
77 	dst_clone(dst);
78 	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
79 	dst_release(old_dst);
80 	idst->saddr = saddr;
81 }
82 
83 static noinline void tunnel_dst_set(struct ip_tunnel *t,
84 			   struct dst_entry *dst, __be32 saddr)
85 {
86 	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
87 }
88 
89 static void tunnel_dst_reset(struct ip_tunnel *t)
90 {
91 	tunnel_dst_set(t, NULL, 0);
92 }
93 
94 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
95 {
96 	int i;
97 
98 	for_each_possible_cpu(i)
99 		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
100 }
101 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
102 
103 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
104 					u32 cookie, __be32 *saddr)
105 {
106 	struct ip_tunnel_dst *idst;
107 	struct dst_entry *dst;
108 
109 	rcu_read_lock();
110 	idst = raw_cpu_ptr(t->dst_cache);
111 	dst = rcu_dereference(idst->dst);
112 	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
113 		dst = NULL;
114 	if (dst) {
115 		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
116 			*saddr = idst->saddr;
117 		} else {
118 			tunnel_dst_reset(t);
119 			dst_release(dst);
120 			dst = NULL;
121 		}
122 	}
123 	rcu_read_unlock();
124 	return (struct rtable *)dst;
125 }
126 
127 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
128 				__be16 flags, __be32 key)
129 {
130 	if (p->i_flags & TUNNEL_KEY) {
131 		if (flags & TUNNEL_KEY)
132 			return key == p->i_key;
133 		else
134 			/* key expected, none present */
135 			return false;
136 	} else
137 		return !(flags & TUNNEL_KEY);
138 }
139 
140 /* Fallback tunnel: no source, no destination, no key, no options
141 
142    Tunnel hash table:
143    We require exact key match i.e. if a key is present in packet
144    it will match only tunnel with the same key; if it is not present,
145    it will match only keyless tunnel.
146 
147    All keysless packets, if not matched configured keyless tunnels
148    will match fallback tunnel.
149    Given src, dst and key, find appropriate for input tunnel.
150 */
151 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
152 				   int link, __be16 flags,
153 				   __be32 remote, __be32 local,
154 				   __be32 key)
155 {
156 	unsigned int hash;
157 	struct ip_tunnel *t, *cand = NULL;
158 	struct hlist_head *head;
159 
160 	hash = ip_tunnel_hash(key, remote);
161 	head = &itn->tunnels[hash];
162 
163 	hlist_for_each_entry_rcu(t, head, hash_node) {
164 		if (local != t->parms.iph.saddr ||
165 		    remote != t->parms.iph.daddr ||
166 		    !(t->dev->flags & IFF_UP))
167 			continue;
168 
169 		if (!ip_tunnel_key_match(&t->parms, flags, key))
170 			continue;
171 
172 		if (t->parms.link == link)
173 			return t;
174 		else
175 			cand = t;
176 	}
177 
178 	hlist_for_each_entry_rcu(t, head, hash_node) {
179 		if (remote != t->parms.iph.daddr ||
180 		    t->parms.iph.saddr != 0 ||
181 		    !(t->dev->flags & IFF_UP))
182 			continue;
183 
184 		if (!ip_tunnel_key_match(&t->parms, flags, key))
185 			continue;
186 
187 		if (t->parms.link == link)
188 			return t;
189 		else if (!cand)
190 			cand = t;
191 	}
192 
193 	hash = ip_tunnel_hash(key, 0);
194 	head = &itn->tunnels[hash];
195 
196 	hlist_for_each_entry_rcu(t, head, hash_node) {
197 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
198 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
199 			continue;
200 
201 		if (!(t->dev->flags & IFF_UP))
202 			continue;
203 
204 		if (!ip_tunnel_key_match(&t->parms, flags, key))
205 			continue;
206 
207 		if (t->parms.link == link)
208 			return t;
209 		else if (!cand)
210 			cand = t;
211 	}
212 
213 	if (flags & TUNNEL_NO_KEY)
214 		goto skip_key_lookup;
215 
216 	hlist_for_each_entry_rcu(t, head, hash_node) {
217 		if (t->parms.i_key != key ||
218 		    t->parms.iph.saddr != 0 ||
219 		    t->parms.iph.daddr != 0 ||
220 		    !(t->dev->flags & IFF_UP))
221 			continue;
222 
223 		if (t->parms.link == link)
224 			return t;
225 		else if (!cand)
226 			cand = t;
227 	}
228 
229 skip_key_lookup:
230 	if (cand)
231 		return cand;
232 
233 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
234 		return netdev_priv(itn->fb_tunnel_dev);
235 
236 
237 	return NULL;
238 }
239 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
240 
241 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
242 				    struct ip_tunnel_parm *parms)
243 {
244 	unsigned int h;
245 	__be32 remote;
246 	__be32 i_key = parms->i_key;
247 
248 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
249 		remote = parms->iph.daddr;
250 	else
251 		remote = 0;
252 
253 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
254 		i_key = 0;
255 
256 	h = ip_tunnel_hash(i_key, remote);
257 	return &itn->tunnels[h];
258 }
259 
260 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
261 {
262 	struct hlist_head *head = ip_bucket(itn, &t->parms);
263 
264 	hlist_add_head_rcu(&t->hash_node, head);
265 }
266 
267 static void ip_tunnel_del(struct ip_tunnel *t)
268 {
269 	hlist_del_init_rcu(&t->hash_node);
270 }
271 
272 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
273 					struct ip_tunnel_parm *parms,
274 					int type)
275 {
276 	__be32 remote = parms->iph.daddr;
277 	__be32 local = parms->iph.saddr;
278 	__be32 key = parms->i_key;
279 	__be16 flags = parms->i_flags;
280 	int link = parms->link;
281 	struct ip_tunnel *t = NULL;
282 	struct hlist_head *head = ip_bucket(itn, parms);
283 
284 	hlist_for_each_entry_rcu(t, head, hash_node) {
285 		if (local == t->parms.iph.saddr &&
286 		    remote == t->parms.iph.daddr &&
287 		    link == t->parms.link &&
288 		    type == t->dev->type &&
289 		    ip_tunnel_key_match(&t->parms, flags, key))
290 			break;
291 	}
292 	return t;
293 }
294 
295 static struct net_device *__ip_tunnel_create(struct net *net,
296 					     const struct rtnl_link_ops *ops,
297 					     struct ip_tunnel_parm *parms)
298 {
299 	int err;
300 	struct ip_tunnel *tunnel;
301 	struct net_device *dev;
302 	char name[IFNAMSIZ];
303 
304 	if (parms->name[0])
305 		strlcpy(name, parms->name, IFNAMSIZ);
306 	else {
307 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
308 			err = -E2BIG;
309 			goto failed;
310 		}
311 		strlcpy(name, ops->kind, IFNAMSIZ);
312 		strncat(name, "%d", 2);
313 	}
314 
315 	ASSERT_RTNL();
316 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
317 	if (!dev) {
318 		err = -ENOMEM;
319 		goto failed;
320 	}
321 	dev_net_set(dev, net);
322 
323 	dev->rtnl_link_ops = ops;
324 
325 	tunnel = netdev_priv(dev);
326 	tunnel->parms = *parms;
327 	tunnel->net = net;
328 
329 	err = register_netdevice(dev);
330 	if (err)
331 		goto failed_free;
332 
333 	return dev;
334 
335 failed_free:
336 	free_netdev(dev);
337 failed:
338 	return ERR_PTR(err);
339 }
340 
341 static inline void init_tunnel_flow(struct flowi4 *fl4,
342 				    int proto,
343 				    __be32 daddr, __be32 saddr,
344 				    __be32 key, __u8 tos, int oif)
345 {
346 	memset(fl4, 0, sizeof(*fl4));
347 	fl4->flowi4_oif = oif;
348 	fl4->daddr = daddr;
349 	fl4->saddr = saddr;
350 	fl4->flowi4_tos = tos;
351 	fl4->flowi4_proto = proto;
352 	fl4->fl4_gre_key = key;
353 }
354 
355 static int ip_tunnel_bind_dev(struct net_device *dev)
356 {
357 	struct net_device *tdev = NULL;
358 	struct ip_tunnel *tunnel = netdev_priv(dev);
359 	const struct iphdr *iph;
360 	int hlen = LL_MAX_HEADER;
361 	int mtu = ETH_DATA_LEN;
362 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
363 
364 	iph = &tunnel->parms.iph;
365 
366 	/* Guess output device to choose reasonable mtu and needed_headroom */
367 	if (iph->daddr) {
368 		struct flowi4 fl4;
369 		struct rtable *rt;
370 
371 		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
372 				 iph->saddr, tunnel->parms.o_key,
373 				 RT_TOS(iph->tos), tunnel->parms.link);
374 		rt = ip_route_output_key(tunnel->net, &fl4);
375 
376 		if (!IS_ERR(rt)) {
377 			tdev = rt->dst.dev;
378 			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
379 			ip_rt_put(rt);
380 		}
381 		if (dev->type != ARPHRD_ETHER)
382 			dev->flags |= IFF_POINTOPOINT;
383 	}
384 
385 	if (!tdev && tunnel->parms.link)
386 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
387 
388 	if (tdev) {
389 		hlen = tdev->hard_header_len + tdev->needed_headroom;
390 		mtu = tdev->mtu;
391 	}
392 	dev->iflink = tunnel->parms.link;
393 
394 	dev->needed_headroom = t_hlen + hlen;
395 	mtu -= (dev->hard_header_len + t_hlen);
396 
397 	if (mtu < 68)
398 		mtu = 68;
399 
400 	return mtu;
401 }
402 
403 static struct ip_tunnel *ip_tunnel_create(struct net *net,
404 					  struct ip_tunnel_net *itn,
405 					  struct ip_tunnel_parm *parms)
406 {
407 	struct ip_tunnel *nt;
408 	struct net_device *dev;
409 
410 	BUG_ON(!itn->fb_tunnel_dev);
411 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
412 	if (IS_ERR(dev))
413 		return ERR_CAST(dev);
414 
415 	dev->mtu = ip_tunnel_bind_dev(dev);
416 
417 	nt = netdev_priv(dev);
418 	ip_tunnel_add(itn, nt);
419 	return nt;
420 }
421 
422 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
423 		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
424 {
425 	struct pcpu_sw_netstats *tstats;
426 	const struct iphdr *iph = ip_hdr(skb);
427 	int err;
428 
429 #ifdef CONFIG_NET_IPGRE_BROADCAST
430 	if (ipv4_is_multicast(iph->daddr)) {
431 		tunnel->dev->stats.multicast++;
432 		skb->pkt_type = PACKET_BROADCAST;
433 	}
434 #endif
435 
436 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
437 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
438 		tunnel->dev->stats.rx_crc_errors++;
439 		tunnel->dev->stats.rx_errors++;
440 		goto drop;
441 	}
442 
443 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
444 		if (!(tpi->flags&TUNNEL_SEQ) ||
445 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
446 			tunnel->dev->stats.rx_fifo_errors++;
447 			tunnel->dev->stats.rx_errors++;
448 			goto drop;
449 		}
450 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
451 	}
452 
453 	skb_reset_network_header(skb);
454 
455 	err = IP_ECN_decapsulate(iph, skb);
456 	if (unlikely(err)) {
457 		if (log_ecn_error)
458 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
459 					&iph->saddr, iph->tos);
460 		if (err > 1) {
461 			++tunnel->dev->stats.rx_frame_errors;
462 			++tunnel->dev->stats.rx_errors;
463 			goto drop;
464 		}
465 	}
466 
467 	tstats = this_cpu_ptr(tunnel->dev->tstats);
468 	u64_stats_update_begin(&tstats->syncp);
469 	tstats->rx_packets++;
470 	tstats->rx_bytes += skb->len;
471 	u64_stats_update_end(&tstats->syncp);
472 
473 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
474 
475 	if (tunnel->dev->type == ARPHRD_ETHER) {
476 		skb->protocol = eth_type_trans(skb, tunnel->dev);
477 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
478 	} else {
479 		skb->dev = tunnel->dev;
480 	}
481 
482 	gro_cells_receive(&tunnel->gro_cells, skb);
483 	return 0;
484 
485 drop:
486 	kfree_skb(skb);
487 	return 0;
488 }
489 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
490 
491 static int ip_encap_hlen(struct ip_tunnel_encap *e)
492 {
493 	const struct ip_tunnel_encap_ops *ops;
494 	int hlen = -EINVAL;
495 
496 	if (e->type == TUNNEL_ENCAP_NONE)
497 		return 0;
498 
499 	if (e->type >= MAX_IPTUN_ENCAP_OPS)
500 		return -EINVAL;
501 
502 	rcu_read_lock();
503 	ops = rcu_dereference(iptun_encaps[e->type]);
504 	if (likely(ops && ops->encap_hlen))
505 		hlen = ops->encap_hlen(e);
506 	rcu_read_unlock();
507 
508 	return hlen;
509 }
510 
511 const struct ip_tunnel_encap_ops __rcu *
512 		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
513 
514 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
515 			    unsigned int num)
516 {
517 	if (num >= MAX_IPTUN_ENCAP_OPS)
518 		return -ERANGE;
519 
520 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
521 			&iptun_encaps[num],
522 			NULL, ops) ? 0 : -1;
523 }
524 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
525 
526 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
527 			    unsigned int num)
528 {
529 	int ret;
530 
531 	if (num >= MAX_IPTUN_ENCAP_OPS)
532 		return -ERANGE;
533 
534 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
535 		       &iptun_encaps[num],
536 		       ops, NULL) == ops) ? 0 : -1;
537 
538 	synchronize_net();
539 
540 	return ret;
541 }
542 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
543 
544 int ip_tunnel_encap_setup(struct ip_tunnel *t,
545 			  struct ip_tunnel_encap *ipencap)
546 {
547 	int hlen;
548 
549 	memset(&t->encap, 0, sizeof(t->encap));
550 
551 	hlen = ip_encap_hlen(ipencap);
552 	if (hlen < 0)
553 		return hlen;
554 
555 	t->encap.type = ipencap->type;
556 	t->encap.sport = ipencap->sport;
557 	t->encap.dport = ipencap->dport;
558 	t->encap.flags = ipencap->flags;
559 
560 	t->encap_hlen = hlen;
561 	t->hlen = t->encap_hlen + t->tun_hlen;
562 
563 	return 0;
564 }
565 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
566 
567 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
568 		    u8 *protocol, struct flowi4 *fl4)
569 {
570 	const struct ip_tunnel_encap_ops *ops;
571 	int ret = -EINVAL;
572 
573 	if (t->encap.type == TUNNEL_ENCAP_NONE)
574 		return 0;
575 
576 	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
577 		return -EINVAL;
578 
579 	rcu_read_lock();
580 	ops = rcu_dereference(iptun_encaps[t->encap.type]);
581 	if (likely(ops && ops->build_header))
582 		ret = ops->build_header(skb, &t->encap, protocol, fl4);
583 	rcu_read_unlock();
584 
585 	return ret;
586 }
587 EXPORT_SYMBOL(ip_tunnel_encap);
588 
589 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
590 			    struct rtable *rt, __be16 df)
591 {
592 	struct ip_tunnel *tunnel = netdev_priv(dev);
593 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
594 	int mtu;
595 
596 	if (df)
597 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
598 					- sizeof(struct iphdr) - tunnel->hlen;
599 	else
600 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
601 
602 	if (skb_dst(skb))
603 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
604 
605 	if (skb->protocol == htons(ETH_P_IP)) {
606 		if (!skb_is_gso(skb) &&
607 		    (df & htons(IP_DF)) && mtu < pkt_size) {
608 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
609 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
610 			return -E2BIG;
611 		}
612 	}
613 #if IS_ENABLED(CONFIG_IPV6)
614 	else if (skb->protocol == htons(ETH_P_IPV6)) {
615 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
616 
617 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
618 			   mtu >= IPV6_MIN_MTU) {
619 			if ((tunnel->parms.iph.daddr &&
620 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
621 			    rt6->rt6i_dst.plen == 128) {
622 				rt6->rt6i_flags |= RTF_MODIFIED;
623 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
624 			}
625 		}
626 
627 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
628 					mtu < pkt_size) {
629 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630 			return -E2BIG;
631 		}
632 	}
633 #endif
634 	return 0;
635 }
636 
637 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
638 		    const struct iphdr *tnl_params, u8 protocol)
639 {
640 	struct ip_tunnel *tunnel = netdev_priv(dev);
641 	const struct iphdr *inner_iph;
642 	struct flowi4 fl4;
643 	u8     tos, ttl;
644 	__be16 df;
645 	struct rtable *rt;		/* Route to the other host */
646 	unsigned int max_headroom;	/* The extra header space needed */
647 	__be32 dst;
648 	int err;
649 	bool connected;
650 
651 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
652 	connected = (tunnel->parms.iph.daddr != 0);
653 
654 	dst = tnl_params->daddr;
655 	if (dst == 0) {
656 		/* NBMA tunnel */
657 
658 		if (skb_dst(skb) == NULL) {
659 			dev->stats.tx_fifo_errors++;
660 			goto tx_error;
661 		}
662 
663 		if (skb->protocol == htons(ETH_P_IP)) {
664 			rt = skb_rtable(skb);
665 			dst = rt_nexthop(rt, inner_iph->daddr);
666 		}
667 #if IS_ENABLED(CONFIG_IPV6)
668 		else if (skb->protocol == htons(ETH_P_IPV6)) {
669 			const struct in6_addr *addr6;
670 			struct neighbour *neigh;
671 			bool do_tx_error_icmp;
672 			int addr_type;
673 
674 			neigh = dst_neigh_lookup(skb_dst(skb),
675 						 &ipv6_hdr(skb)->daddr);
676 			if (neigh == NULL)
677 				goto tx_error;
678 
679 			addr6 = (const struct in6_addr *)&neigh->primary_key;
680 			addr_type = ipv6_addr_type(addr6);
681 
682 			if (addr_type == IPV6_ADDR_ANY) {
683 				addr6 = &ipv6_hdr(skb)->daddr;
684 				addr_type = ipv6_addr_type(addr6);
685 			}
686 
687 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
688 				do_tx_error_icmp = true;
689 			else {
690 				do_tx_error_icmp = false;
691 				dst = addr6->s6_addr32[3];
692 			}
693 			neigh_release(neigh);
694 			if (do_tx_error_icmp)
695 				goto tx_error_icmp;
696 		}
697 #endif
698 		else
699 			goto tx_error;
700 
701 		connected = false;
702 	}
703 
704 	tos = tnl_params->tos;
705 	if (tos & 0x1) {
706 		tos &= ~0x1;
707 		if (skb->protocol == htons(ETH_P_IP)) {
708 			tos = inner_iph->tos;
709 			connected = false;
710 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
711 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
712 			connected = false;
713 		}
714 	}
715 
716 	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
717 			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
718 
719 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
720 		goto tx_error;
721 
722 	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
723 
724 	if (!rt) {
725 		rt = ip_route_output_key(tunnel->net, &fl4);
726 
727 		if (IS_ERR(rt)) {
728 			dev->stats.tx_carrier_errors++;
729 			goto tx_error;
730 		}
731 		if (connected)
732 			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
733 	}
734 
735 	if (rt->dst.dev == dev) {
736 		ip_rt_put(rt);
737 		dev->stats.collisions++;
738 		goto tx_error;
739 	}
740 
741 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
742 		ip_rt_put(rt);
743 		goto tx_error;
744 	}
745 
746 	if (tunnel->err_count > 0) {
747 		if (time_before(jiffies,
748 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
749 			tunnel->err_count--;
750 
751 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
752 			dst_link_failure(skb);
753 		} else
754 			tunnel->err_count = 0;
755 	}
756 
757 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
758 	ttl = tnl_params->ttl;
759 	if (ttl == 0) {
760 		if (skb->protocol == htons(ETH_P_IP))
761 			ttl = inner_iph->ttl;
762 #if IS_ENABLED(CONFIG_IPV6)
763 		else if (skb->protocol == htons(ETH_P_IPV6))
764 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
765 #endif
766 		else
767 			ttl = ip4_dst_hoplimit(&rt->dst);
768 	}
769 
770 	df = tnl_params->frag_off;
771 	if (skb->protocol == htons(ETH_P_IP))
772 		df |= (inner_iph->frag_off&htons(IP_DF));
773 
774 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
775 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
776 	if (max_headroom > dev->needed_headroom)
777 		dev->needed_headroom = max_headroom;
778 
779 	if (skb_cow_head(skb, dev->needed_headroom)) {
780 		ip_rt_put(rt);
781 		dev->stats.tx_dropped++;
782 		kfree_skb(skb);
783 		return;
784 	}
785 
786 	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
787 			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
788 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
789 
790 	return;
791 
792 #if IS_ENABLED(CONFIG_IPV6)
793 tx_error_icmp:
794 	dst_link_failure(skb);
795 #endif
796 tx_error:
797 	dev->stats.tx_errors++;
798 	kfree_skb(skb);
799 }
800 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
801 
802 static void ip_tunnel_update(struct ip_tunnel_net *itn,
803 			     struct ip_tunnel *t,
804 			     struct net_device *dev,
805 			     struct ip_tunnel_parm *p,
806 			     bool set_mtu)
807 {
808 	ip_tunnel_del(t);
809 	t->parms.iph.saddr = p->iph.saddr;
810 	t->parms.iph.daddr = p->iph.daddr;
811 	t->parms.i_key = p->i_key;
812 	t->parms.o_key = p->o_key;
813 	if (dev->type != ARPHRD_ETHER) {
814 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
815 		memcpy(dev->broadcast, &p->iph.daddr, 4);
816 	}
817 	ip_tunnel_add(itn, t);
818 
819 	t->parms.iph.ttl = p->iph.ttl;
820 	t->parms.iph.tos = p->iph.tos;
821 	t->parms.iph.frag_off = p->iph.frag_off;
822 
823 	if (t->parms.link != p->link) {
824 		int mtu;
825 
826 		t->parms.link = p->link;
827 		mtu = ip_tunnel_bind_dev(dev);
828 		if (set_mtu)
829 			dev->mtu = mtu;
830 	}
831 	ip_tunnel_dst_reset_all(t);
832 	netdev_state_change(dev);
833 }
834 
835 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
836 {
837 	int err = 0;
838 	struct ip_tunnel *t = netdev_priv(dev);
839 	struct net *net = t->net;
840 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
841 
842 	BUG_ON(!itn->fb_tunnel_dev);
843 	switch (cmd) {
844 	case SIOCGETTUNNEL:
845 		if (dev == itn->fb_tunnel_dev) {
846 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
847 			if (t == NULL)
848 				t = netdev_priv(dev);
849 		}
850 		memcpy(p, &t->parms, sizeof(*p));
851 		break;
852 
853 	case SIOCADDTUNNEL:
854 	case SIOCCHGTUNNEL:
855 		err = -EPERM;
856 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
857 			goto done;
858 		if (p->iph.ttl)
859 			p->iph.frag_off |= htons(IP_DF);
860 		if (!(p->i_flags & VTI_ISVTI)) {
861 			if (!(p->i_flags & TUNNEL_KEY))
862 				p->i_key = 0;
863 			if (!(p->o_flags & TUNNEL_KEY))
864 				p->o_key = 0;
865 		}
866 
867 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
868 
869 		if (cmd == SIOCADDTUNNEL) {
870 			if (!t) {
871 				t = ip_tunnel_create(net, itn, p);
872 				err = PTR_ERR_OR_ZERO(t);
873 				break;
874 			}
875 
876 			err = -EEXIST;
877 			break;
878 		}
879 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
880 			if (t != NULL) {
881 				if (t->dev != dev) {
882 					err = -EEXIST;
883 					break;
884 				}
885 			} else {
886 				unsigned int nflags = 0;
887 
888 				if (ipv4_is_multicast(p->iph.daddr))
889 					nflags = IFF_BROADCAST;
890 				else if (p->iph.daddr)
891 					nflags = IFF_POINTOPOINT;
892 
893 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
894 					err = -EINVAL;
895 					break;
896 				}
897 
898 				t = netdev_priv(dev);
899 			}
900 		}
901 
902 		if (t) {
903 			err = 0;
904 			ip_tunnel_update(itn, t, dev, p, true);
905 		} else {
906 			err = -ENOENT;
907 		}
908 		break;
909 
910 	case SIOCDELTUNNEL:
911 		err = -EPERM;
912 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
913 			goto done;
914 
915 		if (dev == itn->fb_tunnel_dev) {
916 			err = -ENOENT;
917 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
918 			if (t == NULL)
919 				goto done;
920 			err = -EPERM;
921 			if (t == netdev_priv(itn->fb_tunnel_dev))
922 				goto done;
923 			dev = t->dev;
924 		}
925 		unregister_netdevice(dev);
926 		err = 0;
927 		break;
928 
929 	default:
930 		err = -EINVAL;
931 	}
932 
933 done:
934 	return err;
935 }
936 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
937 
938 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
939 {
940 	struct ip_tunnel *tunnel = netdev_priv(dev);
941 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
942 
943 	if (new_mtu < 68 ||
944 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
945 		return -EINVAL;
946 	dev->mtu = new_mtu;
947 	return 0;
948 }
949 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
950 
951 static void ip_tunnel_dev_free(struct net_device *dev)
952 {
953 	struct ip_tunnel *tunnel = netdev_priv(dev);
954 
955 	gro_cells_destroy(&tunnel->gro_cells);
956 	free_percpu(tunnel->dst_cache);
957 	free_percpu(dev->tstats);
958 	free_netdev(dev);
959 }
960 
961 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
962 {
963 	struct ip_tunnel *tunnel = netdev_priv(dev);
964 	struct ip_tunnel_net *itn;
965 
966 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
967 
968 	if (itn->fb_tunnel_dev != dev) {
969 		ip_tunnel_del(netdev_priv(dev));
970 		unregister_netdevice_queue(dev, head);
971 	}
972 }
973 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
974 
975 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
976 {
977 	struct ip_tunnel *tunnel = netdev_priv(dev);
978 
979 	return tunnel->net;
980 }
981 EXPORT_SYMBOL(ip_tunnel_get_link_net);
982 
983 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
984 				  struct rtnl_link_ops *ops, char *devname)
985 {
986 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
987 	struct ip_tunnel_parm parms;
988 	unsigned int i;
989 
990 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
991 		INIT_HLIST_HEAD(&itn->tunnels[i]);
992 
993 	if (!ops) {
994 		itn->fb_tunnel_dev = NULL;
995 		return 0;
996 	}
997 
998 	memset(&parms, 0, sizeof(parms));
999 	if (devname)
1000 		strlcpy(parms.name, devname, IFNAMSIZ);
1001 
1002 	rtnl_lock();
1003 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1004 	/* FB netdevice is special: we have one, and only one per netns.
1005 	 * Allowing to move it to another netns is clearly unsafe.
1006 	 */
1007 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1008 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1009 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1010 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1011 	}
1012 	rtnl_unlock();
1013 
1014 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1015 }
1016 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1017 
1018 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1019 			      struct rtnl_link_ops *ops)
1020 {
1021 	struct net *net = dev_net(itn->fb_tunnel_dev);
1022 	struct net_device *dev, *aux;
1023 	int h;
1024 
1025 	for_each_netdev_safe(net, dev, aux)
1026 		if (dev->rtnl_link_ops == ops)
1027 			unregister_netdevice_queue(dev, head);
1028 
1029 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1030 		struct ip_tunnel *t;
1031 		struct hlist_node *n;
1032 		struct hlist_head *thead = &itn->tunnels[h];
1033 
1034 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1035 			/* If dev is in the same netns, it has already
1036 			 * been added to the list by the previous loop.
1037 			 */
1038 			if (!net_eq(dev_net(t->dev), net))
1039 				unregister_netdevice_queue(t->dev, head);
1040 	}
1041 }
1042 
1043 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1044 {
1045 	LIST_HEAD(list);
1046 
1047 	rtnl_lock();
1048 	ip_tunnel_destroy(itn, &list, ops);
1049 	unregister_netdevice_many(&list);
1050 	rtnl_unlock();
1051 }
1052 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1053 
1054 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1055 		      struct ip_tunnel_parm *p)
1056 {
1057 	struct ip_tunnel *nt;
1058 	struct net *net = dev_net(dev);
1059 	struct ip_tunnel_net *itn;
1060 	int mtu;
1061 	int err;
1062 
1063 	nt = netdev_priv(dev);
1064 	itn = net_generic(net, nt->ip_tnl_net_id);
1065 
1066 	if (ip_tunnel_find(itn, p, dev->type))
1067 		return -EEXIST;
1068 
1069 	nt->net = net;
1070 	nt->parms = *p;
1071 	err = register_netdevice(dev);
1072 	if (err)
1073 		goto out;
1074 
1075 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1076 		eth_hw_addr_random(dev);
1077 
1078 	mtu = ip_tunnel_bind_dev(dev);
1079 	if (!tb[IFLA_MTU])
1080 		dev->mtu = mtu;
1081 
1082 	ip_tunnel_add(itn, nt);
1083 
1084 out:
1085 	return err;
1086 }
1087 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1088 
1089 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1090 			 struct ip_tunnel_parm *p)
1091 {
1092 	struct ip_tunnel *t;
1093 	struct ip_tunnel *tunnel = netdev_priv(dev);
1094 	struct net *net = tunnel->net;
1095 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1096 
1097 	if (dev == itn->fb_tunnel_dev)
1098 		return -EINVAL;
1099 
1100 	t = ip_tunnel_find(itn, p, dev->type);
1101 
1102 	if (t) {
1103 		if (t->dev != dev)
1104 			return -EEXIST;
1105 	} else {
1106 		t = tunnel;
1107 
1108 		if (dev->type != ARPHRD_ETHER) {
1109 			unsigned int nflags = 0;
1110 
1111 			if (ipv4_is_multicast(p->iph.daddr))
1112 				nflags = IFF_BROADCAST;
1113 			else if (p->iph.daddr)
1114 				nflags = IFF_POINTOPOINT;
1115 
1116 			if ((dev->flags ^ nflags) &
1117 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1118 				return -EINVAL;
1119 		}
1120 	}
1121 
1122 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1123 	return 0;
1124 }
1125 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1126 
1127 int ip_tunnel_init(struct net_device *dev)
1128 {
1129 	struct ip_tunnel *tunnel = netdev_priv(dev);
1130 	struct iphdr *iph = &tunnel->parms.iph;
1131 	int err;
1132 
1133 	dev->destructor	= ip_tunnel_dev_free;
1134 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1135 	if (!dev->tstats)
1136 		return -ENOMEM;
1137 
1138 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1139 	if (!tunnel->dst_cache) {
1140 		free_percpu(dev->tstats);
1141 		return -ENOMEM;
1142 	}
1143 
1144 	err = gro_cells_init(&tunnel->gro_cells, dev);
1145 	if (err) {
1146 		free_percpu(tunnel->dst_cache);
1147 		free_percpu(dev->tstats);
1148 		return err;
1149 	}
1150 
1151 	tunnel->dev = dev;
1152 	tunnel->net = dev_net(dev);
1153 	strcpy(tunnel->parms.name, dev->name);
1154 	iph->version		= 4;
1155 	iph->ihl		= 5;
1156 
1157 	return 0;
1158 }
1159 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1160 
1161 void ip_tunnel_uninit(struct net_device *dev)
1162 {
1163 	struct ip_tunnel *tunnel = netdev_priv(dev);
1164 	struct net *net = tunnel->net;
1165 	struct ip_tunnel_net *itn;
1166 
1167 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1168 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1169 	if (itn->fb_tunnel_dev != dev)
1170 		ip_tunnel_del(netdev_priv(dev));
1171 
1172 	ip_tunnel_dst_reset_all(tunnel);
1173 }
1174 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1175 
1176 /* Do least required initialization, rest of init is done in tunnel_init call */
1177 void ip_tunnel_setup(struct net_device *dev, int net_id)
1178 {
1179 	struct ip_tunnel *tunnel = netdev_priv(dev);
1180 	tunnel->ip_tnl_net_id = net_id;
1181 }
1182 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1183 
1184 MODULE_LICENSE("GPL");
1185