/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
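
/* Usage note (illustrative, not part of the original file): the per-cpu
 * dst cache above is consumed by the transmit fast path and invalidated
 * whenever tunnel parameters change. A minimal sketch of the pattern,
 * assuming a connected tunnel 't' and a flow 'fl4' already initialized:
 *
 *	struct rtable *rt;
 *
 *	rt = tunnel_rtable_get(t, 0, &fl4.saddr);
 *	if (!rt) {
 *		rt = ip_route_output_key(t->net, &fl4);
 *		if (!IS_ERR(rt))
 *			tunnel_dst_set(t, &rt->dst, fl4.saddr);
 *	}
 *
 * On any reconfiguration (see ip_tunnel_update() below),
 * ip_tunnel_dst_reset_all() clears the cache on every possible CPU.
 */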

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
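
/* Match matrix for ip_tunnel_key_match() (a summary of the logic above):
 *
 *	tunnel i_flags	packet flags	result
 *	TUNNEL_KEY	TUNNEL_KEY	key == p->i_key
 *	TUNNEL_KEY	(no key)	false
 *	(no key)	TUNNEL_KEY	false
 *	(no key)	(no key)	true
 */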

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
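
/* Example of the precedence implemented above (illustrative only): for a
 * packet with outer saddr=S, daddr=D and key=K arriving on ifindex L, the
 * candidates are tried in this order:
 *
 *   1. tunnel with local=S, remote=D and a matching key
 *   2. tunnel with remote=D only (wildcard local) and a matching key
 *   3. tunnel with local=S only, or a multicast tunnel, and a matching key
 *   4. keyed tunnel with neither local nor remote set (skipped when the
 *      caller passes TUNNEL_NO_KEY)
 *
 * Within each step an exact t->parms.link == L match wins immediately;
 * otherwise the first hit is remembered as 'cand' and returned after all
 * steps. Failing everything, the collect_md tunnel or the fallback device
 * is used.
 */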

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
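
/* Sketch of a typical caller (illustrative; the real callers are the
 * protocol handlers, e.g. in ip_gre): the handler parses the outer header
 * into a tnl_ptk_info, looks the tunnel up and hands the skb to
 * ip_tunnel_rcv():
 *
 *	struct ip_tunnel *t;
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *			     iph->saddr, iph->daddr, tpi->key);
 *	if (t) {
 *		skb_pop_mac_header(skb);
 *		return ip_tunnel_rcv(t, skb, tpi, NULL, log_ecn_error);
 *	}
 */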

static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
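
/* Registration example (a sketch, not from this file): an encapsulation
 * module such as FOU provides ip_tunnel_encap_ops and claims one slot in
 * iptun_encaps[]. Assuming hypothetical fou_encap_hlen()/fou_build_header()
 * helpers:
 *
 *	static const struct ip_tunnel_encap_ops fou_iptun_ops = {
 *		.encap_hlen	= fou_encap_hlen,
 *		.build_header	= fou_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU);
 */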

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
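
/* Worked example of the DF-path arithmetic above (illustrative numbers):
 * with a 1500 byte route MTU on the underlay, dev->hard_header_len == 0
 * and a plain GRE tunnel (tunnel->hlen == 4), the inner packet limit is
 *
 *	mtu = 1500 - 0 - sizeof(struct iphdr) - 4 = 1476
 *
 * An inner IPv4 packet larger than that with IP_DF set is answered with
 * ICMP_FRAG_NEEDED and dropped; IPv6 gets ICMPV6_PKT_TOOBIG, subject to
 * the IPV6_MIN_MTU floor.
 */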

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
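
/* Sketch of a typical ndo_start_xmit wrapper (illustrative; names are
 * hypothetical, modelled on the ipip driver): the device driver reuses
 * the outer iphdr template from its parms and delegates to
 * ip_tunnel_xmit():
 *
 *	static netdev_tx_t xxx_tunnel_xmit(struct sk_buff *skb,
 *					   struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tiph = &tunnel->parms.iph;
 *
 *		skb_set_inner_ipproto(skb, IPPROTO_IPIP);
 *		ip_tunnel_xmit(skb, dev, tiph, tiph->protocol);
 *		return NETDEV_TX_OK;
 *	}
 */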

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
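
/* Plumbing sketch (illustrative; modelled on the existing tunnel drivers):
 * a driver's ndo_do_ioctl copies the ip_tunnel_parm from user space, lets
 * ip_tunnel_ioctl() do the work and copies the result back:
 *
 *	struct ip_tunnel_parm p;
 *	int err;
 *
 *	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 *		return -EFAULT;
 *	err = ip_tunnel_ioctl(dev, &p, cmd);
 *	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 *		return -EFAULT;
 *	return err;
 */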

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
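
/* Per-netns wiring sketch (illustrative; names are hypothetical, modelled
 * on the ipip module): each tunnel protocol registers pernet ops whose
 * init hook calls ip_tunnel_init_net() and whose exit hook calls
 * ip_tunnel_delete_net():
 *
 *	static int __net_init xxx_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, xxx_net_id,
 *					  &xxx_link_ops, "tunl0");
 *	}
 *
 *	static void __net_exit xxx_exit_net(struct net *net)
 *	{
 *		struct ip_tunnel_net *itn = net_generic(net, xxx_net_id);
 *
 *		ip_tunnel_delete_net(itn, &xxx_link_ops);
 *	}
 */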

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net exit path. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest of the init is done
 * in the tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");