xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 9b799b78)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44 
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 #include <net/udp.h>
59 
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65 
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68 	return hash_32((__force u32)key ^ (__force u32)remote,
69 			 IP_TNL_HASH_BITS);
70 }
71 
72 static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
73 			     struct dst_entry *dst, __be32 saddr)
74 {
75 	struct dst_entry *old_dst;
76 
77 	dst_clone(dst);
78 	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
79 	dst_release(old_dst);
80 	idst->saddr = saddr;
81 }
82 
83 static noinline void tunnel_dst_set(struct ip_tunnel *t,
84 			   struct dst_entry *dst, __be32 saddr)
85 {
86 	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
87 }
88 
89 static void tunnel_dst_reset(struct ip_tunnel *t)
90 {
91 	tunnel_dst_set(t, NULL, 0);
92 }
93 
94 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
95 {
96 	int i;
97 
98 	for_each_possible_cpu(i)
99 		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
100 }
101 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
102 
103 static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
104 					u32 cookie, __be32 *saddr)
105 {
106 	struct ip_tunnel_dst *idst;
107 	struct dst_entry *dst;
108 
109 	rcu_read_lock();
110 	idst = raw_cpu_ptr(t->dst_cache);
111 	dst = rcu_dereference(idst->dst);
112 	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
113 		dst = NULL;
114 	if (dst) {
115 		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
116 			*saddr = idst->saddr;
117 		} else {
118 			tunnel_dst_reset(t);
119 			dst_release(dst);
120 			dst = NULL;
121 		}
122 	}
123 	rcu_read_unlock();
124 	return (struct rtable *)dst;
125 }
126 
127 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
128 				__be16 flags, __be32 key)
129 {
130 	if (p->i_flags & TUNNEL_KEY) {
131 		if (flags & TUNNEL_KEY)
132 			return key == p->i_key;
133 		else
134 			/* key expected, none present */
135 			return false;
136 	} else
137 		return !(flags & TUNNEL_KEY);
138 }
139 
140 /* Fallback tunnel: no source, no destination, no key, no options
141 
142    Tunnel hash table:
143    We require exact key match i.e. if a key is present in packet
144    it will match only tunnel with the same key; if it is not present,
145    it will match only keyless tunnel.
146 
147    All keysless packets, if not matched configured keyless tunnels
148    will match fallback tunnel.
149    Given src, dst and key, find appropriate for input tunnel.
150 */
151 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
152 				   int link, __be16 flags,
153 				   __be32 remote, __be32 local,
154 				   __be32 key)
155 {
156 	unsigned int hash;
157 	struct ip_tunnel *t, *cand = NULL;
158 	struct hlist_head *head;
159 
160 	hash = ip_tunnel_hash(key, remote);
161 	head = &itn->tunnels[hash];
162 
163 	hlist_for_each_entry_rcu(t, head, hash_node) {
164 		if (local != t->parms.iph.saddr ||
165 		    remote != t->parms.iph.daddr ||
166 		    !(t->dev->flags & IFF_UP))
167 			continue;
168 
169 		if (!ip_tunnel_key_match(&t->parms, flags, key))
170 			continue;
171 
172 		if (t->parms.link == link)
173 			return t;
174 		else
175 			cand = t;
176 	}
177 
178 	hlist_for_each_entry_rcu(t, head, hash_node) {
179 		if (remote != t->parms.iph.daddr ||
180 		    t->parms.iph.saddr != 0 ||
181 		    !(t->dev->flags & IFF_UP))
182 			continue;
183 
184 		if (!ip_tunnel_key_match(&t->parms, flags, key))
185 			continue;
186 
187 		if (t->parms.link == link)
188 			return t;
189 		else if (!cand)
190 			cand = t;
191 	}
192 
193 	hash = ip_tunnel_hash(key, 0);
194 	head = &itn->tunnels[hash];
195 
196 	hlist_for_each_entry_rcu(t, head, hash_node) {
197 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
198 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
199 			continue;
200 
201 		if (!(t->dev->flags & IFF_UP))
202 			continue;
203 
204 		if (!ip_tunnel_key_match(&t->parms, flags, key))
205 			continue;
206 
207 		if (t->parms.link == link)
208 			return t;
209 		else if (!cand)
210 			cand = t;
211 	}
212 
213 	if (flags & TUNNEL_NO_KEY)
214 		goto skip_key_lookup;
215 
216 	hlist_for_each_entry_rcu(t, head, hash_node) {
217 		if (t->parms.i_key != key ||
218 		    t->parms.iph.saddr != 0 ||
219 		    t->parms.iph.daddr != 0 ||
220 		    !(t->dev->flags & IFF_UP))
221 			continue;
222 
223 		if (t->parms.link == link)
224 			return t;
225 		else if (!cand)
226 			cand = t;
227 	}
228 
229 skip_key_lookup:
230 	if (cand)
231 		return cand;
232 
233 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
234 		return netdev_priv(itn->fb_tunnel_dev);
235 
236 
237 	return NULL;
238 }
239 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
240 
241 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
242 				    struct ip_tunnel_parm *parms)
243 {
244 	unsigned int h;
245 	__be32 remote;
246 	__be32 i_key = parms->i_key;
247 
248 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
249 		remote = parms->iph.daddr;
250 	else
251 		remote = 0;
252 
253 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
254 		i_key = 0;
255 
256 	h = ip_tunnel_hash(i_key, remote);
257 	return &itn->tunnels[h];
258 }
259 
260 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
261 {
262 	struct hlist_head *head = ip_bucket(itn, &t->parms);
263 
264 	hlist_add_head_rcu(&t->hash_node, head);
265 }
266 
267 static void ip_tunnel_del(struct ip_tunnel *t)
268 {
269 	hlist_del_init_rcu(&t->hash_node);
270 }
271 
272 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
273 					struct ip_tunnel_parm *parms,
274 					int type)
275 {
276 	__be32 remote = parms->iph.daddr;
277 	__be32 local = parms->iph.saddr;
278 	__be32 key = parms->i_key;
279 	__be16 flags = parms->i_flags;
280 	int link = parms->link;
281 	struct ip_tunnel *t = NULL;
282 	struct hlist_head *head = ip_bucket(itn, parms);
283 
284 	hlist_for_each_entry_rcu(t, head, hash_node) {
285 		if (local == t->parms.iph.saddr &&
286 		    remote == t->parms.iph.daddr &&
287 		    link == t->parms.link &&
288 		    type == t->dev->type &&
289 		    ip_tunnel_key_match(&t->parms, flags, key))
290 			break;
291 	}
292 	return t;
293 }
294 
295 static struct net_device *__ip_tunnel_create(struct net *net,
296 					     const struct rtnl_link_ops *ops,
297 					     struct ip_tunnel_parm *parms)
298 {
299 	int err;
300 	struct ip_tunnel *tunnel;
301 	struct net_device *dev;
302 	char name[IFNAMSIZ];
303 
304 	if (parms->name[0])
305 		strlcpy(name, parms->name, IFNAMSIZ);
306 	else {
307 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
308 			err = -E2BIG;
309 			goto failed;
310 		}
311 		strlcpy(name, ops->kind, IFNAMSIZ);
312 		strncat(name, "%d", 2);
313 	}
314 
315 	ASSERT_RTNL();
316 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
317 	if (!dev) {
318 		err = -ENOMEM;
319 		goto failed;
320 	}
321 	dev_net_set(dev, net);
322 
323 	dev->rtnl_link_ops = ops;
324 
325 	tunnel = netdev_priv(dev);
326 	tunnel->parms = *parms;
327 	tunnel->net = net;
328 
329 	err = register_netdevice(dev);
330 	if (err)
331 		goto failed_free;
332 
333 	return dev;
334 
335 failed_free:
336 	free_netdev(dev);
337 failed:
338 	return ERR_PTR(err);
339 }
340 
341 static inline void init_tunnel_flow(struct flowi4 *fl4,
342 				    int proto,
343 				    __be32 daddr, __be32 saddr,
344 				    __be32 key, __u8 tos, int oif)
345 {
346 	memset(fl4, 0, sizeof(*fl4));
347 	fl4->flowi4_oif = oif;
348 	fl4->daddr = daddr;
349 	fl4->saddr = saddr;
350 	fl4->flowi4_tos = tos;
351 	fl4->flowi4_proto = proto;
352 	fl4->fl4_gre_key = key;
353 }
354 
355 static int ip_tunnel_bind_dev(struct net_device *dev)
356 {
357 	struct net_device *tdev = NULL;
358 	struct ip_tunnel *tunnel = netdev_priv(dev);
359 	const struct iphdr *iph;
360 	int hlen = LL_MAX_HEADER;
361 	int mtu = ETH_DATA_LEN;
362 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
363 
364 	iph = &tunnel->parms.iph;
365 
366 	/* Guess output device to choose reasonable mtu and needed_headroom */
367 	if (iph->daddr) {
368 		struct flowi4 fl4;
369 		struct rtable *rt;
370 
371 		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
372 				 iph->saddr, tunnel->parms.o_key,
373 				 RT_TOS(iph->tos), tunnel->parms.link);
374 		rt = ip_route_output_key(tunnel->net, &fl4);
375 
376 		if (!IS_ERR(rt)) {
377 			tdev = rt->dst.dev;
378 			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
379 			ip_rt_put(rt);
380 		}
381 		if (dev->type != ARPHRD_ETHER)
382 			dev->flags |= IFF_POINTOPOINT;
383 	}
384 
385 	if (!tdev && tunnel->parms.link)
386 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
387 
388 	if (tdev) {
389 		hlen = tdev->hard_header_len + tdev->needed_headroom;
390 		mtu = tdev->mtu;
391 	}
392 
393 	dev->needed_headroom = t_hlen + hlen;
394 	mtu -= (dev->hard_header_len + t_hlen);
395 
396 	if (mtu < 68)
397 		mtu = 68;
398 
399 	return mtu;
400 }
401 
402 static struct ip_tunnel *ip_tunnel_create(struct net *net,
403 					  struct ip_tunnel_net *itn,
404 					  struct ip_tunnel_parm *parms)
405 {
406 	struct ip_tunnel *nt;
407 	struct net_device *dev;
408 
409 	BUG_ON(!itn->fb_tunnel_dev);
410 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
411 	if (IS_ERR(dev))
412 		return ERR_CAST(dev);
413 
414 	dev->mtu = ip_tunnel_bind_dev(dev);
415 
416 	nt = netdev_priv(dev);
417 	ip_tunnel_add(itn, nt);
418 	return nt;
419 }
420 
421 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
422 		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
423 {
424 	struct pcpu_sw_netstats *tstats;
425 	const struct iphdr *iph = ip_hdr(skb);
426 	int err;
427 
428 #ifdef CONFIG_NET_IPGRE_BROADCAST
429 	if (ipv4_is_multicast(iph->daddr)) {
430 		tunnel->dev->stats.multicast++;
431 		skb->pkt_type = PACKET_BROADCAST;
432 	}
433 #endif
434 
435 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
436 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
437 		tunnel->dev->stats.rx_crc_errors++;
438 		tunnel->dev->stats.rx_errors++;
439 		goto drop;
440 	}
441 
442 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
443 		if (!(tpi->flags&TUNNEL_SEQ) ||
444 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
445 			tunnel->dev->stats.rx_fifo_errors++;
446 			tunnel->dev->stats.rx_errors++;
447 			goto drop;
448 		}
449 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
450 	}
451 
452 	skb_reset_network_header(skb);
453 
454 	err = IP_ECN_decapsulate(iph, skb);
455 	if (unlikely(err)) {
456 		if (log_ecn_error)
457 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
458 					&iph->saddr, iph->tos);
459 		if (err > 1) {
460 			++tunnel->dev->stats.rx_frame_errors;
461 			++tunnel->dev->stats.rx_errors;
462 			goto drop;
463 		}
464 	}
465 
466 	tstats = this_cpu_ptr(tunnel->dev->tstats);
467 	u64_stats_update_begin(&tstats->syncp);
468 	tstats->rx_packets++;
469 	tstats->rx_bytes += skb->len;
470 	u64_stats_update_end(&tstats->syncp);
471 
472 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
473 
474 	if (tunnel->dev->type == ARPHRD_ETHER) {
475 		skb->protocol = eth_type_trans(skb, tunnel->dev);
476 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
477 	} else {
478 		skb->dev = tunnel->dev;
479 	}
480 
481 	gro_cells_receive(&tunnel->gro_cells, skb);
482 	return 0;
483 
484 drop:
485 	kfree_skb(skb);
486 	return 0;
487 }
488 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
489 
490 static int ip_encap_hlen(struct ip_tunnel_encap *e)
491 {
492 	const struct ip_tunnel_encap_ops *ops;
493 	int hlen = -EINVAL;
494 
495 	if (e->type == TUNNEL_ENCAP_NONE)
496 		return 0;
497 
498 	if (e->type >= MAX_IPTUN_ENCAP_OPS)
499 		return -EINVAL;
500 
501 	rcu_read_lock();
502 	ops = rcu_dereference(iptun_encaps[e->type]);
503 	if (likely(ops && ops->encap_hlen))
504 		hlen = ops->encap_hlen(e);
505 	rcu_read_unlock();
506 
507 	return hlen;
508 }
509 
510 const struct ip_tunnel_encap_ops __rcu *
511 		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
512 
513 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
514 			    unsigned int num)
515 {
516 	if (num >= MAX_IPTUN_ENCAP_OPS)
517 		return -ERANGE;
518 
519 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
520 			&iptun_encaps[num],
521 			NULL, ops) ? 0 : -1;
522 }
523 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
524 
525 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
526 			    unsigned int num)
527 {
528 	int ret;
529 
530 	if (num >= MAX_IPTUN_ENCAP_OPS)
531 		return -ERANGE;
532 
533 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
534 		       &iptun_encaps[num],
535 		       ops, NULL) == ops) ? 0 : -1;
536 
537 	synchronize_net();
538 
539 	return ret;
540 }
541 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
542 
543 int ip_tunnel_encap_setup(struct ip_tunnel *t,
544 			  struct ip_tunnel_encap *ipencap)
545 {
546 	int hlen;
547 
548 	memset(&t->encap, 0, sizeof(t->encap));
549 
550 	hlen = ip_encap_hlen(ipencap);
551 	if (hlen < 0)
552 		return hlen;
553 
554 	t->encap.type = ipencap->type;
555 	t->encap.sport = ipencap->sport;
556 	t->encap.dport = ipencap->dport;
557 	t->encap.flags = ipencap->flags;
558 
559 	t->encap_hlen = hlen;
560 	t->hlen = t->encap_hlen + t->tun_hlen;
561 
562 	return 0;
563 }
564 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
565 
566 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
567 		    u8 *protocol, struct flowi4 *fl4)
568 {
569 	const struct ip_tunnel_encap_ops *ops;
570 	int ret = -EINVAL;
571 
572 	if (t->encap.type == TUNNEL_ENCAP_NONE)
573 		return 0;
574 
575 	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
576 		return -EINVAL;
577 
578 	rcu_read_lock();
579 	ops = rcu_dereference(iptun_encaps[t->encap.type]);
580 	if (likely(ops && ops->build_header))
581 		ret = ops->build_header(skb, &t->encap, protocol, fl4);
582 	rcu_read_unlock();
583 
584 	return ret;
585 }
586 EXPORT_SYMBOL(ip_tunnel_encap);
587 
588 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
589 			    struct rtable *rt, __be16 df)
590 {
591 	struct ip_tunnel *tunnel = netdev_priv(dev);
592 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
593 	int mtu;
594 
595 	if (df)
596 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
597 					- sizeof(struct iphdr) - tunnel->hlen;
598 	else
599 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
600 
601 	if (skb_dst(skb))
602 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
603 
604 	if (skb->protocol == htons(ETH_P_IP)) {
605 		if (!skb_is_gso(skb) &&
606 		    (df & htons(IP_DF)) && mtu < pkt_size) {
607 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
608 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
609 			return -E2BIG;
610 		}
611 	}
612 #if IS_ENABLED(CONFIG_IPV6)
613 	else if (skb->protocol == htons(ETH_P_IPV6)) {
614 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
615 
616 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
617 			   mtu >= IPV6_MIN_MTU) {
618 			if ((tunnel->parms.iph.daddr &&
619 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
620 			    rt6->rt6i_dst.plen == 128) {
621 				rt6->rt6i_flags |= RTF_MODIFIED;
622 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
623 			}
624 		}
625 
626 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
627 					mtu < pkt_size) {
628 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
629 			return -E2BIG;
630 		}
631 	}
632 #endif
633 	return 0;
634 }
635 
636 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
637 		    const struct iphdr *tnl_params, u8 protocol)
638 {
639 	struct ip_tunnel *tunnel = netdev_priv(dev);
640 	const struct iphdr *inner_iph;
641 	struct flowi4 fl4;
642 	u8     tos, ttl;
643 	__be16 df;
644 	struct rtable *rt;		/* Route to the other host */
645 	unsigned int max_headroom;	/* The extra header space needed */
646 	__be32 dst;
647 	int err;
648 	bool connected;
649 
650 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
651 	connected = (tunnel->parms.iph.daddr != 0);
652 
653 	dst = tnl_params->daddr;
654 	if (dst == 0) {
655 		/* NBMA tunnel */
656 
657 		if (!skb_dst(skb)) {
658 			dev->stats.tx_fifo_errors++;
659 			goto tx_error;
660 		}
661 
662 		if (skb->protocol == htons(ETH_P_IP)) {
663 			rt = skb_rtable(skb);
664 			dst = rt_nexthop(rt, inner_iph->daddr);
665 		}
666 #if IS_ENABLED(CONFIG_IPV6)
667 		else if (skb->protocol == htons(ETH_P_IPV6)) {
668 			const struct in6_addr *addr6;
669 			struct neighbour *neigh;
670 			bool do_tx_error_icmp;
671 			int addr_type;
672 
673 			neigh = dst_neigh_lookup(skb_dst(skb),
674 						 &ipv6_hdr(skb)->daddr);
675 			if (!neigh)
676 				goto tx_error;
677 
678 			addr6 = (const struct in6_addr *)&neigh->primary_key;
679 			addr_type = ipv6_addr_type(addr6);
680 
681 			if (addr_type == IPV6_ADDR_ANY) {
682 				addr6 = &ipv6_hdr(skb)->daddr;
683 				addr_type = ipv6_addr_type(addr6);
684 			}
685 
686 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
687 				do_tx_error_icmp = true;
688 			else {
689 				do_tx_error_icmp = false;
690 				dst = addr6->s6_addr32[3];
691 			}
692 			neigh_release(neigh);
693 			if (do_tx_error_icmp)
694 				goto tx_error_icmp;
695 		}
696 #endif
697 		else
698 			goto tx_error;
699 
700 		connected = false;
701 	}
702 
703 	tos = tnl_params->tos;
704 	if (tos & 0x1) {
705 		tos &= ~0x1;
706 		if (skb->protocol == htons(ETH_P_IP)) {
707 			tos = inner_iph->tos;
708 			connected = false;
709 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
710 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
711 			connected = false;
712 		}
713 	}
714 
715 	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
716 			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
717 
718 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
719 		goto tx_error;
720 
721 	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
722 
723 	if (!rt) {
724 		rt = ip_route_output_key(tunnel->net, &fl4);
725 
726 		if (IS_ERR(rt)) {
727 			dev->stats.tx_carrier_errors++;
728 			goto tx_error;
729 		}
730 		if (connected)
731 			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
732 	}
733 
734 	if (rt->dst.dev == dev) {
735 		ip_rt_put(rt);
736 		dev->stats.collisions++;
737 		goto tx_error;
738 	}
739 
740 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
741 		ip_rt_put(rt);
742 		goto tx_error;
743 	}
744 
745 	if (tunnel->err_count > 0) {
746 		if (time_before(jiffies,
747 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
748 			tunnel->err_count--;
749 
750 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
751 			dst_link_failure(skb);
752 		} else
753 			tunnel->err_count = 0;
754 	}
755 
756 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
757 	ttl = tnl_params->ttl;
758 	if (ttl == 0) {
759 		if (skb->protocol == htons(ETH_P_IP))
760 			ttl = inner_iph->ttl;
761 #if IS_ENABLED(CONFIG_IPV6)
762 		else if (skb->protocol == htons(ETH_P_IPV6))
763 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
764 #endif
765 		else
766 			ttl = ip4_dst_hoplimit(&rt->dst);
767 	}
768 
769 	df = tnl_params->frag_off;
770 	if (skb->protocol == htons(ETH_P_IP))
771 		df |= (inner_iph->frag_off&htons(IP_DF));
772 
773 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
774 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
775 	if (max_headroom > dev->needed_headroom)
776 		dev->needed_headroom = max_headroom;
777 
778 	if (skb_cow_head(skb, dev->needed_headroom)) {
779 		ip_rt_put(rt);
780 		dev->stats.tx_dropped++;
781 		kfree_skb(skb);
782 		return;
783 	}
784 
785 	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
786 			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
787 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
788 
789 	return;
790 
791 #if IS_ENABLED(CONFIG_IPV6)
792 tx_error_icmp:
793 	dst_link_failure(skb);
794 #endif
795 tx_error:
796 	dev->stats.tx_errors++;
797 	kfree_skb(skb);
798 }
799 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
800 
801 static void ip_tunnel_update(struct ip_tunnel_net *itn,
802 			     struct ip_tunnel *t,
803 			     struct net_device *dev,
804 			     struct ip_tunnel_parm *p,
805 			     bool set_mtu)
806 {
807 	ip_tunnel_del(t);
808 	t->parms.iph.saddr = p->iph.saddr;
809 	t->parms.iph.daddr = p->iph.daddr;
810 	t->parms.i_key = p->i_key;
811 	t->parms.o_key = p->o_key;
812 	if (dev->type != ARPHRD_ETHER) {
813 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
814 		memcpy(dev->broadcast, &p->iph.daddr, 4);
815 	}
816 	ip_tunnel_add(itn, t);
817 
818 	t->parms.iph.ttl = p->iph.ttl;
819 	t->parms.iph.tos = p->iph.tos;
820 	t->parms.iph.frag_off = p->iph.frag_off;
821 
822 	if (t->parms.link != p->link) {
823 		int mtu;
824 
825 		t->parms.link = p->link;
826 		mtu = ip_tunnel_bind_dev(dev);
827 		if (set_mtu)
828 			dev->mtu = mtu;
829 	}
830 	ip_tunnel_dst_reset_all(t);
831 	netdev_state_change(dev);
832 }
833 
834 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
835 {
836 	int err = 0;
837 	struct ip_tunnel *t = netdev_priv(dev);
838 	struct net *net = t->net;
839 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
840 
841 	BUG_ON(!itn->fb_tunnel_dev);
842 	switch (cmd) {
843 	case SIOCGETTUNNEL:
844 		if (dev == itn->fb_tunnel_dev) {
845 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
846 			if (!t)
847 				t = netdev_priv(dev);
848 		}
849 		memcpy(p, &t->parms, sizeof(*p));
850 		break;
851 
852 	case SIOCADDTUNNEL:
853 	case SIOCCHGTUNNEL:
854 		err = -EPERM;
855 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
856 			goto done;
857 		if (p->iph.ttl)
858 			p->iph.frag_off |= htons(IP_DF);
859 		if (!(p->i_flags & VTI_ISVTI)) {
860 			if (!(p->i_flags & TUNNEL_KEY))
861 				p->i_key = 0;
862 			if (!(p->o_flags & TUNNEL_KEY))
863 				p->o_key = 0;
864 		}
865 
866 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
867 
868 		if (cmd == SIOCADDTUNNEL) {
869 			if (!t) {
870 				t = ip_tunnel_create(net, itn, p);
871 				err = PTR_ERR_OR_ZERO(t);
872 				break;
873 			}
874 
875 			err = -EEXIST;
876 			break;
877 		}
878 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
879 			if (t) {
880 				if (t->dev != dev) {
881 					err = -EEXIST;
882 					break;
883 				}
884 			} else {
885 				unsigned int nflags = 0;
886 
887 				if (ipv4_is_multicast(p->iph.daddr))
888 					nflags = IFF_BROADCAST;
889 				else if (p->iph.daddr)
890 					nflags = IFF_POINTOPOINT;
891 
892 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
893 					err = -EINVAL;
894 					break;
895 				}
896 
897 				t = netdev_priv(dev);
898 			}
899 		}
900 
901 		if (t) {
902 			err = 0;
903 			ip_tunnel_update(itn, t, dev, p, true);
904 		} else {
905 			err = -ENOENT;
906 		}
907 		break;
908 
909 	case SIOCDELTUNNEL:
910 		err = -EPERM;
911 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
912 			goto done;
913 
914 		if (dev == itn->fb_tunnel_dev) {
915 			err = -ENOENT;
916 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
917 			if (!t)
918 				goto done;
919 			err = -EPERM;
920 			if (t == netdev_priv(itn->fb_tunnel_dev))
921 				goto done;
922 			dev = t->dev;
923 		}
924 		unregister_netdevice(dev);
925 		err = 0;
926 		break;
927 
928 	default:
929 		err = -EINVAL;
930 	}
931 
932 done:
933 	return err;
934 }
935 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
936 
937 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
938 {
939 	struct ip_tunnel *tunnel = netdev_priv(dev);
940 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
941 
942 	if (new_mtu < 68 ||
943 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
944 		return -EINVAL;
945 	dev->mtu = new_mtu;
946 	return 0;
947 }
948 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
949 
950 static void ip_tunnel_dev_free(struct net_device *dev)
951 {
952 	struct ip_tunnel *tunnel = netdev_priv(dev);
953 
954 	gro_cells_destroy(&tunnel->gro_cells);
955 	free_percpu(tunnel->dst_cache);
956 	free_percpu(dev->tstats);
957 	free_netdev(dev);
958 }
959 
960 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
961 {
962 	struct ip_tunnel *tunnel = netdev_priv(dev);
963 	struct ip_tunnel_net *itn;
964 
965 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
966 
967 	if (itn->fb_tunnel_dev != dev) {
968 		ip_tunnel_del(netdev_priv(dev));
969 		unregister_netdevice_queue(dev, head);
970 	}
971 }
972 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
973 
974 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
975 {
976 	struct ip_tunnel *tunnel = netdev_priv(dev);
977 
978 	return tunnel->net;
979 }
980 EXPORT_SYMBOL(ip_tunnel_get_link_net);
981 
982 int ip_tunnel_get_iflink(const struct net_device *dev)
983 {
984 	struct ip_tunnel *tunnel = netdev_priv(dev);
985 
986 	return tunnel->parms.link;
987 }
988 EXPORT_SYMBOL(ip_tunnel_get_iflink);
989 
990 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
991 				  struct rtnl_link_ops *ops, char *devname)
992 {
993 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
994 	struct ip_tunnel_parm parms;
995 	unsigned int i;
996 
997 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
998 		INIT_HLIST_HEAD(&itn->tunnels[i]);
999 
1000 	if (!ops) {
1001 		itn->fb_tunnel_dev = NULL;
1002 		return 0;
1003 	}
1004 
1005 	memset(&parms, 0, sizeof(parms));
1006 	if (devname)
1007 		strlcpy(parms.name, devname, IFNAMSIZ);
1008 
1009 	rtnl_lock();
1010 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1011 	/* FB netdevice is special: we have one, and only one per netns.
1012 	 * Allowing to move it to another netns is clearly unsafe.
1013 	 */
1014 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1015 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1016 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1017 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1018 	}
1019 	rtnl_unlock();
1020 
1021 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1022 }
1023 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1024 
1025 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1026 			      struct rtnl_link_ops *ops)
1027 {
1028 	struct net *net = dev_net(itn->fb_tunnel_dev);
1029 	struct net_device *dev, *aux;
1030 	int h;
1031 
1032 	for_each_netdev_safe(net, dev, aux)
1033 		if (dev->rtnl_link_ops == ops)
1034 			unregister_netdevice_queue(dev, head);
1035 
1036 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1037 		struct ip_tunnel *t;
1038 		struct hlist_node *n;
1039 		struct hlist_head *thead = &itn->tunnels[h];
1040 
1041 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1042 			/* If dev is in the same netns, it has already
1043 			 * been added to the list by the previous loop.
1044 			 */
1045 			if (!net_eq(dev_net(t->dev), net))
1046 				unregister_netdevice_queue(t->dev, head);
1047 	}
1048 }
1049 
1050 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1051 {
1052 	LIST_HEAD(list);
1053 
1054 	rtnl_lock();
1055 	ip_tunnel_destroy(itn, &list, ops);
1056 	unregister_netdevice_many(&list);
1057 	rtnl_unlock();
1058 }
1059 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1060 
1061 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1062 		      struct ip_tunnel_parm *p)
1063 {
1064 	struct ip_tunnel *nt;
1065 	struct net *net = dev_net(dev);
1066 	struct ip_tunnel_net *itn;
1067 	int mtu;
1068 	int err;
1069 
1070 	nt = netdev_priv(dev);
1071 	itn = net_generic(net, nt->ip_tnl_net_id);
1072 
1073 	if (ip_tunnel_find(itn, p, dev->type))
1074 		return -EEXIST;
1075 
1076 	nt->net = net;
1077 	nt->parms = *p;
1078 	err = register_netdevice(dev);
1079 	if (err)
1080 		goto out;
1081 
1082 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1083 		eth_hw_addr_random(dev);
1084 
1085 	mtu = ip_tunnel_bind_dev(dev);
1086 	if (!tb[IFLA_MTU])
1087 		dev->mtu = mtu;
1088 
1089 	ip_tunnel_add(itn, nt);
1090 
1091 out:
1092 	return err;
1093 }
1094 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1095 
1096 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1097 			 struct ip_tunnel_parm *p)
1098 {
1099 	struct ip_tunnel *t;
1100 	struct ip_tunnel *tunnel = netdev_priv(dev);
1101 	struct net *net = tunnel->net;
1102 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1103 
1104 	if (dev == itn->fb_tunnel_dev)
1105 		return -EINVAL;
1106 
1107 	t = ip_tunnel_find(itn, p, dev->type);
1108 
1109 	if (t) {
1110 		if (t->dev != dev)
1111 			return -EEXIST;
1112 	} else {
1113 		t = tunnel;
1114 
1115 		if (dev->type != ARPHRD_ETHER) {
1116 			unsigned int nflags = 0;
1117 
1118 			if (ipv4_is_multicast(p->iph.daddr))
1119 				nflags = IFF_BROADCAST;
1120 			else if (p->iph.daddr)
1121 				nflags = IFF_POINTOPOINT;
1122 
1123 			if ((dev->flags ^ nflags) &
1124 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1125 				return -EINVAL;
1126 		}
1127 	}
1128 
1129 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1130 	return 0;
1131 }
1132 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1133 
1134 int ip_tunnel_init(struct net_device *dev)
1135 {
1136 	struct ip_tunnel *tunnel = netdev_priv(dev);
1137 	struct iphdr *iph = &tunnel->parms.iph;
1138 	int err;
1139 
1140 	dev->destructor	= ip_tunnel_dev_free;
1141 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1142 	if (!dev->tstats)
1143 		return -ENOMEM;
1144 
1145 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1146 	if (!tunnel->dst_cache) {
1147 		free_percpu(dev->tstats);
1148 		return -ENOMEM;
1149 	}
1150 
1151 	err = gro_cells_init(&tunnel->gro_cells, dev);
1152 	if (err) {
1153 		free_percpu(tunnel->dst_cache);
1154 		free_percpu(dev->tstats);
1155 		return err;
1156 	}
1157 
1158 	tunnel->dev = dev;
1159 	tunnel->net = dev_net(dev);
1160 	strcpy(tunnel->parms.name, dev->name);
1161 	iph->version		= 4;
1162 	iph->ihl		= 5;
1163 
1164 	return 0;
1165 }
1166 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1167 
1168 void ip_tunnel_uninit(struct net_device *dev)
1169 {
1170 	struct ip_tunnel *tunnel = netdev_priv(dev);
1171 	struct net *net = tunnel->net;
1172 	struct ip_tunnel_net *itn;
1173 
1174 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1175 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1176 	if (itn->fb_tunnel_dev != dev)
1177 		ip_tunnel_del(netdev_priv(dev));
1178 
1179 	ip_tunnel_dst_reset_all(tunnel);
1180 }
1181 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1182 
1183 /* Do least required initialization, rest of init is done in tunnel_init call */
1184 void ip_tunnel_setup(struct net_device *dev, int net_id)
1185 {
1186 	struct ip_tunnel *tunnel = netdev_priv(dev);
1187 	tunnel->ip_tnl_net_id = net_id;
1188 }
1189 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1190 
1191 MODULE_LICENSE("GPL");
1192