xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision a8fe58ce)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

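/* The helpers below maintain a per-tunnel, per-CPU cache of the output
 * route. __tunnel_dst_set() publishes the new dst with xchg() so readers
 * never see a half-updated pointer, and releases whatever was cached
 * before.
 */
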
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

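/* tunnel_rtable_get() below is the read side of the dst cache: it takes
 * a reference on the cached entry under RCU, revalidates it against the
 * routing cookie, and resets the cache if the entry has gone stale.
 */
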
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

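/* Match matrix for ip_tunnel_key_match() above:
 *
 *	tunnel has TUNNEL_KEY	packet has TUNNEL_KEY	result
 *	yes			yes			keys must be equal
 *	yes			no			no match
 *	no			yes			no match
 *	no			no			match
 */
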
/* Fallback tunnel: no source, no destination, no key, no options.

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

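/* A minimal caller sketch (illustrative only, not part of this file):
 * a decap path such as GRE's receive handler resolves the tunnel for an
 * incoming packet roughly like this, assuming tpi was already parsed
 * from the outer header and tnl_net_id is the driver's pernet id.
 */
#if 0
static int example_rcv(struct net *net, struct sk_buff *skb,
		       const struct tnl_ptk_info *tpi, int tnl_net_id)
{
	struct ip_tunnel_net *itn = net_generic(net, tnl_net_id);
	const struct iphdr *iph = ip_hdr(skb);
	struct ip_tunnel *tunnel;

	/* remote = outer source, local = outer destination */
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);
	if (!tunnel)
		return -ENOENT;	/* no tunnel configured for this flow */

	return ip_tunnel_rcv(tunnel, skb, tpi, NULL, true);
}
#endif
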
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

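/* Worked example for ip_tunnel_bind_dev() above (assuming plain GRE over
 * a 1500-byte Ethernet underlay, no key/csum/seq options, and
 * dev->hard_header_len == 0): t_hlen = 4 (GRE) + 20 (outer IP) = 24, so
 * the returned tunnel MTU is 1500 - 24 = 1476, never less than the
 * 68-byte IPv4 minimum.
 */
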
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

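/* Note on the ECN handling above: IP_ECN_decapsulate() returns 0 when
 * the outer and inner ECN bits are compatible, 1 when the outer header
 * claims ECT but the inner packet is not ECN-capable (logged when
 * log_ecn_error is set), and >1 when the outer header carries CE that
 * cannot be propagated inward, in which case the packet is dropped.
 */
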
static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

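/* Registration sketch (illustrative; modeled on what an encapsulation
 * module such as FOU does, but the names below are examples, not
 * definitions from this file).
 */
#if 0
static int example_encap_hlen(struct ip_tunnel_encap *e)
{
	return sizeof(struct udphdr);	/* fixed-size outer UDP header */
}

static const struct ip_tunnel_encap_ops example_encap_ops = {
	.encap_hlen	= example_encap_hlen,
	/* .build_header would prepend the outer header at xmit time */
};

static int __init example_module_init(void)
{
	/* claims one slot in iptun_encaps[]; fails if already taken */
	return ip_tunnel_encap_add_ops(&example_encap_ops, TUNNEL_ENCAP_FOU);
}
#endif
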
int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

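/* For a DF packet the usable inner MTU computed above is
 * dst_mtu(outer route) - dev->hard_header_len - 20 (outer IP header)
 * - tunnel->hlen; anything larger triggers ICMP "fragmentation needed"
 * (IPv4) or "packet too big" (IPv6) back toward the sender.
 */
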
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

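/* The transmit path above, in order: resolve the outer destination
 * (NBMA tunnels derive it from the inner headers), fold in inherited
 * TOS, apply optional encapsulation, find or reuse a cached route,
 * enforce PMTU, pick TTL and DF, grow headroom if needed, then hand the
 * packet to iptunnel_xmit() to build the outer IP header.
 */
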
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

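/* The 0xFFF8 bound above is the maximum IPv4 total length (0xFFFF)
 * rounded down to an 8-byte fragment boundary: the new MTU plus link
 * and tunnel headers must still fit in a single IP datagram.
 */
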
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: there is one, and only one, per
	 * netns. Allowing it to be moved to another netns is clearly
	 * unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

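/* Pernet wiring sketch (illustrative; the names are hypothetical, but
 * the shape follows existing users such as ipip): each driver keeps a
 * pernet id and calls ip_tunnel_init_net()/ip_tunnel_delete_net() from
 * its pernet_operations.
 */
#if 0
static int __net_init example_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, example_net_id,
				  &example_link_ops, "tunl-example0");
}

static void __net_exit example_exit_net(struct net *net)
{
	struct ip_tunnel_net *itn = net_generic(net, example_net_id);

	ip_tunnel_delete_net(itn, &example_link_ops);
}

static struct pernet_operations example_net_ops = {
	.init = example_init_net,
	.exit = example_exit_net,
	.id   = &example_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
#endif
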
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

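/* ip_tunnel_newlink() and ip_tunnel_changelink() above are the common
 * backends that a driver points its rtnl_link_ops ->newlink and
 * ->changelink at, after translating netlink attributes into an
 * ip_tunnel_parm.
 */
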
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest of the init is done
 * in the tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");