/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
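
/*
 * Illustrative sketch (not part of the upstream file): for a tunnel
 * configured with i_flags = TUNNEL_KEY and i_key = htonl(42), the
 * rules above give:
 *
 *	ip_tunnel_key_match(&p, TUNNEL_KEY, htonl(42));	-> true
 *	ip_tunnel_key_match(&p, TUNNEL_KEY, htonl(7));	-> false: wrong key
 *	ip_tunnel_key_match(&p, 0, 0);			-> false: key expected
 *
 * A keyless tunnel (no TUNNEL_KEY in i_flags) matches only packets
 * that carry no key.
 */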

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for an input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
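
/*
 * Receive-path sketch (hedged; modeled on how the GRE driver calls this
 * helper, details differ per protocol): after parsing the outer headers
 * into a struct tnl_ptk_info, a handler resolves the target device with:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	struct ip_tunnel *tunnel;
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *	if (tunnel)
 *		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 *
 * Note the swap: the packet's source address is the tunnel's remote
 * endpoint, so it is passed as "remote".
 */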

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
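
/*
 * Registration sketch (hedged; modeled on the FOU module, names here are
 * illustrative): an encapsulation provider fills an ops table and claims
 * one TUNNEL_ENCAP_* slot at module init:
 *
 *	static const struct ip_tunnel_encap_ops example_encap_ops = {
 *		.encap_hlen	= example_encap_hlen,
 *		.build_header	= example_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&example_encap_ops, TUNNEL_ENCAP_FOU);
 *
 * ip_encap_hlen() and ip_tunnel_encap() then dispatch to these hooks for
 * every tunnel whose encap.type matches the registered slot.
 */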

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
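
/*
 * Transmit-path sketch (hedged; modeled on the GRE driver, details vary
 * per protocol): a driver's ndo_start_xmit pushes its own tunnel header
 * and then hands the skb here together with the outer IP template held
 * in its parms:
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tnl_params = &tunnel->parms.iph;
 *
 *		... build the protocol-specific tunnel header ...
 *
 *		ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
 *		return NETDEV_TX_OK;
 *	}
 */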

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
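
/*
 * Userspace sketch (hedged; this mirrors what iproute2-style tools do,
 * the exact invocation may differ): the tunnel ioctls are issued on an
 * AF_INET socket, with the ip_tunnel_parm passed through ifr_data:
 *
 *	struct ip_tunnel_parm p = { };
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "gre0", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	if (ioctl(fd, SIOCGETTUNNEL, &ifr) == 0)
 *		printf("local %x remote %x\n",
 *		       ntohl(p.iph.saddr), ntohl(p.iph.daddr));
 *
 * SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL additionally require
 * CAP_NET_ADMIN, as enforced above.
 */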

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < 68)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
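
/*
 * Worked example (illustrative; assumes a GRE tunnel with a 4-byte key,
 * so tun_hlen = 8, t_hlen = 8 + 20 = 28, and hard_header_len = 0):
 * max_mtu = 0xFFF8 - 0 - 28 = 65500. 0xFFF8 (65528) is the largest
 * 8-byte-aligned size that still fits the 16-bit IP total-length field,
 * which keeps fragments of a maximally sized packet well formed.
 */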

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: there is one, and only one, per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
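
/*
 * Wiring sketch (hedged; modeled on the ipip/GRE pernet setup): a tunnel
 * module reserves a net-generic slot for its ip_tunnel_net and calls this
 * helper from its pernet init hook:
 *
 *	static int __net_init example_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, example_net_id,
 *					  &example_link_ops, "tunl0");
 *	}
 *
 * With a NULL devname the fallback device is named after ops->kind
 * ("%d" appended); a NULL ops suppresses the fallback device entirely,
 * as the !ops branch above shows.
 */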

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
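
/*
 * Newlink sketch (hedged; modeled on GRE's rtnl_link_ops glue, the parser
 * below is hypothetical): a driver's ->newlink callback translates netlink
 * attributes into an ip_tunnel_parm and delegates the rest to this helper:
 *
 *	static int example_newlink(struct net *src_net, struct net_device *dev,
 *				   struct nlattr *tb[], struct nlattr *data[])
 *	{
 *		struct ip_tunnel_parm p;
 *
 *		example_netlink_parms(data, &p);
 *		return ip_tunnel_newlink(dev, tb, &p);
 *	}
 */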

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest of init is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
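
/*
 * Setup sketch (hedged; modeled on the GRE driver): the rtnl_link_ops
 * .setup callback wires up the driver's netdev_ops and records the
 * pernet id so the generic code can find its ip_tunnel_net later:
 *
 *	static void example_tunnel_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &example_netdev_ops;
 *		ip_tunnel_setup(dev, example_net_id);
 *	}
 */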

MODULE_LICENSE("GPL");