/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

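/* Hash tunnels on the (key, remote address) pair; this indexes the
 * per-netns hash table used by ip_tunnel_lookup() and ip_bucket().
 */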
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

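/* Atomically install a new cached output route in one per-CPU slot,
 * dropping the reference held on the route it replaces.
 */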
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static noinline void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

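/* Fetch this CPU's cached route, taking a reference if the entry is
 * still live and valid; a stale or dying route resets the cache slot
 * and returns NULL so the caller falls back to a full route lookup.
 */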
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = raw_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

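/* A packet matches a tunnel's key configuration only when both sides
 * agree: a keyed packet against a keyed tunnel (with equal keys), or a
 * keyless packet against a keyless tunnel.
 */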
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matching a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

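/* Allocate and register a new tunnel netdevice.  If no name was given,
 * derive one from the rtnl_link_ops kind plus a "%d" template that the
 * core expands to a unique index at registration time.
 */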
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

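/* Probe the route to the tunnel endpoint to find the underlying device,
 * size needed_headroom accordingly, and return the usable MTU (never
 * below the IPv4 minimum of 68).
 */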
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

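/* Common receive path: validate checksum and sequence-number flags
 * against the tunnel configuration, decapsulate ECN, update per-CPU
 * stats, and hand the inner packet to the stack via GRO cells.
 */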
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

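/* Ask the registered encapsulation provider how much extra header room
 * its encapsulation needs: 0 for TUNNEL_ENCAP_NONE, -EINVAL when no
 * provider is registered for the type.
 */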
static int ip_encap_hlen(struct ip_tunnel_encap *e)
{
	const struct ip_tunnel_encap_ops *ops;
	int hlen = -EINVAL;

	if (e->type == TUNNEL_ENCAP_NONE)
		return 0;

	if (e->type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[e->type]);
	if (likely(ops && ops->encap_hlen))
		hlen = ops->encap_hlen(e);
	rcu_read_unlock();

	return hlen;
}

const struct ip_tunnel_encap_ops __rcu *
		iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
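
/* Example: an encapsulation provider (e.g. FOU) registers its ops for a
 * TUNNEL_ENCAP_* type at module init and removes them on exit.  The
 * "my_*" names below are illustrative, not from this file:
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 */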

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
		    u8 *protocol, struct flowi4 *fl4)
{
	const struct ip_tunnel_encap_ops *ops;
	int ret = -EINVAL;

	if (t->encap.type == TUNNEL_ENCAP_NONE)
		return 0;

	if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
		return -EINVAL;

	rcu_read_lock();
	ops = rcu_dereference(iptun_encaps[t->encap.type]);
	if (likely(ops && ops->build_header))
		ret = ops->build_header(skb, &t->encap, protocol, fl4);
	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap);

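/* Enforce path MTU on the inner packet: propagate the tunnel route's
 * MTU to the inner dst and emit ICMP "fragmentation needed" (or the
 * ICMPv6 "packet too big" equivalent) when an oversized packet cannot
 * be forwarded.
 */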
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

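/* Common transmit path: resolve the outer destination (including the
 * NBMA case where it comes from the inner headers), pick TOS/TTL/DF,
 * apply optional encapsulation, route the outer packet (using the
 * per-CPU dst cache for connected tunnels), enforce PMTU, and finally
 * push the outer IP header via iptunnel_xmit().
 */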
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

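/* Legacy ioctl interface: SIOCGETTUNNEL reads parameters,
 * SIOCADDTUNNEL/SIOCCHGTUNNEL create or update a tunnel (requiring
 * CAP_NET_ADMIN), and SIOCDELTUNNEL removes one, except for the
 * per-netns fallback device, which cannot be deleted this way.
 */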
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

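/* Set up the per-netns state: initialize the hash table and, when the
 * protocol provides rtnl_link_ops, create the fallback device that
 * catches otherwise-unmatched packets.
 */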
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* The FB netdevice is special: there is one, and only one, per netns.
	 * Allowing it to move to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");