xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 5ff32883)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59 
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65 
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68 	return hash_32((__force u32)key ^ (__force u32)remote,
69 			 IP_TNL_HASH_BITS);
70 }
71 
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73 				__be16 flags, __be32 key)
74 {
75 	if (p->i_flags & TUNNEL_KEY) {
76 		if (flags & TUNNEL_KEY)
77 			return key == p->i_key;
78 		else
79 			/* key expected, none present */
80 			return false;
81 	} else
82 		return !(flags & TUNNEL_KEY);
83 }
84 
85 /* Fallback tunnel: no source, no destination, no key, no options
86 
87    Tunnel hash table:
88    We require exact key match i.e. if a key is present in packet
89    it will match only tunnel with the same key; if it is not present,
90    it will match only keyless tunnel.
91 
92    All keysless packets, if not matched configured keyless tunnels
93    will match fallback tunnel.
94    Given src, dst and key, find appropriate for input tunnel.
95 */
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97 				   int link, __be16 flags,
98 				   __be32 remote, __be32 local,
99 				   __be32 key)
100 {
101 	unsigned int hash;
102 	struct ip_tunnel *t, *cand = NULL;
103 	struct hlist_head *head;
104 
105 	hash = ip_tunnel_hash(key, remote);
106 	head = &itn->tunnels[hash];
107 
108 	hlist_for_each_entry_rcu(t, head, hash_node) {
109 		if (local != t->parms.iph.saddr ||
110 		    remote != t->parms.iph.daddr ||
111 		    !(t->dev->flags & IFF_UP))
112 			continue;
113 
114 		if (!ip_tunnel_key_match(&t->parms, flags, key))
115 			continue;
116 
117 		if (t->parms.link == link)
118 			return t;
119 		else
120 			cand = t;
121 	}
122 
123 	hlist_for_each_entry_rcu(t, head, hash_node) {
124 		if (remote != t->parms.iph.daddr ||
125 		    t->parms.iph.saddr != 0 ||
126 		    !(t->dev->flags & IFF_UP))
127 			continue;
128 
129 		if (!ip_tunnel_key_match(&t->parms, flags, key))
130 			continue;
131 
132 		if (t->parms.link == link)
133 			return t;
134 		else if (!cand)
135 			cand = t;
136 	}
137 
138 	hash = ip_tunnel_hash(key, 0);
139 	head = &itn->tunnels[hash];
140 
141 	hlist_for_each_entry_rcu(t, head, hash_node) {
142 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144 			continue;
145 
146 		if (!(t->dev->flags & IFF_UP))
147 			continue;
148 
149 		if (!ip_tunnel_key_match(&t->parms, flags, key))
150 			continue;
151 
152 		if (t->parms.link == link)
153 			return t;
154 		else if (!cand)
155 			cand = t;
156 	}
157 
158 	if (flags & TUNNEL_NO_KEY)
159 		goto skip_key_lookup;
160 
161 	hlist_for_each_entry_rcu(t, head, hash_node) {
162 		if (t->parms.i_key != key ||
163 		    t->parms.iph.saddr != 0 ||
164 		    t->parms.iph.daddr != 0 ||
165 		    !(t->dev->flags & IFF_UP))
166 			continue;
167 
168 		if (t->parms.link == link)
169 			return t;
170 		else if (!cand)
171 			cand = t;
172 	}
173 
174 skip_key_lookup:
175 	if (cand)
176 		return cand;
177 
178 	t = rcu_dereference(itn->collect_md_tun);
179 	if (t && t->dev->flags & IFF_UP)
180 		return t;
181 
182 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183 		return netdev_priv(itn->fb_tunnel_dev);
184 
185 	return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188 
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190 				    struct ip_tunnel_parm *parms)
191 {
192 	unsigned int h;
193 	__be32 remote;
194 	__be32 i_key = parms->i_key;
195 
196 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197 		remote = parms->iph.daddr;
198 	else
199 		remote = 0;
200 
201 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202 		i_key = 0;
203 
204 	h = ip_tunnel_hash(i_key, remote);
205 	return &itn->tunnels[h];
206 }
207 
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210 	struct hlist_head *head = ip_bucket(itn, &t->parms);
211 
212 	if (t->collect_md)
213 		rcu_assign_pointer(itn->collect_md_tun, t);
214 	hlist_add_head_rcu(&t->hash_node, head);
215 }
216 
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219 	if (t->collect_md)
220 		rcu_assign_pointer(itn->collect_md_tun, NULL);
221 	hlist_del_init_rcu(&t->hash_node);
222 }
223 
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225 					struct ip_tunnel_parm *parms,
226 					int type)
227 {
228 	__be32 remote = parms->iph.daddr;
229 	__be32 local = parms->iph.saddr;
230 	__be32 key = parms->i_key;
231 	__be16 flags = parms->i_flags;
232 	int link = parms->link;
233 	struct ip_tunnel *t = NULL;
234 	struct hlist_head *head = ip_bucket(itn, parms);
235 
236 	hlist_for_each_entry_rcu(t, head, hash_node) {
237 		if (local == t->parms.iph.saddr &&
238 		    remote == t->parms.iph.daddr &&
239 		    link == t->parms.link &&
240 		    type == t->dev->type &&
241 		    ip_tunnel_key_match(&t->parms, flags, key))
242 			break;
243 	}
244 	return t;
245 }
246 
247 static struct net_device *__ip_tunnel_create(struct net *net,
248 					     const struct rtnl_link_ops *ops,
249 					     struct ip_tunnel_parm *parms)
250 {
251 	int err;
252 	struct ip_tunnel *tunnel;
253 	struct net_device *dev;
254 	char name[IFNAMSIZ];
255 
256 	err = -E2BIG;
257 	if (parms->name[0]) {
258 		if (!dev_valid_name(parms->name))
259 			goto failed;
260 		strlcpy(name, parms->name, IFNAMSIZ);
261 	} else {
262 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
263 			goto failed;
264 		strcpy(name, ops->kind);
265 		strcat(name, "%d");
266 	}
267 
268 	ASSERT_RTNL();
269 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
270 	if (!dev) {
271 		err = -ENOMEM;
272 		goto failed;
273 	}
274 	dev_net_set(dev, net);
275 
276 	dev->rtnl_link_ops = ops;
277 
278 	tunnel = netdev_priv(dev);
279 	tunnel->parms = *parms;
280 	tunnel->net = net;
281 
282 	err = register_netdevice(dev);
283 	if (err)
284 		goto failed_free;
285 
286 	return dev;
287 
288 failed_free:
289 	free_netdev(dev);
290 failed:
291 	return ERR_PTR(err);
292 }
293 
294 static int ip_tunnel_bind_dev(struct net_device *dev)
295 {
296 	struct net_device *tdev = NULL;
297 	struct ip_tunnel *tunnel = netdev_priv(dev);
298 	const struct iphdr *iph;
299 	int hlen = LL_MAX_HEADER;
300 	int mtu = ETH_DATA_LEN;
301 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
302 
303 	iph = &tunnel->parms.iph;
304 
305 	/* Guess output device to choose reasonable mtu and needed_headroom */
306 	if (iph->daddr) {
307 		struct flowi4 fl4;
308 		struct rtable *rt;
309 
310 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
311 				    iph->saddr, tunnel->parms.o_key,
312 				    RT_TOS(iph->tos), tunnel->parms.link,
313 				    tunnel->fwmark);
314 		rt = ip_route_output_key(tunnel->net, &fl4);
315 
316 		if (!IS_ERR(rt)) {
317 			tdev = rt->dst.dev;
318 			ip_rt_put(rt);
319 		}
320 		if (dev->type != ARPHRD_ETHER)
321 			dev->flags |= IFF_POINTOPOINT;
322 
323 		dst_cache_reset(&tunnel->dst_cache);
324 	}
325 
326 	if (!tdev && tunnel->parms.link)
327 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
328 
329 	if (tdev) {
330 		hlen = tdev->hard_header_len + tdev->needed_headroom;
331 		mtu = min(tdev->mtu, IP_MAX_MTU);
332 	}
333 
334 	dev->needed_headroom = t_hlen + hlen;
335 	mtu -= (dev->hard_header_len + t_hlen);
336 
337 	if (mtu < IPV4_MIN_MTU)
338 		mtu = IPV4_MIN_MTU;
339 
340 	return mtu;
341 }
342 
343 static struct ip_tunnel *ip_tunnel_create(struct net *net,
344 					  struct ip_tunnel_net *itn,
345 					  struct ip_tunnel_parm *parms)
346 {
347 	struct ip_tunnel *nt;
348 	struct net_device *dev;
349 	int t_hlen;
350 	int mtu;
351 	int err;
352 
353 	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
354 	if (IS_ERR(dev))
355 		return ERR_CAST(dev);
356 
357 	mtu = ip_tunnel_bind_dev(dev);
358 	err = dev_set_mtu(dev, mtu);
359 	if (err)
360 		goto err_dev_set_mtu;
361 
362 	nt = netdev_priv(dev);
363 	t_hlen = nt->hlen + sizeof(struct iphdr);
364 	dev->min_mtu = ETH_MIN_MTU;
365 	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
366 	ip_tunnel_add(itn, nt);
367 	return nt;
368 
369 err_dev_set_mtu:
370 	unregister_netdevice(dev);
371 	return ERR_PTR(err);
372 }
373 
374 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
375 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
376 		  bool log_ecn_error)
377 {
378 	struct pcpu_sw_netstats *tstats;
379 	const struct iphdr *iph = ip_hdr(skb);
380 	int err;
381 
382 #ifdef CONFIG_NET_IPGRE_BROADCAST
383 	if (ipv4_is_multicast(iph->daddr)) {
384 		tunnel->dev->stats.multicast++;
385 		skb->pkt_type = PACKET_BROADCAST;
386 	}
387 #endif
388 
389 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
390 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
391 		tunnel->dev->stats.rx_crc_errors++;
392 		tunnel->dev->stats.rx_errors++;
393 		goto drop;
394 	}
395 
396 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
397 		if (!(tpi->flags&TUNNEL_SEQ) ||
398 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
399 			tunnel->dev->stats.rx_fifo_errors++;
400 			tunnel->dev->stats.rx_errors++;
401 			goto drop;
402 		}
403 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
404 	}
405 
406 	skb_reset_network_header(skb);
407 
408 	err = IP_ECN_decapsulate(iph, skb);
409 	if (unlikely(err)) {
410 		if (log_ecn_error)
411 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
412 					&iph->saddr, iph->tos);
413 		if (err > 1) {
414 			++tunnel->dev->stats.rx_frame_errors;
415 			++tunnel->dev->stats.rx_errors;
416 			goto drop;
417 		}
418 	}
419 
420 	tstats = this_cpu_ptr(tunnel->dev->tstats);
421 	u64_stats_update_begin(&tstats->syncp);
422 	tstats->rx_packets++;
423 	tstats->rx_bytes += skb->len;
424 	u64_stats_update_end(&tstats->syncp);
425 
426 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
427 
428 	if (tunnel->dev->type == ARPHRD_ETHER) {
429 		skb->protocol = eth_type_trans(skb, tunnel->dev);
430 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
431 	} else {
432 		skb->dev = tunnel->dev;
433 	}
434 
435 	if (tun_dst)
436 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
437 
438 	gro_cells_receive(&tunnel->gro_cells, skb);
439 	return 0;
440 
441 drop:
442 	if (tun_dst)
443 		dst_release((struct dst_entry *)tun_dst);
444 	kfree_skb(skb);
445 	return 0;
446 }
447 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
448 
449 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
450 			    unsigned int num)
451 {
452 	if (num >= MAX_IPTUN_ENCAP_OPS)
453 		return -ERANGE;
454 
455 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
456 			&iptun_encaps[num],
457 			NULL, ops) ? 0 : -1;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
460 
461 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462 			    unsigned int num)
463 {
464 	int ret;
465 
466 	if (num >= MAX_IPTUN_ENCAP_OPS)
467 		return -ERANGE;
468 
469 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
470 		       &iptun_encaps[num],
471 		       ops, NULL) == ops) ? 0 : -1;
472 
473 	synchronize_net();
474 
475 	return ret;
476 }
477 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
478 
479 int ip_tunnel_encap_setup(struct ip_tunnel *t,
480 			  struct ip_tunnel_encap *ipencap)
481 {
482 	int hlen;
483 
484 	memset(&t->encap, 0, sizeof(t->encap));
485 
486 	hlen = ip_encap_hlen(ipencap);
487 	if (hlen < 0)
488 		return hlen;
489 
490 	t->encap.type = ipencap->type;
491 	t->encap.sport = ipencap->sport;
492 	t->encap.dport = ipencap->dport;
493 	t->encap.flags = ipencap->flags;
494 
495 	t->encap_hlen = hlen;
496 	t->hlen = t->encap_hlen + t->tun_hlen;
497 
498 	return 0;
499 }
500 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
501 
502 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
503 			    struct rtable *rt, __be16 df,
504 			    const struct iphdr *inner_iph)
505 {
506 	struct ip_tunnel *tunnel = netdev_priv(dev);
507 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
508 	int mtu;
509 
510 	if (df)
511 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
512 					- sizeof(struct iphdr) - tunnel->hlen;
513 	else
514 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
515 
516 	skb_dst_update_pmtu(skb, mtu);
517 
518 	if (skb->protocol == htons(ETH_P_IP)) {
519 		if (!skb_is_gso(skb) &&
520 		    (inner_iph->frag_off & htons(IP_DF)) &&
521 		    mtu < pkt_size) {
522 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
523 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
524 			return -E2BIG;
525 		}
526 	}
527 #if IS_ENABLED(CONFIG_IPV6)
528 	else if (skb->protocol == htons(ETH_P_IPV6)) {
529 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
530 
531 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
532 			   mtu >= IPV6_MIN_MTU) {
533 			if ((tunnel->parms.iph.daddr &&
534 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
535 			    rt6->rt6i_dst.plen == 128) {
536 				rt6->rt6i_flags |= RTF_MODIFIED;
537 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
538 			}
539 		}
540 
541 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
542 					mtu < pkt_size) {
543 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
544 			return -E2BIG;
545 		}
546 	}
547 #endif
548 	return 0;
549 }
550 
551 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
552 {
553 	struct ip_tunnel *tunnel = netdev_priv(dev);
554 	u32 headroom = sizeof(struct iphdr);
555 	struct ip_tunnel_info *tun_info;
556 	const struct ip_tunnel_key *key;
557 	const struct iphdr *inner_iph;
558 	struct rtable *rt;
559 	struct flowi4 fl4;
560 	__be16 df = 0;
561 	u8 tos, ttl;
562 
563 	tun_info = skb_tunnel_info(skb);
564 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
565 		     ip_tunnel_info_af(tun_info) != AF_INET))
566 		goto tx_error;
567 	key = &tun_info->key;
568 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
569 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
570 	tos = key->tos;
571 	if (tos == 1) {
572 		if (skb->protocol == htons(ETH_P_IP))
573 			tos = inner_iph->tos;
574 		else if (skb->protocol == htons(ETH_P_IPV6))
575 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
576 	}
577 	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
578 			    RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
579 	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
580 		goto tx_error;
581 	rt = ip_route_output_key(tunnel->net, &fl4);
582 	if (IS_ERR(rt)) {
583 		dev->stats.tx_carrier_errors++;
584 		goto tx_error;
585 	}
586 	if (rt->dst.dev == dev) {
587 		ip_rt_put(rt);
588 		dev->stats.collisions++;
589 		goto tx_error;
590 	}
591 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
592 	ttl = key->ttl;
593 	if (ttl == 0) {
594 		if (skb->protocol == htons(ETH_P_IP))
595 			ttl = inner_iph->ttl;
596 		else if (skb->protocol == htons(ETH_P_IPV6))
597 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
598 		else
599 			ttl = ip4_dst_hoplimit(&rt->dst);
600 	}
601 	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
602 		df = htons(IP_DF);
603 	else if (skb->protocol == htons(ETH_P_IP))
604 		df = inner_iph->frag_off & htons(IP_DF);
605 	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
606 	if (headroom > dev->needed_headroom)
607 		dev->needed_headroom = headroom;
608 
609 	if (skb_cow_head(skb, dev->needed_headroom)) {
610 		ip_rt_put(rt);
611 		goto tx_dropped;
612 	}
613 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
614 		      df, !net_eq(tunnel->net, dev_net(dev)));
615 	return;
616 tx_error:
617 	dev->stats.tx_errors++;
618 	goto kfree;
619 tx_dropped:
620 	dev->stats.tx_dropped++;
621 kfree:
622 	kfree_skb(skb);
623 }
624 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
625 
626 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
627 		    const struct iphdr *tnl_params, u8 protocol)
628 {
629 	struct ip_tunnel *tunnel = netdev_priv(dev);
630 	const struct iphdr *inner_iph;
631 	struct flowi4 fl4;
632 	u8     tos, ttl;
633 	__be16 df;
634 	struct rtable *rt;		/* Route to the other host */
635 	unsigned int max_headroom;	/* The extra header space needed */
636 	__be32 dst;
637 	bool connected;
638 
639 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
640 	connected = (tunnel->parms.iph.daddr != 0);
641 
642 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
643 
644 	dst = tnl_params->daddr;
645 	if (dst == 0) {
646 		/* NBMA tunnel */
647 		struct ip_tunnel_info *tun_info;
648 
649 		if (!skb_dst(skb)) {
650 			dev->stats.tx_fifo_errors++;
651 			goto tx_error;
652 		}
653 
654 		tun_info = skb_tunnel_info(skb);
655 		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
656 		    ip_tunnel_info_af(tun_info) == AF_INET &&
657 		    tun_info->key.u.ipv4.dst)
658 			dst = tun_info->key.u.ipv4.dst;
659 		else if (skb->protocol == htons(ETH_P_IP)) {
660 			rt = skb_rtable(skb);
661 			dst = rt_nexthop(rt, inner_iph->daddr);
662 		}
663 #if IS_ENABLED(CONFIG_IPV6)
664 		else if (skb->protocol == htons(ETH_P_IPV6)) {
665 			const struct in6_addr *addr6;
666 			struct neighbour *neigh;
667 			bool do_tx_error_icmp;
668 			int addr_type;
669 
670 			neigh = dst_neigh_lookup(skb_dst(skb),
671 						 &ipv6_hdr(skb)->daddr);
672 			if (!neigh)
673 				goto tx_error;
674 
675 			addr6 = (const struct in6_addr *)&neigh->primary_key;
676 			addr_type = ipv6_addr_type(addr6);
677 
678 			if (addr_type == IPV6_ADDR_ANY) {
679 				addr6 = &ipv6_hdr(skb)->daddr;
680 				addr_type = ipv6_addr_type(addr6);
681 			}
682 
683 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
684 				do_tx_error_icmp = true;
685 			else {
686 				do_tx_error_icmp = false;
687 				dst = addr6->s6_addr32[3];
688 			}
689 			neigh_release(neigh);
690 			if (do_tx_error_icmp)
691 				goto tx_error_icmp;
692 		}
693 #endif
694 		else
695 			goto tx_error;
696 
697 		connected = false;
698 	}
699 
700 	tos = tnl_params->tos;
701 	if (tos & 0x1) {
702 		tos &= ~0x1;
703 		if (skb->protocol == htons(ETH_P_IP)) {
704 			tos = inner_iph->tos;
705 			connected = false;
706 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
707 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
708 			connected = false;
709 		}
710 	}
711 
712 	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
713 			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
714 			    tunnel->fwmark);
715 
716 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
717 		goto tx_error;
718 
719 	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
720 			 NULL;
721 
722 	if (!rt) {
723 		rt = ip_route_output_key(tunnel->net, &fl4);
724 
725 		if (IS_ERR(rt)) {
726 			dev->stats.tx_carrier_errors++;
727 			goto tx_error;
728 		}
729 		if (connected)
730 			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
731 					  fl4.saddr);
732 	}
733 
734 	if (rt->dst.dev == dev) {
735 		ip_rt_put(rt);
736 		dev->stats.collisions++;
737 		goto tx_error;
738 	}
739 
740 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
741 		ip_rt_put(rt);
742 		goto tx_error;
743 	}
744 
745 	if (tunnel->err_count > 0) {
746 		if (time_before(jiffies,
747 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
748 			tunnel->err_count--;
749 
750 			dst_link_failure(skb);
751 		} else
752 			tunnel->err_count = 0;
753 	}
754 
755 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
756 	ttl = tnl_params->ttl;
757 	if (ttl == 0) {
758 		if (skb->protocol == htons(ETH_P_IP))
759 			ttl = inner_iph->ttl;
760 #if IS_ENABLED(CONFIG_IPV6)
761 		else if (skb->protocol == htons(ETH_P_IPV6))
762 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
763 #endif
764 		else
765 			ttl = ip4_dst_hoplimit(&rt->dst);
766 	}
767 
768 	df = tnl_params->frag_off;
769 	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
770 		df |= (inner_iph->frag_off&htons(IP_DF));
771 
772 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
773 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
774 	if (max_headroom > dev->needed_headroom)
775 		dev->needed_headroom = max_headroom;
776 
777 	if (skb_cow_head(skb, dev->needed_headroom)) {
778 		ip_rt_put(rt);
779 		dev->stats.tx_dropped++;
780 		kfree_skb(skb);
781 		return;
782 	}
783 
784 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
785 		      df, !net_eq(tunnel->net, dev_net(dev)));
786 	return;
787 
788 #if IS_ENABLED(CONFIG_IPV6)
789 tx_error_icmp:
790 	dst_link_failure(skb);
791 #endif
792 tx_error:
793 	dev->stats.tx_errors++;
794 	kfree_skb(skb);
795 }
796 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
797 
798 static void ip_tunnel_update(struct ip_tunnel_net *itn,
799 			     struct ip_tunnel *t,
800 			     struct net_device *dev,
801 			     struct ip_tunnel_parm *p,
802 			     bool set_mtu,
803 			     __u32 fwmark)
804 {
805 	ip_tunnel_del(itn, t);
806 	t->parms.iph.saddr = p->iph.saddr;
807 	t->parms.iph.daddr = p->iph.daddr;
808 	t->parms.i_key = p->i_key;
809 	t->parms.o_key = p->o_key;
810 	if (dev->type != ARPHRD_ETHER) {
811 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
812 		memcpy(dev->broadcast, &p->iph.daddr, 4);
813 	}
814 	ip_tunnel_add(itn, t);
815 
816 	t->parms.iph.ttl = p->iph.ttl;
817 	t->parms.iph.tos = p->iph.tos;
818 	t->parms.iph.frag_off = p->iph.frag_off;
819 
820 	if (t->parms.link != p->link || t->fwmark != fwmark) {
821 		int mtu;
822 
823 		t->parms.link = p->link;
824 		t->fwmark = fwmark;
825 		mtu = ip_tunnel_bind_dev(dev);
826 		if (set_mtu)
827 			dev->mtu = mtu;
828 	}
829 	dst_cache_reset(&t->dst_cache);
830 	netdev_state_change(dev);
831 }
832 
833 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
834 {
835 	int err = 0;
836 	struct ip_tunnel *t = netdev_priv(dev);
837 	struct net *net = t->net;
838 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
839 
840 	switch (cmd) {
841 	case SIOCGETTUNNEL:
842 		if (dev == itn->fb_tunnel_dev) {
843 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
844 			if (!t)
845 				t = netdev_priv(dev);
846 		}
847 		memcpy(p, &t->parms, sizeof(*p));
848 		break;
849 
850 	case SIOCADDTUNNEL:
851 	case SIOCCHGTUNNEL:
852 		err = -EPERM;
853 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
854 			goto done;
855 		if (p->iph.ttl)
856 			p->iph.frag_off |= htons(IP_DF);
857 		if (!(p->i_flags & VTI_ISVTI)) {
858 			if (!(p->i_flags & TUNNEL_KEY))
859 				p->i_key = 0;
860 			if (!(p->o_flags & TUNNEL_KEY))
861 				p->o_key = 0;
862 		}
863 
864 		t = ip_tunnel_find(itn, p, itn->type);
865 
866 		if (cmd == SIOCADDTUNNEL) {
867 			if (!t) {
868 				t = ip_tunnel_create(net, itn, p);
869 				err = PTR_ERR_OR_ZERO(t);
870 				break;
871 			}
872 
873 			err = -EEXIST;
874 			break;
875 		}
876 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
877 			if (t) {
878 				if (t->dev != dev) {
879 					err = -EEXIST;
880 					break;
881 				}
882 			} else {
883 				unsigned int nflags = 0;
884 
885 				if (ipv4_is_multicast(p->iph.daddr))
886 					nflags = IFF_BROADCAST;
887 				else if (p->iph.daddr)
888 					nflags = IFF_POINTOPOINT;
889 
890 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
891 					err = -EINVAL;
892 					break;
893 				}
894 
895 				t = netdev_priv(dev);
896 			}
897 		}
898 
899 		if (t) {
900 			err = 0;
901 			ip_tunnel_update(itn, t, dev, p, true, 0);
902 		} else {
903 			err = -ENOENT;
904 		}
905 		break;
906 
907 	case SIOCDELTUNNEL:
908 		err = -EPERM;
909 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
910 			goto done;
911 
912 		if (dev == itn->fb_tunnel_dev) {
913 			err = -ENOENT;
914 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
915 			if (!t)
916 				goto done;
917 			err = -EPERM;
918 			if (t == netdev_priv(itn->fb_tunnel_dev))
919 				goto done;
920 			dev = t->dev;
921 		}
922 		unregister_netdevice(dev);
923 		err = 0;
924 		break;
925 
926 	default:
927 		err = -EINVAL;
928 	}
929 
930 done:
931 	return err;
932 }
933 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
934 
935 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
936 {
937 	struct ip_tunnel *tunnel = netdev_priv(dev);
938 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
939 	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
940 
941 	if (new_mtu < ETH_MIN_MTU)
942 		return -EINVAL;
943 
944 	if (new_mtu > max_mtu) {
945 		if (strict)
946 			return -EINVAL;
947 
948 		new_mtu = max_mtu;
949 	}
950 
951 	dev->mtu = new_mtu;
952 	return 0;
953 }
954 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
955 
956 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
957 {
958 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
959 }
960 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
961 
962 static void ip_tunnel_dev_free(struct net_device *dev)
963 {
964 	struct ip_tunnel *tunnel = netdev_priv(dev);
965 
966 	gro_cells_destroy(&tunnel->gro_cells);
967 	dst_cache_destroy(&tunnel->dst_cache);
968 	free_percpu(dev->tstats);
969 }
970 
971 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
972 {
973 	struct ip_tunnel *tunnel = netdev_priv(dev);
974 	struct ip_tunnel_net *itn;
975 
976 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
977 
978 	if (itn->fb_tunnel_dev != dev) {
979 		ip_tunnel_del(itn, netdev_priv(dev));
980 		unregister_netdevice_queue(dev, head);
981 	}
982 }
983 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
984 
985 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
986 {
987 	struct ip_tunnel *tunnel = netdev_priv(dev);
988 
989 	return tunnel->net;
990 }
991 EXPORT_SYMBOL(ip_tunnel_get_link_net);
992 
993 int ip_tunnel_get_iflink(const struct net_device *dev)
994 {
995 	struct ip_tunnel *tunnel = netdev_priv(dev);
996 
997 	return tunnel->parms.link;
998 }
999 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1000 
1001 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1002 				  struct rtnl_link_ops *ops, char *devname)
1003 {
1004 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1005 	struct ip_tunnel_parm parms;
1006 	unsigned int i;
1007 
1008 	itn->rtnl_link_ops = ops;
1009 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1010 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1011 
1012 	if (!ops || !net_has_fallback_tunnels(net)) {
1013 		struct ip_tunnel_net *it_init_net;
1014 
1015 		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1016 		itn->type = it_init_net->type;
1017 		itn->fb_tunnel_dev = NULL;
1018 		return 0;
1019 	}
1020 
1021 	memset(&parms, 0, sizeof(parms));
1022 	if (devname)
1023 		strlcpy(parms.name, devname, IFNAMSIZ);
1024 
1025 	rtnl_lock();
1026 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1027 	/* FB netdevice is special: we have one, and only one per netns.
1028 	 * Allowing to move it to another netns is clearly unsafe.
1029 	 */
1030 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1031 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1032 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1033 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1034 		itn->type = itn->fb_tunnel_dev->type;
1035 	}
1036 	rtnl_unlock();
1037 
1038 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1039 }
1040 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1041 
1042 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1043 			      struct list_head *head,
1044 			      struct rtnl_link_ops *ops)
1045 {
1046 	struct net_device *dev, *aux;
1047 	int h;
1048 
1049 	for_each_netdev_safe(net, dev, aux)
1050 		if (dev->rtnl_link_ops == ops)
1051 			unregister_netdevice_queue(dev, head);
1052 
1053 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1054 		struct ip_tunnel *t;
1055 		struct hlist_node *n;
1056 		struct hlist_head *thead = &itn->tunnels[h];
1057 
1058 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1059 			/* If dev is in the same netns, it has already
1060 			 * been added to the list by the previous loop.
1061 			 */
1062 			if (!net_eq(dev_net(t->dev), net))
1063 				unregister_netdevice_queue(t->dev, head);
1064 	}
1065 }
1066 
1067 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1068 			   struct rtnl_link_ops *ops)
1069 {
1070 	struct ip_tunnel_net *itn;
1071 	struct net *net;
1072 	LIST_HEAD(list);
1073 
1074 	rtnl_lock();
1075 	list_for_each_entry(net, net_list, exit_list) {
1076 		itn = net_generic(net, id);
1077 		ip_tunnel_destroy(net, itn, &list, ops);
1078 	}
1079 	unregister_netdevice_many(&list);
1080 	rtnl_unlock();
1081 }
1082 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1083 
1084 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1085 		      struct ip_tunnel_parm *p, __u32 fwmark)
1086 {
1087 	struct ip_tunnel *nt;
1088 	struct net *net = dev_net(dev);
1089 	struct ip_tunnel_net *itn;
1090 	int mtu;
1091 	int err;
1092 
1093 	nt = netdev_priv(dev);
1094 	itn = net_generic(net, nt->ip_tnl_net_id);
1095 
1096 	if (nt->collect_md) {
1097 		if (rtnl_dereference(itn->collect_md_tun))
1098 			return -EEXIST;
1099 	} else {
1100 		if (ip_tunnel_find(itn, p, dev->type))
1101 			return -EEXIST;
1102 	}
1103 
1104 	nt->net = net;
1105 	nt->parms = *p;
1106 	nt->fwmark = fwmark;
1107 	err = register_netdevice(dev);
1108 	if (err)
1109 		goto err_register_netdevice;
1110 
1111 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1112 		eth_hw_addr_random(dev);
1113 
1114 	mtu = ip_tunnel_bind_dev(dev);
1115 	if (tb[IFLA_MTU]) {
1116 		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1117 
1118 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1119 			    (unsigned int)(max - sizeof(struct iphdr)));
1120 	}
1121 
1122 	err = dev_set_mtu(dev, mtu);
1123 	if (err)
1124 		goto err_dev_set_mtu;
1125 
1126 	ip_tunnel_add(itn, nt);
1127 	return 0;
1128 
1129 err_dev_set_mtu:
1130 	unregister_netdevice(dev);
1131 err_register_netdevice:
1132 	return err;
1133 }
1134 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1135 
1136 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1137 			 struct ip_tunnel_parm *p, __u32 fwmark)
1138 {
1139 	struct ip_tunnel *t;
1140 	struct ip_tunnel *tunnel = netdev_priv(dev);
1141 	struct net *net = tunnel->net;
1142 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1143 
1144 	if (dev == itn->fb_tunnel_dev)
1145 		return -EINVAL;
1146 
1147 	t = ip_tunnel_find(itn, p, dev->type);
1148 
1149 	if (t) {
1150 		if (t->dev != dev)
1151 			return -EEXIST;
1152 	} else {
1153 		t = tunnel;
1154 
1155 		if (dev->type != ARPHRD_ETHER) {
1156 			unsigned int nflags = 0;
1157 
1158 			if (ipv4_is_multicast(p->iph.daddr))
1159 				nflags = IFF_BROADCAST;
1160 			else if (p->iph.daddr)
1161 				nflags = IFF_POINTOPOINT;
1162 
1163 			if ((dev->flags ^ nflags) &
1164 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1165 				return -EINVAL;
1166 		}
1167 	}
1168 
1169 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1170 	return 0;
1171 }
1172 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1173 
1174 int ip_tunnel_init(struct net_device *dev)
1175 {
1176 	struct ip_tunnel *tunnel = netdev_priv(dev);
1177 	struct iphdr *iph = &tunnel->parms.iph;
1178 	int err;
1179 
1180 	dev->needs_free_netdev = true;
1181 	dev->priv_destructor = ip_tunnel_dev_free;
1182 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1183 	if (!dev->tstats)
1184 		return -ENOMEM;
1185 
1186 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1187 	if (err) {
1188 		free_percpu(dev->tstats);
1189 		return err;
1190 	}
1191 
1192 	err = gro_cells_init(&tunnel->gro_cells, dev);
1193 	if (err) {
1194 		dst_cache_destroy(&tunnel->dst_cache);
1195 		free_percpu(dev->tstats);
1196 		return err;
1197 	}
1198 
1199 	tunnel->dev = dev;
1200 	tunnel->net = dev_net(dev);
1201 	strcpy(tunnel->parms.name, dev->name);
1202 	iph->version		= 4;
1203 	iph->ihl		= 5;
1204 
1205 	if (tunnel->collect_md) {
1206 		dev->features |= NETIF_F_NETNS_LOCAL;
1207 		netif_keep_dst(dev);
1208 	}
1209 	return 0;
1210 }
1211 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1212 
1213 void ip_tunnel_uninit(struct net_device *dev)
1214 {
1215 	struct ip_tunnel *tunnel = netdev_priv(dev);
1216 	struct net *net = tunnel->net;
1217 	struct ip_tunnel_net *itn;
1218 
1219 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1220 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1221 	if (itn->fb_tunnel_dev != dev)
1222 		ip_tunnel_del(itn, netdev_priv(dev));
1223 
1224 	dst_cache_reset(&tunnel->dst_cache);
1225 }
1226 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1227 
1228 /* Do least required initialization, rest of init is done in tunnel_init call */
1229 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1230 {
1231 	struct ip_tunnel *tunnel = netdev_priv(dev);
1232 	tunnel->ip_tnl_net_id = net_id;
1233 }
1234 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1235 
1236 MODULE_LICENSE("GPL");
1237