xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision f220d3eb)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59 
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65 
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68 	return hash_32((__force u32)key ^ (__force u32)remote,
69 			 IP_TNL_HASH_BITS);
70 }
71 
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73 				__be16 flags, __be32 key)
74 {
75 	if (p->i_flags & TUNNEL_KEY) {
76 		if (flags & TUNNEL_KEY)
77 			return key == p->i_key;
78 		else
79 			/* key expected, none present */
80 			return false;
81 	} else
82 		return !(flags & TUNNEL_KEY);
83 }
84 
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97 				   int link, __be16 flags,
98 				   __be32 remote, __be32 local,
99 				   __be32 key)
100 {
101 	unsigned int hash;
102 	struct ip_tunnel *t, *cand = NULL;
103 	struct hlist_head *head;
104 
105 	hash = ip_tunnel_hash(key, remote);
106 	head = &itn->tunnels[hash];
107 
108 	hlist_for_each_entry_rcu(t, head, hash_node) {
109 		if (local != t->parms.iph.saddr ||
110 		    remote != t->parms.iph.daddr ||
111 		    !(t->dev->flags & IFF_UP))
112 			continue;
113 
114 		if (!ip_tunnel_key_match(&t->parms, flags, key))
115 			continue;
116 
117 		if (t->parms.link == link)
118 			return t;
119 		else
120 			cand = t;
121 	}
122 
123 	hlist_for_each_entry_rcu(t, head, hash_node) {
124 		if (remote != t->parms.iph.daddr ||
125 		    t->parms.iph.saddr != 0 ||
126 		    !(t->dev->flags & IFF_UP))
127 			continue;
128 
129 		if (!ip_tunnel_key_match(&t->parms, flags, key))
130 			continue;
131 
132 		if (t->parms.link == link)
133 			return t;
134 		else if (!cand)
135 			cand = t;
136 	}
137 
138 	hash = ip_tunnel_hash(key, 0);
139 	head = &itn->tunnels[hash];
140 
141 	hlist_for_each_entry_rcu(t, head, hash_node) {
142 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144 			continue;
145 
146 		if (!(t->dev->flags & IFF_UP))
147 			continue;
148 
149 		if (!ip_tunnel_key_match(&t->parms, flags, key))
150 			continue;
151 
152 		if (t->parms.link == link)
153 			return t;
154 		else if (!cand)
155 			cand = t;
156 	}
157 
158 	if (flags & TUNNEL_NO_KEY)
159 		goto skip_key_lookup;
160 
161 	hlist_for_each_entry_rcu(t, head, hash_node) {
162 		if (t->parms.i_key != key ||
163 		    t->parms.iph.saddr != 0 ||
164 		    t->parms.iph.daddr != 0 ||
165 		    !(t->dev->flags & IFF_UP))
166 			continue;
167 
168 		if (t->parms.link == link)
169 			return t;
170 		else if (!cand)
171 			cand = t;
172 	}
173 
174 skip_key_lookup:
175 	if (cand)
176 		return cand;
177 
178 	t = rcu_dereference(itn->collect_md_tun);
179 	if (t && t->dev->flags & IFF_UP)
180 		return t;
181 
182 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183 		return netdev_priv(itn->fb_tunnel_dev);
184 
185 	return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188 
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190 				    struct ip_tunnel_parm *parms)
191 {
192 	unsigned int h;
193 	__be32 remote;
194 	__be32 i_key = parms->i_key;
195 
196 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197 		remote = parms->iph.daddr;
198 	else
199 		remote = 0;
200 
201 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202 		i_key = 0;
203 
204 	h = ip_tunnel_hash(i_key, remote);
205 	return &itn->tunnels[h];
206 }
207 
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210 	struct hlist_head *head = ip_bucket(itn, &t->parms);
211 
212 	if (t->collect_md)
213 		rcu_assign_pointer(itn->collect_md_tun, t);
214 	hlist_add_head_rcu(&t->hash_node, head);
215 }
216 
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219 	if (t->collect_md)
220 		rcu_assign_pointer(itn->collect_md_tun, NULL);
221 	hlist_del_init_rcu(&t->hash_node);
222 }
223 
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225 					struct ip_tunnel_parm *parms,
226 					int type)
227 {
228 	__be32 remote = parms->iph.daddr;
229 	__be32 local = parms->iph.saddr;
230 	__be32 key = parms->i_key;
231 	__be16 flags = parms->i_flags;
232 	int link = parms->link;
233 	struct ip_tunnel *t = NULL;
234 	struct hlist_head *head = ip_bucket(itn, parms);
235 
236 	hlist_for_each_entry_rcu(t, head, hash_node) {
237 		if (local == t->parms.iph.saddr &&
238 		    remote == t->parms.iph.daddr &&
239 		    link == t->parms.link &&
240 		    type == t->dev->type &&
241 		    ip_tunnel_key_match(&t->parms, flags, key))
242 			break;
243 	}
244 	return t;
245 }
246 
247 static struct net_device *__ip_tunnel_create(struct net *net,
248 					     const struct rtnl_link_ops *ops,
249 					     struct ip_tunnel_parm *parms)
250 {
251 	int err;
252 	struct ip_tunnel *tunnel;
253 	struct net_device *dev;
254 	char name[IFNAMSIZ];
255 
256 	err = -E2BIG;
257 	if (parms->name[0]) {
258 		if (!dev_valid_name(parms->name))
259 			goto failed;
260 		strlcpy(name, parms->name, IFNAMSIZ);
261 	} else {
262 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
263 			goto failed;
264 		strcpy(name, ops->kind);
265 		strcat(name, "%d");
266 	}
267 
268 	ASSERT_RTNL();
269 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
270 	if (!dev) {
271 		err = -ENOMEM;
272 		goto failed;
273 	}
274 	dev_net_set(dev, net);
275 
276 	dev->rtnl_link_ops = ops;
277 
278 	tunnel = netdev_priv(dev);
279 	tunnel->parms = *parms;
280 	tunnel->net = net;
281 
282 	err = register_netdevice(dev);
283 	if (err)
284 		goto failed_free;
285 
286 	return dev;
287 
288 failed_free:
289 	free_netdev(dev);
290 failed:
291 	return ERR_PTR(err);
292 }
293 
294 static int ip_tunnel_bind_dev(struct net_device *dev)
295 {
296 	struct net_device *tdev = NULL;
297 	struct ip_tunnel *tunnel = netdev_priv(dev);
298 	const struct iphdr *iph;
299 	int hlen = LL_MAX_HEADER;
300 	int mtu = ETH_DATA_LEN;
301 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
302 
303 	iph = &tunnel->parms.iph;
304 
305 	/* Guess output device to choose reasonable mtu and needed_headroom */
306 	if (iph->daddr) {
307 		struct flowi4 fl4;
308 		struct rtable *rt;
309 
310 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
311 				    iph->saddr, tunnel->parms.o_key,
312 				    RT_TOS(iph->tos), tunnel->parms.link,
313 				    tunnel->fwmark);
314 		rt = ip_route_output_key(tunnel->net, &fl4);
315 
316 		if (!IS_ERR(rt)) {
317 			tdev = rt->dst.dev;
318 			ip_rt_put(rt);
319 		}
320 		if (dev->type != ARPHRD_ETHER)
321 			dev->flags |= IFF_POINTOPOINT;
322 
323 		dst_cache_reset(&tunnel->dst_cache);
324 	}
325 
326 	if (!tdev && tunnel->parms.link)
327 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
328 
329 	if (tdev) {
330 		hlen = tdev->hard_header_len + tdev->needed_headroom;
331 		mtu = min(tdev->mtu, IP_MAX_MTU);
332 	}
333 
334 	dev->needed_headroom = t_hlen + hlen;
335 	mtu -= (dev->hard_header_len + t_hlen);
336 
337 	if (mtu < IPV4_MIN_MTU)
338 		mtu = IPV4_MIN_MTU;
339 
340 	return mtu;
341 }
342 
343 static struct ip_tunnel *ip_tunnel_create(struct net *net,
344 					  struct ip_tunnel_net *itn,
345 					  struct ip_tunnel_parm *parms)
346 {
347 	struct ip_tunnel *nt;
348 	struct net_device *dev;
349 	int t_hlen;
350 	int mtu;
351 	int err;
352 
353 	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
354 	if (IS_ERR(dev))
355 		return ERR_CAST(dev);
356 
357 	mtu = ip_tunnel_bind_dev(dev);
358 	err = dev_set_mtu(dev, mtu);
359 	if (err)
360 		goto err_dev_set_mtu;
361 
362 	nt = netdev_priv(dev);
363 	t_hlen = nt->hlen + sizeof(struct iphdr);
364 	dev->min_mtu = ETH_MIN_MTU;
365 	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
366 	ip_tunnel_add(itn, nt);
367 	return nt;
368 
369 err_dev_set_mtu:
370 	unregister_netdevice(dev);
371 	return ERR_PTR(err);
372 }
373 
374 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
375 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
376 		  bool log_ecn_error)
377 {
378 	struct pcpu_sw_netstats *tstats;
379 	const struct iphdr *iph = ip_hdr(skb);
380 	int err;
381 
382 #ifdef CONFIG_NET_IPGRE_BROADCAST
383 	if (ipv4_is_multicast(iph->daddr)) {
384 		tunnel->dev->stats.multicast++;
385 		skb->pkt_type = PACKET_BROADCAST;
386 	}
387 #endif
388 
389 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
390 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
391 		tunnel->dev->stats.rx_crc_errors++;
392 		tunnel->dev->stats.rx_errors++;
393 		goto drop;
394 	}
395 
396 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
397 		if (!(tpi->flags&TUNNEL_SEQ) ||
398 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
399 			tunnel->dev->stats.rx_fifo_errors++;
400 			tunnel->dev->stats.rx_errors++;
401 			goto drop;
402 		}
403 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
404 	}
405 
406 	skb_reset_network_header(skb);
407 
408 	err = IP_ECN_decapsulate(iph, skb);
409 	if (unlikely(err)) {
410 		if (log_ecn_error)
411 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
412 					&iph->saddr, iph->tos);
413 		if (err > 1) {
414 			++tunnel->dev->stats.rx_frame_errors;
415 			++tunnel->dev->stats.rx_errors;
416 			goto drop;
417 		}
418 	}
419 
420 	tstats = this_cpu_ptr(tunnel->dev->tstats);
421 	u64_stats_update_begin(&tstats->syncp);
422 	tstats->rx_packets++;
423 	tstats->rx_bytes += skb->len;
424 	u64_stats_update_end(&tstats->syncp);
425 
426 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
427 
428 	if (tunnel->dev->type == ARPHRD_ETHER) {
429 		skb->protocol = eth_type_trans(skb, tunnel->dev);
430 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
431 	} else {
432 		skb->dev = tunnel->dev;
433 	}
434 
435 	if (tun_dst)
436 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
437 
438 	gro_cells_receive(&tunnel->gro_cells, skb);
439 	return 0;
440 
441 drop:
442 	if (tun_dst)
443 		dst_release((struct dst_entry *)tun_dst);
444 	kfree_skb(skb);
445 	return 0;
446 }
447 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
448 
449 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
450 			    unsigned int num)
451 {
452 	if (num >= MAX_IPTUN_ENCAP_OPS)
453 		return -ERANGE;
454 
455 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
456 			&iptun_encaps[num],
457 			NULL, ops) ? 0 : -1;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
460 
461 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462 			    unsigned int num)
463 {
464 	int ret;
465 
466 	if (num >= MAX_IPTUN_ENCAP_OPS)
467 		return -ERANGE;
468 
469 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
470 		       &iptun_encaps[num],
471 		       ops, NULL) == ops) ? 0 : -1;
472 
473 	synchronize_net();
474 
475 	return ret;
476 }
477 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
478 
479 int ip_tunnel_encap_setup(struct ip_tunnel *t,
480 			  struct ip_tunnel_encap *ipencap)
481 {
482 	int hlen;
483 
484 	memset(&t->encap, 0, sizeof(t->encap));
485 
486 	hlen = ip_encap_hlen(ipencap);
487 	if (hlen < 0)
488 		return hlen;
489 
490 	t->encap.type = ipencap->type;
491 	t->encap.sport = ipencap->sport;
492 	t->encap.dport = ipencap->dport;
493 	t->encap.flags = ipencap->flags;
494 
495 	t->encap_hlen = hlen;
496 	t->hlen = t->encap_hlen + t->tun_hlen;
497 
498 	return 0;
499 }
500 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
501 
502 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
503 			    struct rtable *rt, __be16 df,
504 			    const struct iphdr *inner_iph)
505 {
506 	struct ip_tunnel *tunnel = netdev_priv(dev);
507 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
508 	int mtu;
509 
510 	if (df)
511 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
512 					- sizeof(struct iphdr) - tunnel->hlen;
513 	else
514 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
515 
516 	skb_dst_update_pmtu(skb, mtu);
517 
518 	if (skb->protocol == htons(ETH_P_IP)) {
519 		if (!skb_is_gso(skb) &&
520 		    (inner_iph->frag_off & htons(IP_DF)) &&
521 		    mtu < pkt_size) {
522 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
523 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
524 			return -E2BIG;
525 		}
526 	}
527 #if IS_ENABLED(CONFIG_IPV6)
528 	else if (skb->protocol == htons(ETH_P_IPV6)) {
529 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
530 
531 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
532 			   mtu >= IPV6_MIN_MTU) {
533 			if ((tunnel->parms.iph.daddr &&
534 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
535 			    rt6->rt6i_dst.plen == 128) {
536 				rt6->rt6i_flags |= RTF_MODIFIED;
537 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
538 			}
539 		}
540 
541 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
542 					mtu < pkt_size) {
543 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
544 			return -E2BIG;
545 		}
546 	}
547 #endif
548 	return 0;
549 }
550 
551 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
552 {
553 	struct ip_tunnel *tunnel = netdev_priv(dev);
554 	u32 headroom = sizeof(struct iphdr);
555 	struct ip_tunnel_info *tun_info;
556 	const struct ip_tunnel_key *key;
557 	const struct iphdr *inner_iph;
558 	struct rtable *rt;
559 	struct flowi4 fl4;
560 	__be16 df = 0;
561 	u8 tos, ttl;
562 
563 	tun_info = skb_tunnel_info(skb);
564 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
565 		     ip_tunnel_info_af(tun_info) != AF_INET))
566 		goto tx_error;
567 	key = &tun_info->key;
568 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
569 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
570 	tos = key->tos;
571 	if (tos == 1) {
572 		if (skb->protocol == htons(ETH_P_IP))
573 			tos = inner_iph->tos;
574 		else if (skb->protocol == htons(ETH_P_IPV6))
575 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
576 	}
577 	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
578 			    RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
579 	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
580 		goto tx_error;
581 	rt = ip_route_output_key(tunnel->net, &fl4);
582 	if (IS_ERR(rt)) {
583 		dev->stats.tx_carrier_errors++;
584 		goto tx_error;
585 	}
586 	if (rt->dst.dev == dev) {
587 		ip_rt_put(rt);
588 		dev->stats.collisions++;
589 		goto tx_error;
590 	}
591 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
592 	ttl = key->ttl;
593 	if (ttl == 0) {
594 		if (skb->protocol == htons(ETH_P_IP))
595 			ttl = inner_iph->ttl;
596 		else if (skb->protocol == htons(ETH_P_IPV6))
597 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
598 		else
599 			ttl = ip4_dst_hoplimit(&rt->dst);
600 	}
601 	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
602 		df = htons(IP_DF);
603 	else if (skb->protocol == htons(ETH_P_IP))
604 		df = inner_iph->frag_off & htons(IP_DF);
605 	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
606 	if (headroom > dev->needed_headroom)
607 		dev->needed_headroom = headroom;
608 
609 	if (skb_cow_head(skb, dev->needed_headroom)) {
610 		ip_rt_put(rt);
611 		goto tx_dropped;
612 	}
613 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
614 		      df, !net_eq(tunnel->net, dev_net(dev)));
615 	return;
616 tx_error:
617 	dev->stats.tx_errors++;
618 	goto kfree;
619 tx_dropped:
620 	dev->stats.tx_dropped++;
621 kfree:
622 	kfree_skb(skb);
623 }
624 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
625 
626 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
627 		    const struct iphdr *tnl_params, u8 protocol)
628 {
629 	struct ip_tunnel *tunnel = netdev_priv(dev);
630 	const struct iphdr *inner_iph;
631 	struct flowi4 fl4;
632 	u8     tos, ttl;
633 	__be16 df;
634 	struct rtable *rt;		/* Route to the other host */
635 	unsigned int max_headroom;	/* The extra header space needed */
636 	__be32 dst;
637 	bool connected;
638 
639 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
640 	connected = (tunnel->parms.iph.daddr != 0);
641 
642 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
643 
644 	dst = tnl_params->daddr;
645 	if (dst == 0) {
646 		/* NBMA tunnel */
647 
648 		if (!skb_dst(skb)) {
649 			dev->stats.tx_fifo_errors++;
650 			goto tx_error;
651 		}
652 
653 		if (skb->protocol == htons(ETH_P_IP)) {
654 			rt = skb_rtable(skb);
655 			dst = rt_nexthop(rt, inner_iph->daddr);
656 		}
657 #if IS_ENABLED(CONFIG_IPV6)
658 		else if (skb->protocol == htons(ETH_P_IPV6)) {
659 			const struct in6_addr *addr6;
660 			struct neighbour *neigh;
661 			bool do_tx_error_icmp;
662 			int addr_type;
663 
664 			neigh = dst_neigh_lookup(skb_dst(skb),
665 						 &ipv6_hdr(skb)->daddr);
666 			if (!neigh)
667 				goto tx_error;
668 
669 			addr6 = (const struct in6_addr *)&neigh->primary_key;
670 			addr_type = ipv6_addr_type(addr6);
671 
672 			if (addr_type == IPV6_ADDR_ANY) {
673 				addr6 = &ipv6_hdr(skb)->daddr;
674 				addr_type = ipv6_addr_type(addr6);
675 			}
676 
677 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
678 				do_tx_error_icmp = true;
679 			else {
680 				do_tx_error_icmp = false;
681 				dst = addr6->s6_addr32[3];
682 			}
683 			neigh_release(neigh);
684 			if (do_tx_error_icmp)
685 				goto tx_error_icmp;
686 		}
687 #endif
688 		else
689 			goto tx_error;
690 
691 		connected = false;
692 	}
693 
694 	tos = tnl_params->tos;
695 	if (tos & 0x1) {
696 		tos &= ~0x1;
697 		if (skb->protocol == htons(ETH_P_IP)) {
698 			tos = inner_iph->tos;
699 			connected = false;
700 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
701 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
702 			connected = false;
703 		}
704 	}
705 
706 	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
707 			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
708 			    tunnel->fwmark);
709 
710 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
711 		goto tx_error;
712 
713 	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
714 			 NULL;
715 
716 	if (!rt) {
717 		rt = ip_route_output_key(tunnel->net, &fl4);
718 
719 		if (IS_ERR(rt)) {
720 			dev->stats.tx_carrier_errors++;
721 			goto tx_error;
722 		}
723 		if (connected)
724 			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
725 					  fl4.saddr);
726 	}
727 
728 	if (rt->dst.dev == dev) {
729 		ip_rt_put(rt);
730 		dev->stats.collisions++;
731 		goto tx_error;
732 	}
733 
734 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
735 		ip_rt_put(rt);
736 		goto tx_error;
737 	}
738 
739 	if (tunnel->err_count > 0) {
740 		if (time_before(jiffies,
741 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
742 			tunnel->err_count--;
743 
744 			dst_link_failure(skb);
745 		} else
746 			tunnel->err_count = 0;
747 	}
748 
749 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
750 	ttl = tnl_params->ttl;
751 	if (ttl == 0) {
752 		if (skb->protocol == htons(ETH_P_IP))
753 			ttl = inner_iph->ttl;
754 #if IS_ENABLED(CONFIG_IPV6)
755 		else if (skb->protocol == htons(ETH_P_IPV6))
756 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
757 #endif
758 		else
759 			ttl = ip4_dst_hoplimit(&rt->dst);
760 	}
761 
762 	df = tnl_params->frag_off;
763 	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
764 		df |= (inner_iph->frag_off&htons(IP_DF));
765 
766 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
767 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
768 	if (max_headroom > dev->needed_headroom)
769 		dev->needed_headroom = max_headroom;
770 
771 	if (skb_cow_head(skb, dev->needed_headroom)) {
772 		ip_rt_put(rt);
773 		dev->stats.tx_dropped++;
774 		kfree_skb(skb);
775 		return;
776 	}
777 
778 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
779 		      df, !net_eq(tunnel->net, dev_net(dev)));
780 	return;
781 
782 #if IS_ENABLED(CONFIG_IPV6)
783 tx_error_icmp:
784 	dst_link_failure(skb);
785 #endif
786 tx_error:
787 	dev->stats.tx_errors++;
788 	kfree_skb(skb);
789 }
790 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
791 
792 static void ip_tunnel_update(struct ip_tunnel_net *itn,
793 			     struct ip_tunnel *t,
794 			     struct net_device *dev,
795 			     struct ip_tunnel_parm *p,
796 			     bool set_mtu,
797 			     __u32 fwmark)
798 {
799 	ip_tunnel_del(itn, t);
800 	t->parms.iph.saddr = p->iph.saddr;
801 	t->parms.iph.daddr = p->iph.daddr;
802 	t->parms.i_key = p->i_key;
803 	t->parms.o_key = p->o_key;
804 	if (dev->type != ARPHRD_ETHER) {
805 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
806 		memcpy(dev->broadcast, &p->iph.daddr, 4);
807 	}
808 	ip_tunnel_add(itn, t);
809 
810 	t->parms.iph.ttl = p->iph.ttl;
811 	t->parms.iph.tos = p->iph.tos;
812 	t->parms.iph.frag_off = p->iph.frag_off;
813 
814 	if (t->parms.link != p->link || t->fwmark != fwmark) {
815 		int mtu;
816 
817 		t->parms.link = p->link;
818 		t->fwmark = fwmark;
819 		mtu = ip_tunnel_bind_dev(dev);
820 		if (set_mtu)
821 			dev->mtu = mtu;
822 	}
823 	dst_cache_reset(&t->dst_cache);
824 	netdev_state_change(dev);
825 }
826 
827 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
828 {
829 	int err = 0;
830 	struct ip_tunnel *t = netdev_priv(dev);
831 	struct net *net = t->net;
832 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
833 
834 	switch (cmd) {
835 	case SIOCGETTUNNEL:
836 		if (dev == itn->fb_tunnel_dev) {
837 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
838 			if (!t)
839 				t = netdev_priv(dev);
840 		}
841 		memcpy(p, &t->parms, sizeof(*p));
842 		break;
843 
844 	case SIOCADDTUNNEL:
845 	case SIOCCHGTUNNEL:
846 		err = -EPERM;
847 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
848 			goto done;
849 		if (p->iph.ttl)
850 			p->iph.frag_off |= htons(IP_DF);
851 		if (!(p->i_flags & VTI_ISVTI)) {
852 			if (!(p->i_flags & TUNNEL_KEY))
853 				p->i_key = 0;
854 			if (!(p->o_flags & TUNNEL_KEY))
855 				p->o_key = 0;
856 		}
857 
858 		t = ip_tunnel_find(itn, p, itn->type);
859 
860 		if (cmd == SIOCADDTUNNEL) {
861 			if (!t) {
862 				t = ip_tunnel_create(net, itn, p);
863 				err = PTR_ERR_OR_ZERO(t);
864 				break;
865 			}
866 
867 			err = -EEXIST;
868 			break;
869 		}
870 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
871 			if (t) {
872 				if (t->dev != dev) {
873 					err = -EEXIST;
874 					break;
875 				}
876 			} else {
877 				unsigned int nflags = 0;
878 
879 				if (ipv4_is_multicast(p->iph.daddr))
880 					nflags = IFF_BROADCAST;
881 				else if (p->iph.daddr)
882 					nflags = IFF_POINTOPOINT;
883 
884 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
885 					err = -EINVAL;
886 					break;
887 				}
888 
889 				t = netdev_priv(dev);
890 			}
891 		}
892 
893 		if (t) {
894 			err = 0;
895 			ip_tunnel_update(itn, t, dev, p, true, 0);
896 		} else {
897 			err = -ENOENT;
898 		}
899 		break;
900 
901 	case SIOCDELTUNNEL:
902 		err = -EPERM;
903 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
904 			goto done;
905 
906 		if (dev == itn->fb_tunnel_dev) {
907 			err = -ENOENT;
908 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
909 			if (!t)
910 				goto done;
911 			err = -EPERM;
912 			if (t == netdev_priv(itn->fb_tunnel_dev))
913 				goto done;
914 			dev = t->dev;
915 		}
916 		unregister_netdevice(dev);
917 		err = 0;
918 		break;
919 
920 	default:
921 		err = -EINVAL;
922 	}
923 
924 done:
925 	return err;
926 }
927 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
928 
929 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
930 {
931 	struct ip_tunnel *tunnel = netdev_priv(dev);
932 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
933 	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
934 
935 	if (new_mtu < ETH_MIN_MTU)
936 		return -EINVAL;
937 
938 	if (new_mtu > max_mtu) {
939 		if (strict)
940 			return -EINVAL;
941 
942 		new_mtu = max_mtu;
943 	}
944 
945 	dev->mtu = new_mtu;
946 	return 0;
947 }
948 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
949 
950 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
951 {
952 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
953 }
954 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
955 
956 static void ip_tunnel_dev_free(struct net_device *dev)
957 {
958 	struct ip_tunnel *tunnel = netdev_priv(dev);
959 
960 	gro_cells_destroy(&tunnel->gro_cells);
961 	dst_cache_destroy(&tunnel->dst_cache);
962 	free_percpu(dev->tstats);
963 }
964 
965 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
966 {
967 	struct ip_tunnel *tunnel = netdev_priv(dev);
968 	struct ip_tunnel_net *itn;
969 
970 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
971 
972 	if (itn->fb_tunnel_dev != dev) {
973 		ip_tunnel_del(itn, netdev_priv(dev));
974 		unregister_netdevice_queue(dev, head);
975 	}
976 }
977 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
978 
979 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
980 {
981 	struct ip_tunnel *tunnel = netdev_priv(dev);
982 
983 	return tunnel->net;
984 }
985 EXPORT_SYMBOL(ip_tunnel_get_link_net);
986 
987 int ip_tunnel_get_iflink(const struct net_device *dev)
988 {
989 	struct ip_tunnel *tunnel = netdev_priv(dev);
990 
991 	return tunnel->parms.link;
992 }
993 EXPORT_SYMBOL(ip_tunnel_get_iflink);
994 
995 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
996 				  struct rtnl_link_ops *ops, char *devname)
997 {
998 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
999 	struct ip_tunnel_parm parms;
1000 	unsigned int i;
1001 
1002 	itn->rtnl_link_ops = ops;
1003 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1004 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1005 
1006 	if (!ops || !net_has_fallback_tunnels(net)) {
1007 		struct ip_tunnel_net *it_init_net;
1008 
1009 		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1010 		itn->type = it_init_net->type;
1011 		itn->fb_tunnel_dev = NULL;
1012 		return 0;
1013 	}
1014 
1015 	memset(&parms, 0, sizeof(parms));
1016 	if (devname)
1017 		strlcpy(parms.name, devname, IFNAMSIZ);
1018 
1019 	rtnl_lock();
1020 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1021 	/* FB netdevice is special: we have one, and only one per netns.
1022 	 * Allowing to move it to another netns is clearly unsafe.
1023 	 */
1024 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1025 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1026 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1027 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1028 		itn->type = itn->fb_tunnel_dev->type;
1029 	}
1030 	rtnl_unlock();
1031 
1032 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1033 }
1034 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1035 
1036 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1037 			      struct list_head *head,
1038 			      struct rtnl_link_ops *ops)
1039 {
1040 	struct net_device *dev, *aux;
1041 	int h;
1042 
1043 	for_each_netdev_safe(net, dev, aux)
1044 		if (dev->rtnl_link_ops == ops)
1045 			unregister_netdevice_queue(dev, head);
1046 
1047 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1048 		struct ip_tunnel *t;
1049 		struct hlist_node *n;
1050 		struct hlist_head *thead = &itn->tunnels[h];
1051 
1052 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1053 			/* If dev is in the same netns, it has already
1054 			 * been added to the list by the previous loop.
1055 			 */
1056 			if (!net_eq(dev_net(t->dev), net))
1057 				unregister_netdevice_queue(t->dev, head);
1058 	}
1059 }
1060 
1061 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1062 			   struct rtnl_link_ops *ops)
1063 {
1064 	struct ip_tunnel_net *itn;
1065 	struct net *net;
1066 	LIST_HEAD(list);
1067 
1068 	rtnl_lock();
1069 	list_for_each_entry(net, net_list, exit_list) {
1070 		itn = net_generic(net, id);
1071 		ip_tunnel_destroy(net, itn, &list, ops);
1072 	}
1073 	unregister_netdevice_many(&list);
1074 	rtnl_unlock();
1075 }
1076 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1077 
1078 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1079 		      struct ip_tunnel_parm *p, __u32 fwmark)
1080 {
1081 	struct ip_tunnel *nt;
1082 	struct net *net = dev_net(dev);
1083 	struct ip_tunnel_net *itn;
1084 	int mtu;
1085 	int err;
1086 
1087 	nt = netdev_priv(dev);
1088 	itn = net_generic(net, nt->ip_tnl_net_id);
1089 
1090 	if (nt->collect_md) {
1091 		if (rtnl_dereference(itn->collect_md_tun))
1092 			return -EEXIST;
1093 	} else {
1094 		if (ip_tunnel_find(itn, p, dev->type))
1095 			return -EEXIST;
1096 	}
1097 
1098 	nt->net = net;
1099 	nt->parms = *p;
1100 	nt->fwmark = fwmark;
1101 	err = register_netdevice(dev);
1102 	if (err)
1103 		goto err_register_netdevice;
1104 
1105 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1106 		eth_hw_addr_random(dev);
1107 
1108 	mtu = ip_tunnel_bind_dev(dev);
1109 	if (tb[IFLA_MTU]) {
1110 		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1111 
1112 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1113 			    (unsigned int)(max - sizeof(struct iphdr)));
1114 	}
1115 
1116 	err = dev_set_mtu(dev, mtu);
1117 	if (err)
1118 		goto err_dev_set_mtu;
1119 
1120 	ip_tunnel_add(itn, nt);
1121 	return 0;
1122 
1123 err_dev_set_mtu:
1124 	unregister_netdevice(dev);
1125 err_register_netdevice:
1126 	return err;
1127 }
1128 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1129 
1130 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1131 			 struct ip_tunnel_parm *p, __u32 fwmark)
1132 {
1133 	struct ip_tunnel *t;
1134 	struct ip_tunnel *tunnel = netdev_priv(dev);
1135 	struct net *net = tunnel->net;
1136 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1137 
1138 	if (dev == itn->fb_tunnel_dev)
1139 		return -EINVAL;
1140 
1141 	t = ip_tunnel_find(itn, p, dev->type);
1142 
1143 	if (t) {
1144 		if (t->dev != dev)
1145 			return -EEXIST;
1146 	} else {
1147 		t = tunnel;
1148 
1149 		if (dev->type != ARPHRD_ETHER) {
1150 			unsigned int nflags = 0;
1151 
1152 			if (ipv4_is_multicast(p->iph.daddr))
1153 				nflags = IFF_BROADCAST;
1154 			else if (p->iph.daddr)
1155 				nflags = IFF_POINTOPOINT;
1156 
1157 			if ((dev->flags ^ nflags) &
1158 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1159 				return -EINVAL;
1160 		}
1161 	}
1162 
1163 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1164 	return 0;
1165 }
1166 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1167 
1168 int ip_tunnel_init(struct net_device *dev)
1169 {
1170 	struct ip_tunnel *tunnel = netdev_priv(dev);
1171 	struct iphdr *iph = &tunnel->parms.iph;
1172 	int err;
1173 
1174 	dev->needs_free_netdev = true;
1175 	dev->priv_destructor = ip_tunnel_dev_free;
1176 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1177 	if (!dev->tstats)
1178 		return -ENOMEM;
1179 
1180 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1181 	if (err) {
1182 		free_percpu(dev->tstats);
1183 		return err;
1184 	}
1185 
1186 	err = gro_cells_init(&tunnel->gro_cells, dev);
1187 	if (err) {
1188 		dst_cache_destroy(&tunnel->dst_cache);
1189 		free_percpu(dev->tstats);
1190 		return err;
1191 	}
1192 
1193 	tunnel->dev = dev;
1194 	tunnel->net = dev_net(dev);
1195 	strcpy(tunnel->parms.name, dev->name);
1196 	iph->version		= 4;
1197 	iph->ihl		= 5;
1198 
1199 	if (tunnel->collect_md) {
1200 		dev->features |= NETIF_F_NETNS_LOCAL;
1201 		netif_keep_dst(dev);
1202 	}
1203 	return 0;
1204 }
1205 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1206 
1207 void ip_tunnel_uninit(struct net_device *dev)
1208 {
1209 	struct ip_tunnel *tunnel = netdev_priv(dev);
1210 	struct net *net = tunnel->net;
1211 	struct ip_tunnel_net *itn;
1212 
1213 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1214 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1215 	if (itn->fb_tunnel_dev != dev)
1216 		ip_tunnel_del(itn, netdev_priv(dev));
1217 
1218 	dst_cache_reset(&tunnel->dst_cache);
1219 }
1220 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1221 
1222 /* Do least required initialization, rest of init is done in tunnel_init call */
1223 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1224 {
1225 	struct ip_tunnel *tunnel = netdev_priv(dev);
1226 	tunnel->ip_tnl_net_id = net_id;
1227 }
1228 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1229 
1230 MODULE_LICENSE("GPL");
1231