xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 584eab291c67894cb17cc87544b9d086228ea70f)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59 
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65 
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68 	return hash_32((__force u32)key ^ (__force u32)remote,
69 			 IP_TNL_HASH_BITS);
70 }
71 
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73 				__be16 flags, __be32 key)
74 {
75 	if (p->i_flags & TUNNEL_KEY) {
76 		if (flags & TUNNEL_KEY)
77 			return key == p->i_key;
78 		else
79 			/* key expected, none present */
80 			return false;
81 	} else
82 		return !(flags & TUNNEL_KEY);
83 }
84 
85 /* Fallback tunnel: no source, no destination, no key, no options
86 
87    Tunnel hash table:
88    We require exact key match i.e. if a key is present in packet
89    it will match only tunnel with the same key; if it is not present,
90    it will match only keyless tunnel.
91 
92    All keysless packets, if not matched configured keyless tunnels
93    will match fallback tunnel.
94    Given src, dst and key, find appropriate for input tunnel.
95 */
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97 				   int link, __be16 flags,
98 				   __be32 remote, __be32 local,
99 				   __be32 key)
100 {
101 	unsigned int hash;
102 	struct ip_tunnel *t, *cand = NULL;
103 	struct hlist_head *head;
104 
105 	hash = ip_tunnel_hash(key, remote);
106 	head = &itn->tunnels[hash];
107 
108 	hlist_for_each_entry_rcu(t, head, hash_node) {
109 		if (local != t->parms.iph.saddr ||
110 		    remote != t->parms.iph.daddr ||
111 		    !(t->dev->flags & IFF_UP))
112 			continue;
113 
114 		if (!ip_tunnel_key_match(&t->parms, flags, key))
115 			continue;
116 
117 		if (t->parms.link == link)
118 			return t;
119 		else
120 			cand = t;
121 	}
122 
123 	hlist_for_each_entry_rcu(t, head, hash_node) {
124 		if (remote != t->parms.iph.daddr ||
125 		    t->parms.iph.saddr != 0 ||
126 		    !(t->dev->flags & IFF_UP))
127 			continue;
128 
129 		if (!ip_tunnel_key_match(&t->parms, flags, key))
130 			continue;
131 
132 		if (t->parms.link == link)
133 			return t;
134 		else if (!cand)
135 			cand = t;
136 	}
137 
138 	hash = ip_tunnel_hash(key, 0);
139 	head = &itn->tunnels[hash];
140 
141 	hlist_for_each_entry_rcu(t, head, hash_node) {
142 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144 			continue;
145 
146 		if (!(t->dev->flags & IFF_UP))
147 			continue;
148 
149 		if (!ip_tunnel_key_match(&t->parms, flags, key))
150 			continue;
151 
152 		if (t->parms.link == link)
153 			return t;
154 		else if (!cand)
155 			cand = t;
156 	}
157 
158 	if (flags & TUNNEL_NO_KEY)
159 		goto skip_key_lookup;
160 
161 	hlist_for_each_entry_rcu(t, head, hash_node) {
162 		if (t->parms.i_key != key ||
163 		    t->parms.iph.saddr != 0 ||
164 		    t->parms.iph.daddr != 0 ||
165 		    !(t->dev->flags & IFF_UP))
166 			continue;
167 
168 		if (t->parms.link == link)
169 			return t;
170 		else if (!cand)
171 			cand = t;
172 	}
173 
174 skip_key_lookup:
175 	if (cand)
176 		return cand;
177 
178 	t = rcu_dereference(itn->collect_md_tun);
179 	if (t && t->dev->flags & IFF_UP)
180 		return t;
181 
182 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183 		return netdev_priv(itn->fb_tunnel_dev);
184 
185 	return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188 
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190 				    struct ip_tunnel_parm *parms)
191 {
192 	unsigned int h;
193 	__be32 remote;
194 	__be32 i_key = parms->i_key;
195 
196 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197 		remote = parms->iph.daddr;
198 	else
199 		remote = 0;
200 
201 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202 		i_key = 0;
203 
204 	h = ip_tunnel_hash(i_key, remote);
205 	return &itn->tunnels[h];
206 }
207 
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210 	struct hlist_head *head = ip_bucket(itn, &t->parms);
211 
212 	if (t->collect_md)
213 		rcu_assign_pointer(itn->collect_md_tun, t);
214 	hlist_add_head_rcu(&t->hash_node, head);
215 }
216 
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219 	if (t->collect_md)
220 		rcu_assign_pointer(itn->collect_md_tun, NULL);
221 	hlist_del_init_rcu(&t->hash_node);
222 }
223 
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225 					struct ip_tunnel_parm *parms,
226 					int type)
227 {
228 	__be32 remote = parms->iph.daddr;
229 	__be32 local = parms->iph.saddr;
230 	__be32 key = parms->i_key;
231 	__be16 flags = parms->i_flags;
232 	int link = parms->link;
233 	struct ip_tunnel *t = NULL;
234 	struct hlist_head *head = ip_bucket(itn, parms);
235 
236 	hlist_for_each_entry_rcu(t, head, hash_node) {
237 		if (local == t->parms.iph.saddr &&
238 		    remote == t->parms.iph.daddr &&
239 		    link == t->parms.link &&
240 		    type == t->dev->type &&
241 		    ip_tunnel_key_match(&t->parms, flags, key))
242 			break;
243 	}
244 	return t;
245 }
246 
247 static struct net_device *__ip_tunnel_create(struct net *net,
248 					     const struct rtnl_link_ops *ops,
249 					     struct ip_tunnel_parm *parms)
250 {
251 	int err;
252 	struct ip_tunnel *tunnel;
253 	struct net_device *dev;
254 	char name[IFNAMSIZ];
255 
256 	err = -E2BIG;
257 	if (parms->name[0]) {
258 		if (!dev_valid_name(parms->name))
259 			goto failed;
260 		strlcpy(name, parms->name, IFNAMSIZ);
261 	} else {
262 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
263 			goto failed;
264 		strcpy(name, ops->kind);
265 		strcat(name, "%d");
266 	}
267 
268 	ASSERT_RTNL();
269 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
270 	if (!dev) {
271 		err = -ENOMEM;
272 		goto failed;
273 	}
274 	dev_net_set(dev, net);
275 
276 	dev->rtnl_link_ops = ops;
277 
278 	tunnel = netdev_priv(dev);
279 	tunnel->parms = *parms;
280 	tunnel->net = net;
281 
282 	err = register_netdevice(dev);
283 	if (err)
284 		goto failed_free;
285 
286 	return dev;
287 
288 failed_free:
289 	free_netdev(dev);
290 failed:
291 	return ERR_PTR(err);
292 }
293 
294 static int ip_tunnel_bind_dev(struct net_device *dev)
295 {
296 	struct net_device *tdev = NULL;
297 	struct ip_tunnel *tunnel = netdev_priv(dev);
298 	const struct iphdr *iph;
299 	int hlen = LL_MAX_HEADER;
300 	int mtu = ETH_DATA_LEN;
301 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
302 
303 	iph = &tunnel->parms.iph;
304 
305 	/* Guess output device to choose reasonable mtu and needed_headroom */
306 	if (iph->daddr) {
307 		struct flowi4 fl4;
308 		struct rtable *rt;
309 
310 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
311 				    iph->saddr, tunnel->parms.o_key,
312 				    RT_TOS(iph->tos), tunnel->parms.link,
313 				    tunnel->fwmark);
314 		rt = ip_route_output_key(tunnel->net, &fl4);
315 
316 		if (!IS_ERR(rt)) {
317 			tdev = rt->dst.dev;
318 			ip_rt_put(rt);
319 		}
320 		if (dev->type != ARPHRD_ETHER)
321 			dev->flags |= IFF_POINTOPOINT;
322 
323 		dst_cache_reset(&tunnel->dst_cache);
324 	}
325 
326 	if (!tdev && tunnel->parms.link)
327 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
328 
329 	if (tdev) {
330 		hlen = tdev->hard_header_len + tdev->needed_headroom;
331 		mtu = min(tdev->mtu, IP_MAX_MTU);
332 	}
333 
334 	dev->needed_headroom = t_hlen + hlen;
335 	mtu -= (dev->hard_header_len + t_hlen);
336 
337 	if (mtu < IPV4_MIN_MTU)
338 		mtu = IPV4_MIN_MTU;
339 
340 	return mtu;
341 }
342 
343 static struct ip_tunnel *ip_tunnel_create(struct net *net,
344 					  struct ip_tunnel_net *itn,
345 					  struct ip_tunnel_parm *parms)
346 {
347 	struct ip_tunnel *nt;
348 	struct net_device *dev;
349 	int t_hlen;
350 	int mtu;
351 	int err;
352 
353 	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
354 	if (IS_ERR(dev))
355 		return ERR_CAST(dev);
356 
357 	mtu = ip_tunnel_bind_dev(dev);
358 	err = dev_set_mtu(dev, mtu);
359 	if (err)
360 		goto err_dev_set_mtu;
361 
362 	nt = netdev_priv(dev);
363 	t_hlen = nt->hlen + sizeof(struct iphdr);
364 	dev->min_mtu = ETH_MIN_MTU;
365 	dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
366 	ip_tunnel_add(itn, nt);
367 	return nt;
368 
369 err_dev_set_mtu:
370 	unregister_netdevice(dev);
371 	return ERR_PTR(err);
372 }
373 
374 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
375 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
376 		  bool log_ecn_error)
377 {
378 	struct pcpu_sw_netstats *tstats;
379 	const struct iphdr *iph = ip_hdr(skb);
380 	int err;
381 
382 #ifdef CONFIG_NET_IPGRE_BROADCAST
383 	if (ipv4_is_multicast(iph->daddr)) {
384 		tunnel->dev->stats.multicast++;
385 		skb->pkt_type = PACKET_BROADCAST;
386 	}
387 #endif
388 
389 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
390 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
391 		tunnel->dev->stats.rx_crc_errors++;
392 		tunnel->dev->stats.rx_errors++;
393 		goto drop;
394 	}
395 
396 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
397 		if (!(tpi->flags&TUNNEL_SEQ) ||
398 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
399 			tunnel->dev->stats.rx_fifo_errors++;
400 			tunnel->dev->stats.rx_errors++;
401 			goto drop;
402 		}
403 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
404 	}
405 
406 	skb_reset_network_header(skb);
407 
408 	err = IP_ECN_decapsulate(iph, skb);
409 	if (unlikely(err)) {
410 		if (log_ecn_error)
411 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
412 					&iph->saddr, iph->tos);
413 		if (err > 1) {
414 			++tunnel->dev->stats.rx_frame_errors;
415 			++tunnel->dev->stats.rx_errors;
416 			goto drop;
417 		}
418 	}
419 
420 	tstats = this_cpu_ptr(tunnel->dev->tstats);
421 	u64_stats_update_begin(&tstats->syncp);
422 	tstats->rx_packets++;
423 	tstats->rx_bytes += skb->len;
424 	u64_stats_update_end(&tstats->syncp);
425 
426 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
427 
428 	if (tunnel->dev->type == ARPHRD_ETHER) {
429 		skb->protocol = eth_type_trans(skb, tunnel->dev);
430 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
431 	} else {
432 		skb->dev = tunnel->dev;
433 	}
434 
435 	if (tun_dst)
436 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
437 
438 	gro_cells_receive(&tunnel->gro_cells, skb);
439 	return 0;
440 
441 drop:
442 	if (tun_dst)
443 		dst_release((struct dst_entry *)tun_dst);
444 	kfree_skb(skb);
445 	return 0;
446 }
447 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
448 
449 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
450 			    unsigned int num)
451 {
452 	if (num >= MAX_IPTUN_ENCAP_OPS)
453 		return -ERANGE;
454 
455 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
456 			&iptun_encaps[num],
457 			NULL, ops) ? 0 : -1;
458 }
459 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
460 
461 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
462 			    unsigned int num)
463 {
464 	int ret;
465 
466 	if (num >= MAX_IPTUN_ENCAP_OPS)
467 		return -ERANGE;
468 
469 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
470 		       &iptun_encaps[num],
471 		       ops, NULL) == ops) ? 0 : -1;
472 
473 	synchronize_net();
474 
475 	return ret;
476 }
477 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
478 
479 int ip_tunnel_encap_setup(struct ip_tunnel *t,
480 			  struct ip_tunnel_encap *ipencap)
481 {
482 	int hlen;
483 
484 	memset(&t->encap, 0, sizeof(t->encap));
485 
486 	hlen = ip_encap_hlen(ipencap);
487 	if (hlen < 0)
488 		return hlen;
489 
490 	t->encap.type = ipencap->type;
491 	t->encap.sport = ipencap->sport;
492 	t->encap.dport = ipencap->dport;
493 	t->encap.flags = ipencap->flags;
494 
495 	t->encap_hlen = hlen;
496 	t->hlen = t->encap_hlen + t->tun_hlen;
497 
498 	return 0;
499 }
500 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
501 
/* Propagate the tunnel path MTU to the inner flow and reject packets that
 * do not fit.
 *
 * @dev:       tunnel device
 * @skb:       packet about to be encapsulated
 * @rt:        route the encapsulated packet will take
 * @df:        DF setting for the outer header (non-zero selects the
 *             route-derived MTU)
 * @inner_iph: inner network header, interpreted as IPv4 for the DF test
 *
 * Returns 0 if transmission may continue, or -E2BIG after an ICMP
 * "fragmentation needed" / ICMPv6 "packet too big" error has been sent
 * for @skb.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	/* size compared against the MTU: skb length without the tunnel
	 * header length and the device's hard header
	 */
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		/* room left on the route after all headers we will add */
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* non-GSO IPv4 packet with DF set that does not fit */
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			/* clear the IP control block before icmp_send() */
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Lower the MTU metric on the inner IPv6 route, but only for
		 * tunnels with a unicast destination or for host (/128)
		 * routes.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		/* non-GSO IPv6 packet that does not fit */
		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
550 
/* Transmit @skb through @dev using the per-packet tunnel metadata attached
 * to the skb (skb_tunnel_info()) instead of the device's own parameters.
 *
 * @proto is the IP protocol number used for the route lookup and the outer
 * header.  The skb is always consumed: it is either handed to
 * iptunnel_xmit() or freed, with tx_errors / tx_dropped (and, where
 * applicable, tx_carrier_errors / collisions) accounted on @dev.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	/* require IPv4 metadata that is marked for transmission */
	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	/* a key TOS of 1 means: take the TOS from the inner header */
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			    RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	/* tunnels with an encapsulation type configured are rejected here */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	/* the route points back at this tunnel device: refuse to loop */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	/* a key TTL of 0 means: inherit from the inner header, or use the
	 * route's hop limit for non-IP payloads
	 */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	/* DF: forced by the metadata flags, else copied from an inner IPv4
	 * header
	 */
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	/* grow the device's advertised headroom if this route needs more */
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
625 
/* Transmit @skb through tunnel device @dev.
 *
 * @tnl_params: outer IPv4 header template (daddr, saddr, tos, ttl, frag_off)
 * @protocol:   IP protocol number for the outer header; may be rewritten by
 *              ip_tunnel_encap()
 *
 * The skb is always consumed: it is either handed to iptunnel_xmit() or
 * freed with the matching error counter on @dev incremented.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	unsigned int inner_nhdr_len = 0;
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	/* ensure we can access the inner net header, for several users below */
	if (skb->protocol == htons(ETH_P_IP))
		inner_nhdr_len = sizeof(struct iphdr);
	else if (skb->protocol == htons(ETH_P_IPV6))
		inner_nhdr_len = sizeof(struct ipv6hdr);
	if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
		goto tx_error;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" (fixed destination) tunnels may use the cached route;
	 * the flag is cleared below whenever the flow depends on the packet
	 */
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			/* outer destination = next hop of the inner route */
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			/* unspecified neighbour key: use the packet's own
			 * destination address instead
			 */
			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* only an IPv4-compatible address yields an outer
			 * IPv4 destination (its last 32 bits)
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			/* drop the neighbour reference before any error exit */
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	/* low bit of the configured TOS set: inherit TOS from the inner
	 * header
	 */
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* the route points back at this tunnel device: refuse to loop */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* While recent errors are pending (err_count > 0 and still within
	 * IPTUNNEL_ERR_TIMEO of err_time), report a link failure for the
	 * packet; once the window has passed, reset the counter.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	/* configured TTL of 0: inherit from the inner header, or use the
	 * route's hop limit for non-IP payloads
	 */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* outer DF: configured value, plus the inner IPv4 DF bit unless the
	 * tunnel is set to ignore it
	 */
	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	/* grow the device's advertised headroom if this route needs more */
	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
800 
801 static void ip_tunnel_update(struct ip_tunnel_net *itn,
802 			     struct ip_tunnel *t,
803 			     struct net_device *dev,
804 			     struct ip_tunnel_parm *p,
805 			     bool set_mtu,
806 			     __u32 fwmark)
807 {
808 	ip_tunnel_del(itn, t);
809 	t->parms.iph.saddr = p->iph.saddr;
810 	t->parms.iph.daddr = p->iph.daddr;
811 	t->parms.i_key = p->i_key;
812 	t->parms.o_key = p->o_key;
813 	if (dev->type != ARPHRD_ETHER) {
814 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
815 		memcpy(dev->broadcast, &p->iph.daddr, 4);
816 	}
817 	ip_tunnel_add(itn, t);
818 
819 	t->parms.iph.ttl = p->iph.ttl;
820 	t->parms.iph.tos = p->iph.tos;
821 	t->parms.iph.frag_off = p->iph.frag_off;
822 
823 	if (t->parms.link != p->link || t->fwmark != fwmark) {
824 		int mtu;
825 
826 		t->parms.link = p->link;
827 		t->fwmark = fwmark;
828 		mtu = ip_tunnel_bind_dev(dev);
829 		if (set_mtu)
830 			dev->mtu = mtu;
831 	}
832 	dst_cache_reset(&t->dst_cache);
833 	netdev_state_change(dev);
834 }
835 
/* Handle the tunnel ioctls for @dev.
 *
 * @dev: device the ioctl was issued on
 * @p:   tunnel parameters; input for all commands, overwritten with the
 *       tunnel's parameters for SIOCGETTUNNEL (and normalised in place for
 *       SIOCADDTUNNEL / SIOCCHGTUNNEL)
 * @cmd: SIOCGETTUNNEL, SIOCADDTUNNEL, SIOCCHGTUNNEL or SIOCDELTUNNEL
 *
 * Add, change and delete require CAP_NET_ADMIN in the user namespace of
 * the tunnel's netns.
 *
 * Returns 0 on success or a negative errno: -EPERM (missing capability, or
 * an attempt to delete the fallback tunnel itself), -EEXIST (parameters
 * already used by another tunnel), -ENOENT (no such tunnel), -EINVAL
 * (unknown command, or a change that would alter the device's
 * point-to-point/broadcast nature), or the error from tunnel creation.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device @p selects which tunnel to report;
		 * if none matches, the fallback tunnel itself is reported.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* a fixed TTL implies DF on the outer header */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* for non-VTI tunnels, ignore key values whose TUNNEL_KEY
		 * flag is not set
		 */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* the new parameters belong to another device */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* the change must not flip the device between
				 * point-to-point and broadcast
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			/* the fwmark argument is always 0 on this path */
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		/* On the fallback device @p names the tunnel to delete; the
		 * fallback tunnel itself cannot be deleted.
		 */
		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
937 
938 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
939 {
940 	struct ip_tunnel *tunnel = netdev_priv(dev);
941 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
942 	int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
943 
944 	if (new_mtu < ETH_MIN_MTU)
945 		return -EINVAL;
946 
947 	if (new_mtu > max_mtu) {
948 		if (strict)
949 			return -EINVAL;
950 
951 		new_mtu = max_mtu;
952 	}
953 
954 	dev->mtu = new_mtu;
955 	return 0;
956 }
957 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
958 
959 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
960 {
961 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
962 }
963 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
964 
965 static void ip_tunnel_dev_free(struct net_device *dev)
966 {
967 	struct ip_tunnel *tunnel = netdev_priv(dev);
968 
969 	gro_cells_destroy(&tunnel->gro_cells);
970 	dst_cache_destroy(&tunnel->dst_cache);
971 	free_percpu(dev->tstats);
972 }
973 
974 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
975 {
976 	struct ip_tunnel *tunnel = netdev_priv(dev);
977 	struct ip_tunnel_net *itn;
978 
979 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
980 
981 	if (itn->fb_tunnel_dev != dev) {
982 		ip_tunnel_del(itn, netdev_priv(dev));
983 		unregister_netdevice_queue(dev, head);
984 	}
985 }
986 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
987 
988 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
989 {
990 	struct ip_tunnel *tunnel = netdev_priv(dev);
991 
992 	return tunnel->net;
993 }
994 EXPORT_SYMBOL(ip_tunnel_get_link_net);
995 
996 int ip_tunnel_get_iflink(const struct net_device *dev)
997 {
998 	struct ip_tunnel *tunnel = netdev_priv(dev);
999 
1000 	return tunnel->parms.link;
1001 }
1002 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1003 
1004 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1005 				  struct rtnl_link_ops *ops, char *devname)
1006 {
1007 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1008 	struct ip_tunnel_parm parms;
1009 	unsigned int i;
1010 
1011 	itn->rtnl_link_ops = ops;
1012 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1013 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1014 
1015 	if (!ops || !net_has_fallback_tunnels(net)) {
1016 		struct ip_tunnel_net *it_init_net;
1017 
1018 		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1019 		itn->type = it_init_net->type;
1020 		itn->fb_tunnel_dev = NULL;
1021 		return 0;
1022 	}
1023 
1024 	memset(&parms, 0, sizeof(parms));
1025 	if (devname)
1026 		strlcpy(parms.name, devname, IFNAMSIZ);
1027 
1028 	rtnl_lock();
1029 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1030 	/* FB netdevice is special: we have one, and only one per netns.
1031 	 * Allowing to move it to another netns is clearly unsafe.
1032 	 */
1033 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1034 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1035 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1036 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1037 		itn->type = itn->fb_tunnel_dev->type;
1038 	}
1039 	rtnl_unlock();
1040 
1041 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1042 }
1043 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1044 
1045 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1046 			      struct list_head *head,
1047 			      struct rtnl_link_ops *ops)
1048 {
1049 	struct net_device *dev, *aux;
1050 	int h;
1051 
1052 	for_each_netdev_safe(net, dev, aux)
1053 		if (dev->rtnl_link_ops == ops)
1054 			unregister_netdevice_queue(dev, head);
1055 
1056 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1057 		struct ip_tunnel *t;
1058 		struct hlist_node *n;
1059 		struct hlist_head *thead = &itn->tunnels[h];
1060 
1061 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1062 			/* If dev is in the same netns, it has already
1063 			 * been added to the list by the previous loop.
1064 			 */
1065 			if (!net_eq(dev_net(t->dev), net))
1066 				unregister_netdevice_queue(t->dev, head);
1067 	}
1068 }
1069 
1070 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1071 			   struct rtnl_link_ops *ops)
1072 {
1073 	struct ip_tunnel_net *itn;
1074 	struct net *net;
1075 	LIST_HEAD(list);
1076 
1077 	rtnl_lock();
1078 	list_for_each_entry(net, net_list, exit_list) {
1079 		itn = net_generic(net, id);
1080 		ip_tunnel_destroy(net, itn, &list, ops);
1081 	}
1082 	unregister_netdevice_many(&list);
1083 	rtnl_unlock();
1084 }
1085 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1086 
1087 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1088 		      struct ip_tunnel_parm *p, __u32 fwmark)
1089 {
1090 	struct ip_tunnel *nt;
1091 	struct net *net = dev_net(dev);
1092 	struct ip_tunnel_net *itn;
1093 	int mtu;
1094 	int err;
1095 
1096 	nt = netdev_priv(dev);
1097 	itn = net_generic(net, nt->ip_tnl_net_id);
1098 
1099 	if (nt->collect_md) {
1100 		if (rtnl_dereference(itn->collect_md_tun))
1101 			return -EEXIST;
1102 	} else {
1103 		if (ip_tunnel_find(itn, p, dev->type))
1104 			return -EEXIST;
1105 	}
1106 
1107 	nt->net = net;
1108 	nt->parms = *p;
1109 	nt->fwmark = fwmark;
1110 	err = register_netdevice(dev);
1111 	if (err)
1112 		goto err_register_netdevice;
1113 
1114 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1115 		eth_hw_addr_random(dev);
1116 
1117 	mtu = ip_tunnel_bind_dev(dev);
1118 	if (tb[IFLA_MTU]) {
1119 		unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
1120 
1121 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
1122 			    (unsigned int)(max - sizeof(struct iphdr)));
1123 	}
1124 
1125 	err = dev_set_mtu(dev, mtu);
1126 	if (err)
1127 		goto err_dev_set_mtu;
1128 
1129 	ip_tunnel_add(itn, nt);
1130 	return 0;
1131 
1132 err_dev_set_mtu:
1133 	unregister_netdevice(dev);
1134 err_register_netdevice:
1135 	return err;
1136 }
1137 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1138 
1139 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1140 			 struct ip_tunnel_parm *p, __u32 fwmark)
1141 {
1142 	struct ip_tunnel *t;
1143 	struct ip_tunnel *tunnel = netdev_priv(dev);
1144 	struct net *net = tunnel->net;
1145 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1146 
1147 	if (dev == itn->fb_tunnel_dev)
1148 		return -EINVAL;
1149 
1150 	t = ip_tunnel_find(itn, p, dev->type);
1151 
1152 	if (t) {
1153 		if (t->dev != dev)
1154 			return -EEXIST;
1155 	} else {
1156 		t = tunnel;
1157 
1158 		if (dev->type != ARPHRD_ETHER) {
1159 			unsigned int nflags = 0;
1160 
1161 			if (ipv4_is_multicast(p->iph.daddr))
1162 				nflags = IFF_BROADCAST;
1163 			else if (p->iph.daddr)
1164 				nflags = IFF_POINTOPOINT;
1165 
1166 			if ((dev->flags ^ nflags) &
1167 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1168 				return -EINVAL;
1169 		}
1170 	}
1171 
1172 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1173 	return 0;
1174 }
1175 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1176 
1177 int ip_tunnel_init(struct net_device *dev)
1178 {
1179 	struct ip_tunnel *tunnel = netdev_priv(dev);
1180 	struct iphdr *iph = &tunnel->parms.iph;
1181 	int err;
1182 
1183 	dev->needs_free_netdev = true;
1184 	dev->priv_destructor = ip_tunnel_dev_free;
1185 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1186 	if (!dev->tstats)
1187 		return -ENOMEM;
1188 
1189 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1190 	if (err) {
1191 		free_percpu(dev->tstats);
1192 		return err;
1193 	}
1194 
1195 	err = gro_cells_init(&tunnel->gro_cells, dev);
1196 	if (err) {
1197 		dst_cache_destroy(&tunnel->dst_cache);
1198 		free_percpu(dev->tstats);
1199 		return err;
1200 	}
1201 
1202 	tunnel->dev = dev;
1203 	tunnel->net = dev_net(dev);
1204 	strcpy(tunnel->parms.name, dev->name);
1205 	iph->version		= 4;
1206 	iph->ihl		= 5;
1207 
1208 	if (tunnel->collect_md) {
1209 		dev->features |= NETIF_F_NETNS_LOCAL;
1210 		netif_keep_dst(dev);
1211 	}
1212 	return 0;
1213 }
1214 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1215 
1216 void ip_tunnel_uninit(struct net_device *dev)
1217 {
1218 	struct ip_tunnel *tunnel = netdev_priv(dev);
1219 	struct net *net = tunnel->net;
1220 	struct ip_tunnel_net *itn;
1221 
1222 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1223 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1224 	if (itn->fb_tunnel_dev != dev)
1225 		ip_tunnel_del(itn, netdev_priv(dev));
1226 
1227 	dst_cache_reset(&tunnel->dst_cache);
1228 }
1229 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1230 
1231 /* Do least required initialization, rest of init is done in tunnel_init call */
1232 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1233 {
1234 	struct ip_tunnel *tunnel = netdev_priv(dev);
1235 	tunnel->ip_tnl_net_id = net_id;
1236 }
1237 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1238 
1239 MODULE_LICENSE("GPL");
1240