xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision c4f7ac64)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2013 Nicira, Inc.
4  */
5 
6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7 
8 #include <linux/capability.h>
9 #include <linux/module.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/slab.h>
13 #include <linux/uaccess.h>
14 #include <linux/skbuff.h>
15 #include <linux/netdevice.h>
16 #include <linux/in.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 #include <linux/if_arp.h>
20 #include <linux/init.h>
21 #include <linux/in6.h>
22 #include <linux/inetdevice.h>
23 #include <linux/igmp.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rculist.h>
29 #include <linux/err.h>
30 
31 #include <net/sock.h>
32 #include <net/ip.h>
33 #include <net/icmp.h>
34 #include <net/protocol.h>
35 #include <net/ip_tunnels.h>
36 #include <net/arp.h>
37 #include <net/checksum.h>
38 #include <net/dsfield.h>
39 #include <net/inet_ecn.h>
40 #include <net/xfrm.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/rtnetlink.h>
44 #include <net/udp.h>
45 #include <net/dst_metadata.h>
46 
47 #if IS_ENABLED(CONFIG_IPV6)
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52 
53 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
54 {
55 	return hash_32((__force u32)key ^ (__force u32)remote,
56 			 IP_TNL_HASH_BITS);
57 }
58 
59 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
60 				__be16 flags, __be32 key)
61 {
62 	if (p->i_flags & TUNNEL_KEY) {
63 		if (flags & TUNNEL_KEY)
64 			return key == p->i_key;
65 		else
66 			/* key expected, none present */
67 			return false;
68 	} else
69 		return !(flags & TUNNEL_KEY);
70 }
71 
/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets that do not match a configured keyless tunnel
 * will match the fallback tunnel.
 *
 * Given src, dst and key, find the appropriate tunnel for input.
 *
 * The search runs four passes, from most to least specific.  In every
 * pass only devices that are IFF_UP are considered, and a tunnel whose
 * parms.link equals @link is returned immediately.  A tunnel that matches
 * everything except the link is remembered in @cand and returned only
 * after all four passes found no exact link match.  If there is no
 * candidate either, the collect_md tunnel and then the fallback device
 * are tried (each only if IFF_UP); otherwise NULL is returned.
 *
 * The lists are walked with hlist_for_each_entry_rcu() and
 * itn->collect_md_tun is read with rcu_dereference().
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;
	struct net_device *ndev;
	unsigned int hash;

	/* Passes 1 and 2 use the bucket for (key, remote). */
	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: both local and remote addresses match exactly. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			/* Unlike later passes, this overwrites any earlier
			 * candidate found in this same pass.
			 */
			cand = t;
	}

	/* Pass 2: remote matches and the tunnel has no local address set. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 use the bucket for (key, remote == 0). */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: either the tunnel has our local address and no remote,
	 * or local is a multicast address equal to the tunnel's daddr.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Pass 4: tunnels with neither address set.  The key is compared
	 * directly against i_key here, and that comparison is skipped when
	 * the caller passes TUNNEL_NO_KEY in @flags.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* No exact link match: settle for the best non-link match. */
	if (cand)
		return cand;

	/* Next choice: the collect_md tunnel of this netns, if up. */
	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	/* Last resort: the fallback device, if it exists and is up. */
	ndev = READ_ONCE(itn->fb_tunnel_dev);
	if (ndev && ndev->flags & IFF_UP)
		return netdev_priv(ndev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
173 
174 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
175 				    struct ip_tunnel_parm *parms)
176 {
177 	unsigned int h;
178 	__be32 remote;
179 	__be32 i_key = parms->i_key;
180 
181 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
182 		remote = parms->iph.daddr;
183 	else
184 		remote = 0;
185 
186 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
187 		i_key = 0;
188 
189 	h = ip_tunnel_hash(i_key, remote);
190 	return &itn->tunnels[h];
191 }
192 
193 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
194 {
195 	struct hlist_head *head = ip_bucket(itn, &t->parms);
196 
197 	if (t->collect_md)
198 		rcu_assign_pointer(itn->collect_md_tun, t);
199 	hlist_add_head_rcu(&t->hash_node, head);
200 }
201 
202 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
203 {
204 	if (t->collect_md)
205 		rcu_assign_pointer(itn->collect_md_tun, NULL);
206 	hlist_del_init_rcu(&t->hash_node);
207 }
208 
209 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
210 					struct ip_tunnel_parm *parms,
211 					int type)
212 {
213 	__be32 remote = parms->iph.daddr;
214 	__be32 local = parms->iph.saddr;
215 	__be32 key = parms->i_key;
216 	__be16 flags = parms->i_flags;
217 	int link = parms->link;
218 	struct ip_tunnel *t = NULL;
219 	struct hlist_head *head = ip_bucket(itn, parms);
220 
221 	hlist_for_each_entry_rcu(t, head, hash_node) {
222 		if (local == t->parms.iph.saddr &&
223 		    remote == t->parms.iph.daddr &&
224 		    link == t->parms.link &&
225 		    type == t->dev->type &&
226 		    ip_tunnel_key_match(&t->parms, flags, key))
227 			break;
228 	}
229 	return t;
230 }
231 
232 static struct net_device *__ip_tunnel_create(struct net *net,
233 					     const struct rtnl_link_ops *ops,
234 					     struct ip_tunnel_parm *parms)
235 {
236 	int err;
237 	struct ip_tunnel *tunnel;
238 	struct net_device *dev;
239 	char name[IFNAMSIZ];
240 
241 	err = -E2BIG;
242 	if (parms->name[0]) {
243 		if (!dev_valid_name(parms->name))
244 			goto failed;
245 		strlcpy(name, parms->name, IFNAMSIZ);
246 	} else {
247 		if (strlen(ops->kind) > (IFNAMSIZ - 3))
248 			goto failed;
249 		strcpy(name, ops->kind);
250 		strcat(name, "%d");
251 	}
252 
253 	ASSERT_RTNL();
254 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
255 	if (!dev) {
256 		err = -ENOMEM;
257 		goto failed;
258 	}
259 	dev_net_set(dev, net);
260 
261 	dev->rtnl_link_ops = ops;
262 
263 	tunnel = netdev_priv(dev);
264 	tunnel->parms = *parms;
265 	tunnel->net = net;
266 
267 	err = register_netdevice(dev);
268 	if (err)
269 		goto failed_free;
270 
271 	return dev;
272 
273 failed_free:
274 	free_netdev(dev);
275 failed:
276 	return ERR_PTR(err);
277 }
278 
279 static int ip_tunnel_bind_dev(struct net_device *dev)
280 {
281 	struct net_device *tdev = NULL;
282 	struct ip_tunnel *tunnel = netdev_priv(dev);
283 	const struct iphdr *iph;
284 	int hlen = LL_MAX_HEADER;
285 	int mtu = ETH_DATA_LEN;
286 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
287 
288 	iph = &tunnel->parms.iph;
289 
290 	/* Guess output device to choose reasonable mtu and needed_headroom */
291 	if (iph->daddr) {
292 		struct flowi4 fl4;
293 		struct rtable *rt;
294 
295 		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
296 				    iph->saddr, tunnel->parms.o_key,
297 				    RT_TOS(iph->tos), tunnel->parms.link,
298 				    tunnel->fwmark, 0);
299 		rt = ip_route_output_key(tunnel->net, &fl4);
300 
301 		if (!IS_ERR(rt)) {
302 			tdev = rt->dst.dev;
303 			ip_rt_put(rt);
304 		}
305 		if (dev->type != ARPHRD_ETHER)
306 			dev->flags |= IFF_POINTOPOINT;
307 
308 		dst_cache_reset(&tunnel->dst_cache);
309 	}
310 
311 	if (!tdev && tunnel->parms.link)
312 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
313 
314 	if (tdev) {
315 		hlen = tdev->hard_header_len + tdev->needed_headroom;
316 		mtu = min(tdev->mtu, IP_MAX_MTU);
317 	}
318 
319 	dev->needed_headroom = t_hlen + hlen;
320 	mtu -= t_hlen;
321 
322 	if (mtu < IPV4_MIN_MTU)
323 		mtu = IPV4_MIN_MTU;
324 
325 	return mtu;
326 }
327 
328 static struct ip_tunnel *ip_tunnel_create(struct net *net,
329 					  struct ip_tunnel_net *itn,
330 					  struct ip_tunnel_parm *parms)
331 {
332 	struct ip_tunnel *nt;
333 	struct net_device *dev;
334 	int t_hlen;
335 	int mtu;
336 	int err;
337 
338 	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
339 	if (IS_ERR(dev))
340 		return ERR_CAST(dev);
341 
342 	mtu = ip_tunnel_bind_dev(dev);
343 	err = dev_set_mtu(dev, mtu);
344 	if (err)
345 		goto err_dev_set_mtu;
346 
347 	nt = netdev_priv(dev);
348 	t_hlen = nt->hlen + sizeof(struct iphdr);
349 	dev->min_mtu = ETH_MIN_MTU;
350 	dev->max_mtu = IP_MAX_MTU - t_hlen;
351 	ip_tunnel_add(itn, nt);
352 	return nt;
353 
354 err_dev_set_mtu:
355 	unregister_netdevice(dev);
356 	return ERR_PTR(err);
357 }
358 
/* Common receive path for a packet that has been matched to @tunnel.
 *
 * @tunnel:        tunnel the packet was matched to
 * @skb:           the received packet; always consumed by this function
 * @tpi:           parsed tunnel header information (flags, seq)
 * @tun_dst:       optional metadata dst; attached to the skb on success,
 *                 released on drop
 * @log_ecn_error: whether to log (rate limited) ECN decapsulation errors
 *
 * Validates the checksum and sequence flags against the tunnel
 * configuration, performs ECN decapsulation, updates statistics and hands
 * the packet to the tunnel's GRO cells.
 *
 * Always returns 0, whether the packet was delivered or dropped.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	/* Outer IP header; read before the network header is reset below. */
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* The packet's checksum flag must agree with the tunnel's setting. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		/* Drop if the packet has no sequence number, or if its number
		 * is behind the next expected one (signed 32-bit difference).
		 * The second test is skipped while i_seqno is still 0.
		 */
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		/* Only results greater than 1 cause the packet to be dropped. */
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	dev_sw_netstats_rx_add(tunnel->dev, skb->len);
	/* Second argument is true when the tunnel's link netns differs from
	 * the netns of the tunnel device.
	 */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	/* tun_dst was not attached to the skb, so release it separately. */
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
427 
428 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
429 			    unsigned int num)
430 {
431 	if (num >= MAX_IPTUN_ENCAP_OPS)
432 		return -ERANGE;
433 
434 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
435 			&iptun_encaps[num],
436 			NULL, ops) ? 0 : -1;
437 }
438 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
439 
440 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
441 			    unsigned int num)
442 {
443 	int ret;
444 
445 	if (num >= MAX_IPTUN_ENCAP_OPS)
446 		return -ERANGE;
447 
448 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
449 		       &iptun_encaps[num],
450 		       ops, NULL) == ops) ? 0 : -1;
451 
452 	synchronize_net();
453 
454 	return ret;
455 }
456 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
457 
458 int ip_tunnel_encap_setup(struct ip_tunnel *t,
459 			  struct ip_tunnel_encap *ipencap)
460 {
461 	int hlen;
462 
463 	memset(&t->encap, 0, sizeof(t->encap));
464 
465 	hlen = ip_encap_hlen(ipencap);
466 	if (hlen < 0)
467 		return hlen;
468 
469 	t->encap.type = ipencap->type;
470 	t->encap.sport = ipencap->sport;
471 	t->encap.dport = ipencap->dport;
472 	t->encap.flags = ipencap->flags;
473 
474 	t->encap_hlen = hlen;
475 	t->hlen = t->encap_hlen + t->tun_hlen;
476 
477 	return 0;
478 }
479 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
480 
/* Update path MTU state for a packet about to be tunnelled and report
 * packets that are too big.
 *
 * @dev:         the tunnel device
 * @skb:         packet being transmitted
 * @rt:          route chosen for the outer packet
 * @df:          DF setting of the outer header (non-zero means set)
 * @inner_iph:   inner network header, viewed as an IPv4 header
 * @tunnel_hlen: tunnel header length; only used when @md is true,
 *               otherwise tunnel->hlen is used instead
 * @dst:         outer destination; only used when @md is true, otherwise
 *               the tunnel's configured daddr is used
 * @md:          true when called from the metadata-based transmit path
 *
 * Returns 0 if the packet may be sent, or -E2BIG after an ICMP/ICMPv6
 * "too big" error has been issued for it.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph,
			    int tunnel_hlen, __be32 dst, bool md)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size;
	int mtu;

	tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
	pkt_size = skb->len - tunnel_hlen;

	/* With DF set the limit is the outer route's MTU minus the outer IP
	 * and tunnel headers; otherwise it is the MTU of the skb's own dst
	 * (or of the tunnel device when the skb has no valid dst).
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
	else
		mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_valid_dst(skb))
		skb_dst_update_pmtu_no_confirm(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Non-GSO inner IPv4 packet with DF set that does not fit. */
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6;
		__be32 daddr;

		rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
					   NULL;
		daddr = md ? dst : tunnel->parms.iph.daddr;

		/* Lower the MTU metric on the inner IPv6 route, but only when
		 * the tunnel has a unicast destination or the route is a
		 * /128 (host) route.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((daddr && !ipv4_is_multicast(daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		/* Non-GSO inner IPv6 packet that does not fit. */
		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
536 
/* Transmit @skb through tunnel device @dev using the per-packet tunnel
 * metadata attached to the skb (rather than the device's configured
 * parameters).
 *
 * @skb:         packet to send; always consumed
 * @dev:         the tunnel device
 * @proto:       IP protocol number for the outer header
 * @tunnel_hlen: tunnel header length, passed on to tnl_update_pmtu()
 *
 * On any failure the packet is freed and tx_errors or tx_dropped is
 * incremented on @dev.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		       u8 proto, int tunnel_hlen)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;
	bool use_cache;

	/* Requires IPv4 transmit metadata on the skb. */
	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	/* A key tos of exactly 1 takes the TOS from the inner header. */
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
			    tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
			    0, skb->mark, skb_get_hash(skb));
	/* Any configured encapsulation type is rejected on this path. */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;

	/* Try the dst cache in the metadata before doing a route lookup. */
	use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
	if (use_cache)
		rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);
		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
	}
	/* The route points back at this tunnel device: refuse to loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
			    key->u.ipv4.dst, true)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	/* A key ttl of 0 takes the TTL from the inner header, or from the
	 * route's hop limit for non-IP payloads.
	 */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* dev->needed_headroom is only ever raised here, never lowered. */
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
629 
/* Transmit @skb through tunnel device @dev using the outer header template
 * @tnl_params.
 *
 * @skb:        packet to send; always consumed
 * @dev:        the tunnel device
 * @tnl_params: template for the outer IPv4 header (daddr, saddr, tos, ttl,
 *              frag_off are read)
 * @protocol:   IP protocol number for the outer header; may be changed by
 *              ip_tunnel_encap()
 *
 * When the template has no destination (NBMA tunnel) the outer destination
 * is derived per packet: from transmit tunnel metadata, from the inner
 * IPv4 route's next hop, or from an IPv4-compatible IPv6 neighbour/
 * destination address.
 *
 * On failure the packet is freed and a tx error or drop counter on @dev is
 * incremented.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_info *tun_info = NULL;
	const struct iphdr *inner_iph;
	unsigned int max_headroom;	/* The extra header space needed */
	struct rtable *rt = NULL;		/* Route to the other host */
	bool use_cache = false;
	struct flowi4 fl4;
	bool md = false;
	bool connected;
	u8 tos, ttl;
	__be32 dst;
	__be16 df;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	/* "connected" gates use of a dst cache further down; it is cleared
	 * whenever the outer destination or TOS depends on the packet.
	 */
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		/* Preferred source: IPv4 transmit metadata with a
		 * destination set.
		 */
		tun_info = skb_tunnel_info(skb);
		if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
		    ip_tunnel_info_af(tun_info) == AF_INET &&
		    tun_info->key.u.ipv4.dst) {
			dst = tun_info->key.u.ipv4.dst;
			md = true;
			connected = true;
		}
		else if (skb->protocol == htons(ETH_P_IP)) {
			/* Inner IPv4: use the next hop of the inner route.
			 * rt is only borrowed here; it is overwritten before
			 * the route lookup below because connected is false.
			 */
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			/* Unspecified neighbour address: fall back to the
			 * packet's own IPv6 destination.
			 */
			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible addresses yield a destination:
			 * the last 32 bits are used as the IPv4 address.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			/* Drop the neighbour reference before any error exit. */
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		if (!md)
			connected = false;
	}

	tos = tnl_params->tos;
	/* Low bit set in the template TOS: take the TOS from the inner
	 * header for IPv4/IPv6 payloads.
	 */
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark, skb_get_hash(skb));

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	/* Cached route: from the metadata's dst cache for metadata-driven
	 * packets, from the tunnel's own dst cache for connected tunnels,
	 * none otherwise.
	 */
	if (connected && md) {
		use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
		if (use_cache)
			rt = dst_cache_get_ip4(&tun_info->dst_cache,
					       &fl4.saddr);
	} else {
		rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
						&fl4.saddr) : NULL;
	}

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (use_cache)
			dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
					  fl4.saddr);
		else if (!md && connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* The route points back at this tunnel device: refuse to loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	/* Outer DF: from the template, plus the inner IPv4 DF bit unless
	 * the tunnel is configured to ignore it.
	 */
	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* While errors are pending and recent (within IPTUNNEL_ERR_TIMEO of
	 * err_time), signal link failure for this packet and consume one
	 * count; once stale, clear the counter.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	/* A template ttl of 0 takes the TTL from the inner header, or from
	 * the route's hop limit for other payloads.
	 */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* dev->needed_headroom is only ever raised here, never lowered. */
	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
817 
/* Apply new parameters @p to the existing tunnel @t / device @dev.
 *
 * @itn:     per-netns tunnel state holding the hash table
 * @t:       tunnel to modify
 * @dev:     net_device of the tunnel
 * @p:       new parameters
 * @set_mtu: when the link or fwmark changes, also store the recomputed
 *           MTU in dev->mtu
 * @fwmark:  new firewall mark
 *
 * The tunnel is removed from the hash table while its addresses and keys
 * change and is re-added afterwards, since ip_bucket() derives the bucket
 * from those fields.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	/* Non-Ethernet tunnels use the 4-byte IPv4 endpoints as the device
	 * and broadcast addresses.
	 */
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	/* Rebind (headroom/MTU recomputation) only when the underlying
	 * link or the fwmark changed.
	 */
	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
852 
/* Handle the tunnel control commands SIOCGETTUNNEL, SIOCADDTUNNEL,
 * SIOCCHGTUNNEL and SIOCDELTUNNEL for device @dev.
 *
 * @dev: device the request was issued on
 * @p:   tunnel parameters; input for all commands, overwritten with the
 *       tunnel's parameters by SIOCGETTUNNEL (and normalised in place by
 *       ADD/CHG)
 * @cmd: one of the commands above
 *
 * Returns 0 on success or a negative errno:
 *   -EPERM   caller lacks CAP_NET_ADMIN in the netns' user namespace
 *            (ADD/CHG/DEL), or DEL targets the fallback tunnel itself
 *   -EEXIST  ADD of an existing tunnel, or CHG would collide with a
 *            different device
 *   -EINVAL  CHG would flip point-to-point/broadcast mode, or unknown cmd
 *   -ENOENT  no matching tunnel for CHG/DEL issued on the fallback device
 *   or the error from ip_tunnel_create().
 */
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, @p selects which tunnel to report;
		 * if none matches, the fallback tunnel itself is reported.
		 */
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL forces DF on the outer header. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Except for VTI, keys are zeroed when the corresponding
		 * TUNNEL_KEY flag is not set.
		 */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* The new parameters already belong to
				 * another device.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* The change must not alter the device's
				 * POINTOPOINT/BROADCAST flags.
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		/* t is NULL here only for a change issued on the fallback
		 * device that matched no tunnel.
		 */
		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		/* On the fallback device, @p selects the tunnel to delete;
		 * the fallback tunnel itself cannot be deleted this way.
		 */
		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
954 
955 int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
956 {
957 	struct ip_tunnel_parm p;
958 	int err;
959 
960 	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
961 		return -EFAULT;
962 	err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
963 	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
964 		return -EFAULT;
965 	return err;
966 }
967 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
968 
969 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
970 {
971 	struct ip_tunnel *tunnel = netdev_priv(dev);
972 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
973 	int max_mtu = IP_MAX_MTU - t_hlen;
974 
975 	if (new_mtu < ETH_MIN_MTU)
976 		return -EINVAL;
977 
978 	if (new_mtu > max_mtu) {
979 		if (strict)
980 			return -EINVAL;
981 
982 		new_mtu = max_mtu;
983 	}
984 
985 	dev->mtu = new_mtu;
986 	return 0;
987 }
988 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
989 
990 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
991 {
992 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
993 }
994 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
995 
996 static void ip_tunnel_dev_free(struct net_device *dev)
997 {
998 	struct ip_tunnel *tunnel = netdev_priv(dev);
999 
1000 	gro_cells_destroy(&tunnel->gro_cells);
1001 	dst_cache_destroy(&tunnel->dst_cache);
1002 	free_percpu(dev->tstats);
1003 }
1004 
1005 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
1006 {
1007 	struct ip_tunnel *tunnel = netdev_priv(dev);
1008 	struct ip_tunnel_net *itn;
1009 
1010 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
1011 
1012 	if (itn->fb_tunnel_dev != dev) {
1013 		ip_tunnel_del(itn, netdev_priv(dev));
1014 		unregister_netdevice_queue(dev, head);
1015 	}
1016 }
1017 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
1018 
1019 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
1020 {
1021 	struct ip_tunnel *tunnel = netdev_priv(dev);
1022 
1023 	return tunnel->net;
1024 }
1025 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1026 
1027 int ip_tunnel_get_iflink(const struct net_device *dev)
1028 {
1029 	struct ip_tunnel *tunnel = netdev_priv(dev);
1030 
1031 	return tunnel->parms.link;
1032 }
1033 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1034 
1035 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1036 				  struct rtnl_link_ops *ops, char *devname)
1037 {
1038 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1039 	struct ip_tunnel_parm parms;
1040 	unsigned int i;
1041 
1042 	itn->rtnl_link_ops = ops;
1043 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1044 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1045 
1046 	if (!ops || !net_has_fallback_tunnels(net)) {
1047 		struct ip_tunnel_net *it_init_net;
1048 
1049 		it_init_net = net_generic(&init_net, ip_tnl_net_id);
1050 		itn->type = it_init_net->type;
1051 		itn->fb_tunnel_dev = NULL;
1052 		return 0;
1053 	}
1054 
1055 	memset(&parms, 0, sizeof(parms));
1056 	if (devname)
1057 		strlcpy(parms.name, devname, IFNAMSIZ);
1058 
1059 	rtnl_lock();
1060 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1061 	/* FB netdevice is special: we have one, and only one per netns.
1062 	 * Allowing to move it to another netns is clearly unsafe.
1063 	 */
1064 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1065 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1066 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1067 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1068 		itn->type = itn->fb_tunnel_dev->type;
1069 	}
1070 	rtnl_unlock();
1071 
1072 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1073 }
1074 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1075 
1076 static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
1077 			      struct list_head *head,
1078 			      struct rtnl_link_ops *ops)
1079 {
1080 	struct net_device *dev, *aux;
1081 	int h;
1082 
1083 	for_each_netdev_safe(net, dev, aux)
1084 		if (dev->rtnl_link_ops == ops)
1085 			unregister_netdevice_queue(dev, head);
1086 
1087 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1088 		struct ip_tunnel *t;
1089 		struct hlist_node *n;
1090 		struct hlist_head *thead = &itn->tunnels[h];
1091 
1092 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1093 			/* If dev is in the same netns, it has already
1094 			 * been added to the list by the previous loop.
1095 			 */
1096 			if (!net_eq(dev_net(t->dev), net))
1097 				unregister_netdevice_queue(t->dev, head);
1098 	}
1099 }
1100 
1101 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1102 			   struct rtnl_link_ops *ops)
1103 {
1104 	struct ip_tunnel_net *itn;
1105 	struct net *net;
1106 	LIST_HEAD(list);
1107 
1108 	rtnl_lock();
1109 	list_for_each_entry(net, net_list, exit_list) {
1110 		itn = net_generic(net, id);
1111 		ip_tunnel_destroy(net, itn, &list, ops);
1112 	}
1113 	unregister_netdevice_many(&list);
1114 	rtnl_unlock();
1115 }
1116 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1117 
1118 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1119 		      struct ip_tunnel_parm *p, __u32 fwmark)
1120 {
1121 	struct ip_tunnel *nt;
1122 	struct net *net = dev_net(dev);
1123 	struct ip_tunnel_net *itn;
1124 	int mtu;
1125 	int err;
1126 
1127 	nt = netdev_priv(dev);
1128 	itn = net_generic(net, nt->ip_tnl_net_id);
1129 
1130 	if (nt->collect_md) {
1131 		if (rtnl_dereference(itn->collect_md_tun))
1132 			return -EEXIST;
1133 	} else {
1134 		if (ip_tunnel_find(itn, p, dev->type))
1135 			return -EEXIST;
1136 	}
1137 
1138 	nt->net = net;
1139 	nt->parms = *p;
1140 	nt->fwmark = fwmark;
1141 	err = register_netdevice(dev);
1142 	if (err)
1143 		goto err_register_netdevice;
1144 
1145 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1146 		eth_hw_addr_random(dev);
1147 
1148 	mtu = ip_tunnel_bind_dev(dev);
1149 	if (tb[IFLA_MTU]) {
1150 		unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
1151 
1152 		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
1153 	}
1154 
1155 	err = dev_set_mtu(dev, mtu);
1156 	if (err)
1157 		goto err_dev_set_mtu;
1158 
1159 	ip_tunnel_add(itn, nt);
1160 	return 0;
1161 
1162 err_dev_set_mtu:
1163 	unregister_netdevice(dev);
1164 err_register_netdevice:
1165 	return err;
1166 }
1167 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1168 
1169 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1170 			 struct ip_tunnel_parm *p, __u32 fwmark)
1171 {
1172 	struct ip_tunnel *t;
1173 	struct ip_tunnel *tunnel = netdev_priv(dev);
1174 	struct net *net = tunnel->net;
1175 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1176 
1177 	if (dev == itn->fb_tunnel_dev)
1178 		return -EINVAL;
1179 
1180 	t = ip_tunnel_find(itn, p, dev->type);
1181 
1182 	if (t) {
1183 		if (t->dev != dev)
1184 			return -EEXIST;
1185 	} else {
1186 		t = tunnel;
1187 
1188 		if (dev->type != ARPHRD_ETHER) {
1189 			unsigned int nflags = 0;
1190 
1191 			if (ipv4_is_multicast(p->iph.daddr))
1192 				nflags = IFF_BROADCAST;
1193 			else if (p->iph.daddr)
1194 				nflags = IFF_POINTOPOINT;
1195 
1196 			if ((dev->flags ^ nflags) &
1197 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1198 				return -EINVAL;
1199 		}
1200 	}
1201 
1202 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1203 	return 0;
1204 }
1205 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1206 
1207 int ip_tunnel_init(struct net_device *dev)
1208 {
1209 	struct ip_tunnel *tunnel = netdev_priv(dev);
1210 	struct iphdr *iph = &tunnel->parms.iph;
1211 	int err;
1212 
1213 	dev->needs_free_netdev = true;
1214 	dev->priv_destructor = ip_tunnel_dev_free;
1215 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1216 	if (!dev->tstats)
1217 		return -ENOMEM;
1218 
1219 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1220 	if (err) {
1221 		free_percpu(dev->tstats);
1222 		return err;
1223 	}
1224 
1225 	err = gro_cells_init(&tunnel->gro_cells, dev);
1226 	if (err) {
1227 		dst_cache_destroy(&tunnel->dst_cache);
1228 		free_percpu(dev->tstats);
1229 		return err;
1230 	}
1231 
1232 	tunnel->dev = dev;
1233 	tunnel->net = dev_net(dev);
1234 	strcpy(tunnel->parms.name, dev->name);
1235 	iph->version		= 4;
1236 	iph->ihl		= 5;
1237 
1238 	if (tunnel->collect_md)
1239 		netif_keep_dst(dev);
1240 	return 0;
1241 }
1242 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1243 
1244 void ip_tunnel_uninit(struct net_device *dev)
1245 {
1246 	struct ip_tunnel *tunnel = netdev_priv(dev);
1247 	struct net *net = tunnel->net;
1248 	struct ip_tunnel_net *itn;
1249 
1250 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1251 	ip_tunnel_del(itn, netdev_priv(dev));
1252 	if (itn->fb_tunnel_dev == dev)
1253 		WRITE_ONCE(itn->fb_tunnel_dev, NULL);
1254 
1255 	dst_cache_reset(&tunnel->dst_cache);
1256 }
1257 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1258 
1259 /* Do least required initialization, rest of init is done in tunnel_init call */
1260 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1261 {
1262 	struct ip_tunnel *tunnel = netdev_priv(dev);
1263 	tunnel->ip_tnl_net_id = net_id;
1264 }
1265 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1266 
1267 MODULE_LICENSE("GPL");
1268