xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision b240b419db5d624ce7a5a397d6f62a1a686009ec)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

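/* Hash the tunnel key and remote address into a bucket index for the
 * per-netns tunnel table (IP_TNL_HASH_BITS wide).
 */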
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

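/* A tunnel configured with TUNNEL_KEY matches only packets carrying the
 * same key; a keyless tunnel matches only keyless packets.
 */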
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

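	/* Pass 1: tunnels whose saddr and daddr both match the packet. */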
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

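	/* Pass 2: daddr matches and the tunnel's source is a wildcard. */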
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

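	/* Pass 3: rehash with remote == 0; match tunnels bound only to our
	 * local address, or multicast tunnels whose group address equals
	 * the packet's destination.
	 */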
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

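	/* Pass 4: key-only tunnels with both endpoints left unspecified. */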
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

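/* Pick the hash bucket a tunnel with the given parameters belongs to.
 * Multicast and wildcard destinations hash as remote == 0, and VTI
 * tunnels without TUNNEL_KEY ignore i_key.
 */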
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

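/* Insert/remove a tunnel in its hash bucket; collect_md tunnels are
 * additionally tracked through the single itn->collect_md_tun pointer.
 */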
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

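/* Exact-match lookup by configured parameters, used by the ioctl and
 * netlink paths (as opposed to the per-packet ip_tunnel_lookup()).
 */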
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

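/* Allocate and register a tunnel netdevice for the given parameters,
 * deriving the name from parms->name or from ops->kind plus "%d".
 */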
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

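/* Guess the underlying output device from the tunnel destination (or
 * parms.link) and derive the MTU and needed headroom from it.
 */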
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
				    iph->saddr, tunnel->parms.o_key,
				    RT_TOS(iph->tos), tunnel->parms.link,
				    tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

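/* Create a tunnel, set its MTU from the bound device and link it into
 * the per-netns hash table.
 */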
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

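/* Common receive path: validate checksum/sequence flags against the
 * tunnel configuration, decapsulate ECN, update stats and hand the
 * inner packet to GRO.
 */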
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

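/* Registry of transport encapsulations (e.g. FOU/GUE); slots in
 * iptun_encaps[] are claimed and released atomically with cmpxchg().
 */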
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

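/* Update path MTU on the inner dst and, when the packet does not fit,
 * send the appropriate ICMP(v6) "too big" error back to the sender.
 */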
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

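/* Transmit for metadata-based (collect_md) tunnels: all addressing
 * comes from the per-skb tunnel info rather than the device config.
 */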
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			    RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

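/* Transmit for classical, statically configured tunnels. For NBMA
 * tunnels (daddr == 0) the destination is taken from the inner route
 * or neighbour entry.
 */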
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
			    tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			    tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

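/* Apply new parameters to an existing tunnel, rehashing it since the
 * addresses and key that determine its bucket may have changed.
 */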
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

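/* Legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctl handler. */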
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

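/* Validate an MTU change against the maximum the encapsulation
 * overhead allows; non-strict callers get clamped instead of rejected.
 */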
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

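/* Per-netns setup: initialize the hash table and, where permitted,
 * create the fallback tunnel device.
 */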
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	itn->rtnl_link_ops = ops;
	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops || !net_has_fallback_tunnels(net)) {
		struct ip_tunnel_net *it_init_net;

		it_init_net = net_generic(&init_net, ip_tnl_net_id);
		itn->type = it_init_net->type;
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
		itn->type = itn->fb_tunnel_dev->type;
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
			      struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(net, itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

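/* Netlink link creation: register the device, pick an MTU and add the
 * tunnel to the hash table. Only one collect_md tunnel may exist per
 * netns.
 */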
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

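/* Netlink link change: reject parameters that collide with another
 * tunnel or that would toggle the device's point-to-point/broadcast
 * nature.
 */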
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

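/* ndo_init: allocate per-cpu stats, the dst cache and GRO cells. */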
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

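/* ndo_uninit: unhash the tunnel (except the fallback device) and drop
 * cached routes.
 */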
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");