xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 82003e04)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}
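
/* Illustration (not part of the original file): a summary of the
 * matching semantics above, with hypothetical values.
 *
 *	p->i_flags has TUNNEL_KEY, packet flags have TUNNEL_KEY:
 *		match iff packet key == p->i_key
 *	p->i_flags has TUNNEL_KEY, packet flags lack TUNNEL_KEY:
 *		no match (key expected, none present)
 *	p->i_flags lacks TUNNEL_KEY:
 *		match iff packet flags also lack TUNNEL_KEY
 */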

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
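
/* Usage sketch (hypothetical, not part of the original file): callers
 * run in the receive path under rcu_read_lock(), e.g. keyless lookup in
 * the style of ipip_rcv(); "net" and "ipip_net_id" are assumed here.
 *
 *	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
 *	const struct iphdr *iph = ip_hdr(skb);
 *	struct ip_tunnel *t;
 *
 *	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 *			     iph->saddr, iph->daddr, 0);
 */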

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
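
/* Worked example (illustrative numbers, not from the original file):
 * for a plain IPIP tunnel over Ethernet, tunnel->hlen == 0, so
 * t_hlen = 0 + 20 (outer struct iphdr). With tdev->mtu == 1500 and
 * dev->hard_header_len == 0, ip_tunnel_bind_dev() returns
 * 1500 - (0 + 20) = 1480, the classic IPIP MTU. The 68-byte floor is
 * the IPv4 minimum MTU from RFC 791.
 */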

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
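
/* Usage sketch (hypothetical, not part of the original file): once a
 * tunnel is found, a decap handler hands the packet over roughly the
 * way ipip_rcv() does; ip_tunnel_rcv() then checks csum/seq flags,
 * updates stats and feeds the skb to gro_cells. "tpi" is assumed to
 * describe the outer protocol.
 *
 *	if (tunnel) {
 *		if (iptunnel_pull_header(skb, 0, tpi.proto, false))
 *			goto drop;
 *		return ip_tunnel_rcv(tunnel, skb, &tpi, NULL,
 *				     log_ecn_error);
 *	}
 */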

int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
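
/* Usage sketch (hypothetical, not part of the original file): an encap
 * module claims a slot at init and releases it on exit, the way
 * net/ipv4/fou.c registers TUNNEL_ENCAP_FOU. The cmpxchg() above makes
 * registration fail with -1 if the slot is already occupied. The
 * "my_*" names are assumptions.
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 *	...
 *	ip_tunnel_encap_del_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 */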

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
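
/* Worked example (illustrative numbers, not from the original file):
 * with DF set, a route MTU of 1500, no extra encap (tunnel->hlen == 0)
 * and dev->hard_header_len == 0, the usable inner MTU above is
 * 1500 - 20 (outer IPv4 header) = 1480. A 1500-byte inner IPv4 packet
 * carrying IP_DF then triggers ICMP_FRAG_NEEDED quoting 1480, and
 * tnl_update_pmtu() returns -E2BIG so the caller drops the packet.
 */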

void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link);
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
		      key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
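
/* Usage sketch (hypothetical, not part of the original file): a tunnel
 * driver's ndo_start_xmit typically validates the skb and then defers
 * to ip_tunnel_xmit(), much like ipip_tunnel_xmit(); "my_tunnel_xmit"
 * is an assumed name.
 *
 *	static netdev_tx_t my_tunnel_xmit(struct sk_buff *skb,
 *					  struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *		const struct iphdr *tiph = &tunnel->parms.iph;
 *
 *		ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP);
 *		return NETDEV_TX_OK;
 *	}
 */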

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
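
/* Usage sketch (hypothetical, not part of the original file): a driver's
 * ndo_do_ioctl copies struct ip_tunnel_parm from userspace, delegates to
 * ip_tunnel_ioctl() and copies the result back, in the style of
 * ipip_tunnel_ioctl():
 *
 *	struct ip_tunnel_parm p;
 *	int err;
 *
 *	if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 *		return -EFAULT;
 *	err = ip_tunnel_ioctl(dev, &p, cmd);
 *	if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 *		return -EFAULT;
 *	return err;
 */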

int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < 68)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to move to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
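
/* Usage sketch (hypothetical, not part of the original file): a tunnel
 * module's pernet init hands its link ops and fallback device name to
 * ip_tunnel_init_net(); net/ipv4/ipip.c does exactly this with "tunl0".
 * The "my_*" names are assumed.
 *
 *	static int __net_init my_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, my_net_id, &my_link_ops,
 *					  "tunl0");
 *	}
 *
 * The matching pernet exit path calls ip_tunnel_delete_net(), defined
 * below.
 */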

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
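
/* Usage sketch (hypothetical, not part of the original file): an rtnl
 * ->newlink callback parses its netlink attributes into a
 * struct ip_tunnel_parm and then defers to ip_tunnel_newlink(), in the
 * style of ipip_newlink(); "my_newlink" and "my_netlink_parms" are
 * assumed names.
 *
 *	static int my_newlink(struct net *src_net, struct net_device *dev,
 *			      struct nlattr *tb[], struct nlattr *data[])
 *	{
 *		struct ip_tunnel_parm p;
 *
 *		my_netlink_parms(data, &p);
 *		return ip_tunnel_newlink(dev, tb, &p);
 *	}
 */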

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
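
/* Usage sketch (hypothetical, not part of the original file): drivers
 * wire ip_tunnel_init() in as their ndo_init and do per-type setup on
 * top, roughly as ipip_tunnel_init() does; "my_tunnel_init" is an
 * assumed name.
 *
 *	static int my_tunnel_init(struct net_device *dev)
 *	{
 *		struct ip_tunnel *tunnel = netdev_priv(dev);
 *
 *		memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
 *		memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 *		tunnel->tun_hlen = 0;
 *		tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
 *		return ip_tunnel_init(dev);
 *	}
 */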

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest of the init is done
 * in the tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
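
/* Usage sketch (hypothetical, not part of the original file): a driver's
 * rtnl_link_ops ->setup callback records its pernet id here before
 * ip_tunnel_init() runs, the way ipip_tunnel_setup() does; the "my_*"
 * names are assumed.
 *
 *	static void my_tunnel_setup(struct net_device *dev)
 *	{
 *		dev->netdev_ops = &my_netdev_ops;
 *		ip_tunnel_setup(dev, my_net_id);
 *	}
 */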

MODULE_LICENSE("GPL");