/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

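/* Hash a tunnel's (key, remote address) pair into one of the
 * 1 << IP_TNL_HASH_BITS buckets.  XOR-ing the two __be32 values and
 * folding with hash_32() means tunnels that differ only in key or
 * only in remote address generally land in different buckets.
 */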
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

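/* Key matching is strict in both directions:
 *
 *	tunnel has key	packet has key	result
 *	--------------	--------------	------------------------
 *	yes		yes		match iff keys are equal
 *	yes		no		no match
 *	no		yes		no match
 *	no		no		match
 */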
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the incoming
   packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
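
/* A typical caller is a tunnel protocol's receive handler.  A hedged
 * sketch, simplified from the GRE receive path (names as in ip_gre.c):
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *	if (tunnel)
 *		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 */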

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

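/* Insertion and removal are RCU-safe: ip_tunnel_lookup() walks the
 * hash chains under rcu_read_lock(), so writers publish with
 * hlist_add_head_rcu()/rcu_assign_pointer() and unlink with
 * hlist_del_init_rcu().  At most one collect_md tunnel exists per
 * ip_tunnel_net; it is tracked separately in itn->collect_md_tun.
 */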
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

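/* Guess the underlay device for this tunnel and derive an MTU from it.
 * The usable MTU is the underlay MTU minus the device's own link-layer
 * header and the tunnel headers.  A rough worked example, assuming a
 * plain GRE tunnel over 1500-byte Ethernet: tunnel->hlen = 4 (base GRE
 * header), t_hlen = 4 + 20 (outer IPv4 header), hard_header_len = 0,
 * so the tunnel MTU comes out to 1500 - 24 = 1476.  The result is
 * clamped to 68, the minimum MTU IPv4 guarantees.
 */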
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;
}

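/* Common receive path for IP tunnels.  The caller has already matched
 * the packet to a tunnel; here we validate the checksum/sequence flags
 * against the tunnel configuration, undo the outer ECN marking, bump
 * the per-CPU stats, scrub the skb when crossing netns boundaries, and
 * hand the packet to GRO via the tunnel's gro_cells.
 */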
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

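/* Encapsulation ops (e.g. FOU/GUE) register themselves in the global
 * iptun_encaps[] array.  cmpxchg() makes registration lock-free: the
 * slot is claimed only if it is still NULL, so two modules racing for
 * the same encap type cannot both succeed.  A hedged sketch of a
 * hypothetical user (my_* names are illustrative only):
 *
 *	static const struct ip_tunnel_encap_ops my_encap_ops = {
 *		.encap_hlen	= my_encap_hlen,
 *		.build_header	= my_build_header,
 *	};
 *
 *	err = ip_tunnel_encap_add_ops(&my_encap_ops, TUNNEL_ENCAP_FOU);
 */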
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

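/* Check whether the inner packet still fits once the outer headers are
 * added, and propagate a reduced PMTU to the inner route.  With DF
 * set, the usable MTU is the route MTU minus the outer IPv4 header and
 * the tunnel header; e.g., assuming dst_mtu() == 1500 and a 4-byte GRE
 * header, inner packets larger than 1476 bytes trigger
 * ICMP_FRAG_NEEDED (or ICMPV6_PKT_TOOBIG for inner IPv6).
 */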
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

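/* Transmit path for metadata-based (collect_md) tunnels: all
 * encapsulation parameters come from the per-packet ip_tunnel_info
 * attached to the skb (e.g. by OVS or an lwtunnel user) rather than
 * from the netdev's own configuration.  tos == 1 and ttl == 0 act as
 * "inherit from the inner packet" wildcards.
 */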
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link);
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	/* Use the ECN-encapsulated tos and the resolved ttl computed
	 * above; passing the raw key->tos/key->ttl would discard both.
	 */
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos,
		      ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

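/* Main transmit path for configured (non-collect_md) tunnels.  In
 * order: resolve an NBMA destination when the tunnel has no fixed
 * daddr, pick tos/ttl (inheriting from the inner packet where
 * configured), route the outer packet (reusing the per-tunnel dst
 * cache for connected tunnels), enforce PMTU, then push the outer
 * header via iptunnel_xmit().
 */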
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

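/* Changing saddr/daddr or the key can move a tunnel to a different
 * hash bucket, so the tunnel is unlinked, its parameters rewritten,
 * and then re-added; RCU readers see either the old or the new
 * position, never a corrupt chain.
 */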
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

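/* 0xFFF8 is the largest datagram size that both fits the 16-bit IPv4
 * total-length field and stays a multiple of 8 (the fragment-offset
 * unit), so the maximum tunnel MTU is that minus the link-layer and
 * tunnel headers.
 */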
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

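/* Per-netns initialization: set up the hash table and, unless ops is
 * NULL, create the fallback device ("gre0", "tunl0", ...) that catches
 * otherwise-unmatched packets in ip_tunnel_lookup().
 */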
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to move to another netns would clearly be unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

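/* rtnl_link_ops->newlink() helper shared by the tunnel drivers:
 * reject duplicates (including a second collect_md device, of which
 * at most one may exist per netns), register the netdev, then bind it
 * to the underlay and insert it into the hash table.
 */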
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

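/* ndo_init() helper: allocate the per-CPU stats, the dst cache and the
 * GRO cells, and pre-fill the outer IPv4 header template.  Runs from
 * register_netdevice(), after ip_tunnel_setup() and the driver's own
 * setup have populated the ip_tunnel private area.
 */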
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest of the init is done
 * in the tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");