/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

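/* Hash a tunnel into one of the 1 << IP_TNL_HASH_BITS buckets of the
 * per-netns table, keyed by (i_key XOR remote endpoint address).
 */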
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate input tunnel.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

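/* Map tunnel parameters to their hash bucket.  Multicast and wildcard
 * destinations hash as remote == 0, and VTI tunnels without TUNNEL_KEY
 * ignore i_key, matching the lookup semantics above.
 */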
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

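/* Insert/remove a tunnel in its hash bucket; writers hold RTNL while
 * readers walk the chains under RCU.  A collect_md tunnel is also
 * published through the single per-netns collect_md_tun pointer.
 */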
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

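/* Exact match on configured parameters (saddr/daddr/link/type/key),
 * used on the configuration path (ioctl/netlink); contrast with
 * ip_tunnel_lookup(), which classifies received packets and accepts
 * wildcard and fallback matches.
 */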
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

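/* Allocate and register a tunnel netdevice.  If no name was supplied,
 * derive a "<kind>%d" template so the core picks a free index.
 */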
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

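/* Probe the route to the tunnel endpoint to guess the underlying
 * device, falling back to parms.link, and derive needed_headroom and
 * a usable MTU (floored at 68) from it.
 */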
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

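/* Common receive path for IPv4 tunnels: police checksum and sequence
 * number flags against the tunnel configuration, decapsulate ECN,
 * update per-cpu stats and hand the inner packet to the GRO cells.
 * Returns 0 in all cases; erroneous packets are dropped and counted.
 */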
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags & TUNNEL_CSUM) && (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
	    ((tpi->flags & TUNNEL_CSUM) && !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags & TUNNEL_SEQ) {
		if (!(tpi->flags & TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

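/* Registry of tunnel encapsulation handlers (e.g. FOU/GUE).  Slots in
 * iptun_encaps[] are claimed and released atomically with cmpxchg();
 * deletion waits for in-flight readers via synchronize_net().
 */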
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

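/* Apply a validated encap configuration to a tunnel and recompute the
 * cached header length (encap header plus tunnel header).
 */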
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

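/* Check the packet against the path MTU of the tunnel route.  Oversized
 * DF/IPv6 packets get ICMP "fragmentation needed" / "packet too big"
 * feedback and -E2BIG; otherwise the inner route's PMTU is updated.
 */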
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

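/* Common transmit path for IPv4 tunnels: resolve the outer destination
 * (for NBMA tunnels it comes from the inner route or neighbour entry),
 * route the outer packet with the per-tunnel dst cache, enforce PMTU,
 * pick tos/ttl/df from the config or the inner headers, and hand off
 * to iptunnel_xmit().
 */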
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off & htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

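/* Re-key an existing tunnel: move it to the bucket matching the new
 * parameters, refresh the address-derived link-layer state and rebind
 * the underlying device if the link changed.
 */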
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

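/* Legacy SIOC{GET,ADD,CHG,DEL}TUNNEL ioctl handler shared by tunnel
 * drivers.  Add/change/delete require CAP_NET_ADMIN in the tunnel's
 * user namespace, and the fallback device itself cannot be deleted.
 */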
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags ^ nflags) &
				    (IFF_POINTOPOINT | IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

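/* Clamp the requested MTU to what fits in an IPv4 datagram (0xFFF8)
 * after subtracting link-layer and tunnel headers.  In strict mode an
 * out-of-range request fails instead of being clamped.
 */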
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < 68)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

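/* Per-netns setup: initialize the hash table and, when an rtnl_link_ops
 * is supplied, create the netns-local fallback device that catches
 * otherwise unmatched packets.  With ops == NULL no fallback is made.
 */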
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

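/* Collect every device of this tunnel type for batched unregistration,
 * including tunnels whose netdevice was moved to a different netns
 * than the one being dismantled.
 */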
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

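/* rtnl newlink backend: reject duplicate configurations (only one
 * collect_md tunnel is allowed per netns), then register the device
 * and insert it into the hash table.
 */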
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

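/* rtnl changelink backend: the fallback device cannot be reconfigured,
 * and the new parameters must not collide with another tunnel or flip
 * the point-to-point/broadcast nature of a non-Ethernet device.
 */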
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

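/* ndo_init-time allocation of per-cpu stats, the dst cache and GRO
 * cells; on failure everything allocated so far is unwound.
 */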
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

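/* ndo_uninit: unhash the tunnel (except the fallback device, which is
 * torn down by the netns exit path) and drop any cached route.
 */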
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");