xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision f6723b56)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

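/* Hash a tunnel by its (key, remote address) pair into the
 * IP_TNL_HASH_BITS-wide per-netns bucket array.
 */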
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

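/* Replace the per-CPU cached output route. Uncacheable (DST_NOCACHE)
 * entries are never stored; the previous entry is swapped out
 * atomically and its reference dropped.
 */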
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);
	}
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
}

static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL);
}

static void tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}

static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		dst_hold(dst);
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

/* Often-modified stats are per-CPU; others are shared (netdev->stats) */
struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
						struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_sw_netstats *tstats =
						   per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;

	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_frame_errors = dev->stats.rx_frame_errors;
	tot->rx_errors = dev->stats.rx_errors;

	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	tot->collisions  = dev->stats.collisions;

	return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the incoming
   packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

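/* A sketch of how a receive path typically drives this lookup, modeled
 * on the GRE demultiplexer of this era; "itn" and "tpi" are the
 * caller's per-netns tunnel table and parsed tunnel header, not
 * defined here:
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *	if (tunnel) {
 *		ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
 *		return PACKET_RCVD;
 *	}
 */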
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	h = ip_tunnel_hash(parms->i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

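/* Exact-match lookup on (saddr, daddr, key, link, dev type), used by
 * the control paths (ioctl and netlink) rather than by packet receive,
 * which uses the looser ip_tunnel_lookup() above.
 */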
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	/* Never go below 68, the minimum IPv4 MTU (RFC 791) */
	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt, *fbt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	fbt = netdev_priv(itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return NULL;

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		/* Looped back packet, drop it! */
		if (rt_is_output_route(skb_rtable(skb)))
			goto drop;
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Packet and tunnel config must agree on checksumming (TUNNEL_CSUM) */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Drop out-of-order packets when sequencing is enabled */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

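/* Propagate path-MTU information to the inner flow and reject packets
 * that no longer fit: IPv4 senders with DF set get ICMP_FRAG_NEEDED,
 * IPv6 senders get ICMPV6_PKT_TOOBIG. Returns -E2BIG when the packet
 * was rejected, 0 otherwise.
 */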
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

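/* Transmit one skb through the tunnel: resolve the outer route (reusing
 * the per-CPU dst cache when the tunnel is connected), enforce PMTU,
 * derive outer tos/ttl/df from tnl_params or from the inner headers,
 * then hand the packet to iptunnel_xmit() for encapsulation.
 */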
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected = true;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		/* The low bit of the configured tos means: inherit the
		 * TOS/dsfield from the inner packet.
		 */
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

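/* Re-parametrize an existing tunnel. The hash bucket depends on i_key
 * and daddr, so the tunnel must be unhashed and re-hashed around the
 * update; all cached routes are invalidated afterwards.
 */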
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

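/* Shared handler for the legacy tunnel ioctls: SIOCGETTUNNEL reads back
 * the parameters, SIOCADDTUNNEL/SIOCCHGTUNNEL create or update a tunnel
 * (CAP_NET_ADMIN required), and SIOCDELTUNNEL removes one. The fallback
 * device itself can be queried but never deleted.
 */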
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == itn->fb_tunnel_dev)
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags&TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags&TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL))
			t = ip_tunnel_create(net, itn, p);

		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	/* Bound the MTU between the IPv4 minimum (68) and the largest
	 * value that, with link and tunnel headers added back, still
	 * fits in 0xFFF8 (65535 rounded down to an 8-byte fragment
	 * boundary).
	 */
	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

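/* A sketch of the typical caller, modeled on how a tunnel module's
 * pernet init uses this helper; "ipgre_net_id", "ipgre_link_ops" and
 * "gre0" stand in for the caller's own identifiers:
 *
 *	static int __net_init ipgre_init_net(struct net *net)
 *	{
 *		return ip_tunnel_init_net(net, ipgre_net_id,
 *					  &ipgre_link_ops, "gre0");
 *	}
 */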
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

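/* Netlink (rtnl_link_ops) counterparts of the ioctl paths above: tunnel
 * drivers call ip_tunnel_newlink() from their ->newlink() and
 * ip_tunnel_changelink() from their ->changelink() implementations.
 */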
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

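/* ndo_init-time setup: allocate the per-CPU stats, dst cache and GRO
 * cells, and seed the outer IP header template (IPv4, 5-word header).
 */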
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int i, err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		struct pcpu_sw_netstats *ipt_stats;
		ipt_stats = per_cpu_ptr(dev->tstats, i);
		u64_stats_init(&ipt_stats->syncp);
	}

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");