xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 206204a1)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44 
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64 
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
/* Replace the cached output route in @idst with @dst.
 *
 * Takes its own reference on @dst (unless the route is flagged
 * DST_NOCACHE, in which case the slot is cleared instead, since such
 * routes must not be cached), publishes the new pointer with xchg() so
 * concurrent RCU readers always observe either the old or the new
 * entry, and finally drops the reference the cache held on the old one.
 */
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		/* Uncacheable routes clear the slot rather than fill it. */
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);
	}
	/* Atomic swap: readers never see a half-updated slot. */
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
}
85 
86 static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
87 {
88 	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
89 }
90 
91 static void tunnel_dst_reset(struct ip_tunnel *t)
92 {
93 	tunnel_dst_set(t, NULL);
94 }
95 
96 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
97 {
98 	int i;
99 
100 	for_each_possible_cpu(i)
101 		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
102 }
103 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
104 
/* Fetch the current CPU's cached route, if any and still valid.
 *
 * Returns the cached rtable with a reference held, or NULL when the
 * cache is empty or the entry failed its validity check (in which case
 * this CPU's cache slot is also cleared).  @cookie is passed through to
 * the dst's ->check() hook.
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		/* An obsolete entry must be revalidated before reuse. */
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		dst_hold(dst);
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
122 
123 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
124 				__be16 flags, __be32 key)
125 {
126 	if (p->i_flags & TUNNEL_KEY) {
127 		if (flags & TUNNEL_KEY)
128 			return key == p->i_key;
129 		else
130 			/* key expected, none present */
131 			return false;
132 	} else
133 		return !(flags & TUNNEL_KEY);
134 }
135 
/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if it is not present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets, if not matched against a configured keyless
 * tunnel, will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for input.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact (saddr, daddr, key) match.  An exact link match
	 * wins immediately; otherwise remember the tunnel as a candidate.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: match daddr and key only (wildcard source address). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 search the "no remote" bucket. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: our local address is either the tunnel's source or its
	 * multicast destination.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: key-only match (wildcard addresses). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* Last resort: the per-netns fallback device, if it is up. */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);


	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
232 
233 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
234 				    struct ip_tunnel_parm *parms)
235 {
236 	unsigned int h;
237 	__be32 remote;
238 	__be32 i_key = parms->i_key;
239 
240 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
241 		remote = parms->iph.daddr;
242 	else
243 		remote = 0;
244 
245 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
246 		i_key = 0;
247 
248 	h = ip_tunnel_hash(i_key, remote);
249 	return &itn->tunnels[h];
250 }
251 
252 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
253 {
254 	struct hlist_head *head = ip_bucket(itn, &t->parms);
255 
256 	hlist_add_head_rcu(&t->hash_node, head);
257 }
258 
/* Unlink @t from its hash bucket; safe against concurrent RCU readers. */
static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}
263 
264 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
265 					struct ip_tunnel_parm *parms,
266 					int type)
267 {
268 	__be32 remote = parms->iph.daddr;
269 	__be32 local = parms->iph.saddr;
270 	__be32 key = parms->i_key;
271 	__be16 flags = parms->i_flags;
272 	int link = parms->link;
273 	struct ip_tunnel *t = NULL;
274 	struct hlist_head *head = ip_bucket(itn, parms);
275 
276 	hlist_for_each_entry_rcu(t, head, hash_node) {
277 		if (local == t->parms.iph.saddr &&
278 		    remote == t->parms.iph.daddr &&
279 		    link == t->parms.link &&
280 		    type == t->dev->type &&
281 		    ip_tunnel_key_match(&t->parms, flags, key))
282 			break;
283 	}
284 	return t;
285 }
286 
/* Allocate and register a tunnel netdevice in @net.
 *
 * The device name comes from @parms if set, otherwise "<kind>%d" is
 * used so the core assigns the next free index.  Must be called under
 * RTNL.  Returns the device or an ERR_PTR().
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		/* Need room for the kind plus "%d" and the terminator. */
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
332 
333 static inline void init_tunnel_flow(struct flowi4 *fl4,
334 				    int proto,
335 				    __be32 daddr, __be32 saddr,
336 				    __be32 key, __u8 tos, int oif)
337 {
338 	memset(fl4, 0, sizeof(*fl4));
339 	fl4->flowi4_oif = oif;
340 	fl4->daddr = daddr;
341 	fl4->saddr = saddr;
342 	fl4->flowi4_tos = tos;
343 	fl4->flowi4_proto = proto;
344 	fl4->fl4_gre_key = key;
345 }
346 
/* Probe the underlying device for a tunnel to derive a sensible MTU and
 * needed_headroom.
 *
 * If a destination is configured, route toward it (priming the per-cpu
 * route cache on success); as a fallback, use the device bound via
 * parms.link.  Returns the MTU to use for the tunnel device, clamped to
 * at least 68 (the minimum IPv4 MTU).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* Prime the per-cpu route cache while we're here. */
			tunnel_dst_set(tunnel, &rt->dst);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route (or no daddr): fall back to the explicitly bound link. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Reserve room for both the outer IP header and the link layer. */
	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
394 
395 static struct ip_tunnel *ip_tunnel_create(struct net *net,
396 					  struct ip_tunnel_net *itn,
397 					  struct ip_tunnel_parm *parms)
398 {
399 	struct ip_tunnel *nt;
400 	struct net_device *dev;
401 
402 	BUG_ON(!itn->fb_tunnel_dev);
403 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
404 	if (IS_ERR(dev))
405 		return ERR_CAST(dev);
406 
407 	dev->mtu = ip_tunnel_bind_dev(dev);
408 
409 	nt = netdev_priv(dev);
410 	ip_tunnel_add(itn, nt);
411 	return nt;
412 }
413 
/* Common receive path for IPv4 tunnels.
 *
 * Validates the parsed tunnel header in @tpi against the tunnel's
 * configuration (checksum and sequence-number policy), decapsulates
 * ECN, updates rx statistics, and hands the inner packet to GRO.
 * Consumes the skb in every case and always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Checksum presence must agree with the tunnel's TUNNEL_CSUM flag. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Sequenced tunnels drop packets with missing or stale sequence
	 * numbers (serial-number arithmetic on the 32-bit counter).
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	/* err > 1 means the packet must be dropped (invalid ECN combo). */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub packet state when it crosses a netns boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
482 
/* Check the packet against the tunnel path MTU and propagate changes.
 *
 * Computes the effective inner MTU (from the outer route when DF is
 * set, otherwise from the inner dst or the device), updates the inner
 * dst's PMTU, and emits the appropriate "too big" signal (ICMP
 * frag-needed or ICMPv6 packet-too-big) when a non-GSO packet exceeds
 * it.  Returns 0 if the packet may be sent, -E2BIG otherwise.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the lowered MTU on the IPv6 route when this is a
		 * host route or the tunnel has a fixed unicast endpoint.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
530 
/* Common transmit path for IPv4 tunnels.
 *
 * Resolves the outer destination (for NBMA tunnels the endpoint is
 * derived from the inner packet's route or neighbour entry), selects
 * outer tos/ttl/df, routes the packet — reusing the per-cpu cached
 * route only while the flow still matches the fixed tunnel endpoint
 * ("connected") — enforces PMTU, and hands the packet to
 * iptunnel_xmit().  Consumes the skb on every path; errors are
 * accounted to dev->stats.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		/* Derive the outer destination from the inner packet. */
		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible addresses embed an IPv4
			 * destination we can tunnel to.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Per-packet destination: the cached route can't be used. */
		connected = false;
	}

	/* tos LSB set means "inherit tos from the inner packet". */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		/* Cache the fresh route only for fixed-endpoint flows. */
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	/* Routing back to ourselves would loop the packet. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Rate-limited link-failure feedback after ICMP errors on the
	 * tunnel (err_count is armed by the tunnel's error handler).
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* ttl 0 means "inherit from the inner packet". */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
692 
/* Apply new parameters @p to tunnel @t (caller holds RTNL).
 *
 * The tunnel is unhashed and rehashed because addresses and keys decide
 * its hash bucket.  Non-Ethernet devices mirror saddr/daddr into
 * dev_addr and broadcast.  When the bound link changes, the MTU is
 * recomputed (and applied if @set_mtu).  Finally the per-cpu route
 * cache is flushed and userspace is notified of the device change.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	/* Old cached routes may no longer match the new parameters. */
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
725 
/* Legacy ioctl configuration interface, called under RTNL from the
 * driver's ndo_do_ioctl.
 *
 * SIOCGETTUNNEL: on the fallback device, look up the tunnel described
 *	by @p (defaulting to the device itself) and copy its parameters
 *	back.
 * SIOCADDTUNNEL/SIOCCHGTUNNEL: require CAP_NET_ADMIN.  Keys are
 *	normalized for non-VTI tunnels; a new tunnel is created or a
 *	matching one updated.  Re-pointing a device at a configuration
 *	owned by another device fails with -EEXIST.
 * SIOCDELTUNNEL: requires CAP_NET_ADMIN.  The fallback device itself
 *	cannot be deleted, but addressing it selects the tunnel @p
 *	describes.
 *
 * Returns 0 or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL implies the outer header must carry DF. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			/* Ignore stale key values when keying is off. */
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL)) {
			t = ip_tunnel_create(net, itn, p);
			err = PTR_ERR_OR_ZERO(t);
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Config already owned by another device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				/* New daddr must not flip the device's
				 * point-to-point/broadcast nature.
				 */
				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			/* The fallback tunnel itself is not deletable. */
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
823 
824 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
825 {
826 	struct ip_tunnel *tunnel = netdev_priv(dev);
827 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
828 
829 	if (new_mtu < 68 ||
830 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
831 		return -EINVAL;
832 	dev->mtu = new_mtu;
833 	return 0;
834 }
835 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
836 
/* dev->destructor: release everything ip_tunnel_init() allocated, then
 * the device itself (order matters: per-device state first).
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
846 
847 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
848 {
849 	struct ip_tunnel *tunnel = netdev_priv(dev);
850 	struct ip_tunnel_net *itn;
851 
852 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
853 
854 	if (itn->fb_tunnel_dev != dev) {
855 		ip_tunnel_del(netdev_priv(dev));
856 		unregister_netdevice_queue(dev, head);
857 	}
858 }
859 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
860 
/* Per-netns initialization for one tunnel type.
 *
 * Initializes the hash table and, when @ops is given, creates the
 * netns-local fallback device named @devname under RTNL.  The fallback
 * device is marked NETNS_LOCAL because there is exactly one per netns
 * and moving it would be unsafe.  Returns 0 or a negative errno from
 * device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	/* Tunnel types without a fallback device stop here. */
	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
895 
/* Collect every device of this tunnel type for unregistration at netns
 * teardown: all matching devices currently living in the netns, plus
 * tunnels hashed here whose devices were moved into another netns.
 * Queued devices are appended to @head; the caller unregisters them.
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
920 
/* Netns-exit helper: batch-unregister every device of this tunnel type
 * under a single RTNL hold.
 */
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
931 
/* rtnl newlink handler shared by the IPv4 tunnel drivers.
 *
 * Refuses a duplicate configuration, registers the device, randomizes
 * the MAC for Ethernet-type tunnels lacking an explicit address, binds
 * the MTU (unless IFLA_MTU was supplied) and hashes the tunnel.
 * Returns 0 or a negative errno.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
966 
/* rtnl changelink handler: validate new parameters @p and apply them.
 *
 * The fallback device cannot be reconfigured.  If another tunnel
 * already owns this configuration, fail with -EEXIST; for non-Ethernet
 * devices, the point-to-point/broadcast nature implied by the new
 * destination must match the device's current flags.  The MTU is
 * recomputed unless IFLA_MTU was supplied.  Returns 0 or a negative
 * errno.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Device nature (ptp/broadcast) must not change. */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1004 
/* ndo_init: allocate per-device resources (per-cpu stats, per-cpu dst
 * cache, GRO cells) and fill in the invariant fields of the outer IP
 * header template.  On failure, everything already allocated here is
 * released.  Returns 0 or a negative errno.
 */
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
1038 
1039 void ip_tunnel_uninit(struct net_device *dev)
1040 {
1041 	struct ip_tunnel *tunnel = netdev_priv(dev);
1042 	struct net *net = tunnel->net;
1043 	struct ip_tunnel_net *itn;
1044 
1045 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1046 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1047 	if (itn->fb_tunnel_dev != dev)
1048 		ip_tunnel_del(netdev_priv(dev));
1049 
1050 	ip_tunnel_dst_reset_all(tunnel);
1051 }
1052 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1053 
1054 /* Do least required initialization, rest of init is done in tunnel_init call */
1055 void ip_tunnel_setup(struct net_device *dev, int net_id)
1056 {
1057 	struct ip_tunnel *tunnel = netdev_priv(dev);
1058 	tunnel->ip_tnl_net_id = net_id;
1059 }
1060 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1061 
1062 MODULE_LICENSE("GPL");
1063