xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 84d517f3)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

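/* Hash a tunnel's (key, remote address) pair into one of the
 * IP_TNL_HASH_SIZE buckets of the per-netns lookup table.  XORing key
 * and remote before hash_32() means tunnels that differ only in key
 * usually land in different buckets.
 */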
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

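/* Swap one per-cpu cached dst slot.  Routes flagged DST_NOCACHE are
 * recorded as NULL rather than cached; xchg() publishes the new
 * pointer atomically for lockless readers, and the old entry's
 * reference is dropped afterwards.
 */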
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);
	}
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
}

static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

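/* Fetch this CPU's cached route under RCU.  An obsolete dst is
 * revalidated through its ->check() hook; on failure this CPU's cache
 * slot is cleared and NULL is returned so the caller performs a fresh
 * route lookup.  On success the rtable is returned with a reference
 * held.
 */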
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		dst_hold(dst);
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

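/* Decide whether a packet's keying matches a tunnel: a keyed tunnel
 * accepts only the same key, a keyless tunnel accepts only packets
 * that carry no key.
 */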
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

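/* Map tunnel parameters onto the hash bucket used when adding and
 * finding tunnels.  Multicast and wildcard destinations hash with
 * remote == 0, and VTI tunnels without TUNNEL_KEY hash with
 * i_key == 0, matching the fallback probes done in ip_tunnel_lookup().
 */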
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    key == t->parms.i_key &&
		    link == t->parms.link &&
		    type == t->dev->type)
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

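/* Guess the underlay device by routing towards the tunnel destination
 * (falling back to parms.link), then derive needed_headroom and an
 * initial MTU from it.  Returns an MTU of at least 68, the IPv4
 * minimum.
 */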
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt, *fbt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	fbt = netdev_priv(itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

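/* Common receive path once a driver has matched a tunnel.  Checks the
 * TUNNEL_CSUM and TUNNEL_SEQ expectations against what the packet
 * carried, decapsulates ECN, bumps per-cpu stats and feeds the inner
 * packet to GRO.  Always consumes the skb and returns 0.
 */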
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

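/* Enforce path MTU on the inner packet before encapsulation: when the
 * route's MTU is too small, notify the sender with ICMP_FRAG_NEEDED
 * (IPv4) or ICMPV6_PKT_TOOBIG (IPv6) and return -E2BIG so the caller
 * drops the packet.
 */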
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

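/* Generic encapsulating transmit path.  In outline:
 *   1. resolve the outer destination; NBMA tunnels (daddr == 0) take
 *      it from the inner IPv4 route or an IPv6 neighbour entry;
 *   2. optionally inherit TOS, TTL and DF from the inner packet;
 *   3. route the outer packet, preferring the per-cpu dst cache when
 *      the tunnel destination is fixed;
 *   4. validate PMTU, ensure headroom and hand off to iptunnel_xmit().
 */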
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

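/* Apply new parameters to an existing tunnel.  The tunnel is unhashed
 * and rehashed because a key or address change can move it to another
 * bucket, and the per-cpu dst cache is flushed since cached routes may
 * no longer be valid for the new parameters.
 */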
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

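/* Backend for the SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL ioctls; drivers call this from their ndo_do_ioctl
 * after copying the ip_tunnel_parm in from userspace.  A rough sketch
 * of the userspace side (illustrative only):
 *
 *	struct ip_tunnel_parm p = { ... };
 *	struct ifreq ifr;
 *	strcpy(ifr.ifr_name, "gre0");
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(sockfd, SIOCADDTUNNEL, &ifr);
 *
 * ADD, CHG and DEL require CAP_NET_ADMIN in the tunnel's user
 * namespace.
 */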
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags&TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags&TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL)) {
			t = ip_tunnel_create(net, itn, p);
			if (IS_ERR(t)) {
				err = PTR_ERR(t);
				break;
			}
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

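/* Per-netns setup for a tunnel type: initialise the hash table and,
 * unless ops is NULL, create the fallback device (named after devname,
 * e.g. "gre0") that catches otherwise unmatched packets.  The fallback
 * device is pinned to its netns via NETIF_F_NETNS_LOCAL.
 */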
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

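/* rtnl newlink backend shared by tunnel drivers: refuse duplicate
 * parameters, register the device and add it to the per-netns hash.
 * The MTU derived from the underlay is applied only when userspace
 * did not pass IFLA_MTU.
 */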
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

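/* ndo_init backend shared by tunnel drivers: allocate per-cpu stats,
 * the per-cpu dst cache and the GRO cells, unwinding in reverse order
 * on failure (ip_tunnel_dev_free() above releases the same resources).
 */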
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit path. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");