xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision b34e08d5)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44 
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58 
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64 
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
/* Install @dst as the cached output route in @idst, releasing any
 * previously cached entry.  Uncacheable routes (DST_NOCACHE) are not
 * stored.  The pointer swap uses xchg() so a concurrent reader always
 * sees either the old or the new value, never a torn one.
 */
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		if (dst->flags & DST_NOCACHE)
			dst = NULL;	/* one-shot route: don't cache it */
		else
			dst_clone(dst);	/* take a reference for the cache */
	}
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);	/* drop the old cached entry's reference */
}
85 
/* Cache @dst as the current CPU's output route for tunnel @t. */
static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
}
90 
/* Invalidate the cached output route on the current CPU only. */
static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL);
}
95 
/* Invalidate tunnel @t's cached output route on every possible CPU,
 * e.g. after the tunnel's parameters change.
 */
void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
104 
/* Return this CPU's cached route for tunnel @t with a reference held,
 * or NULL if nothing usable is cached.  A cached entry that fails its
 * obsolescence check is dropped (on this CPU) so the caller performs a
 * fresh route lookup.
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		/* stale entry: invalidate the cache and report a miss */
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		dst_hold(dst);	/* reference handed to the caller */
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
122 
123 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
124 				__be16 flags, __be32 key)
125 {
126 	if (p->i_flags & TUNNEL_KEY) {
127 		if (flags & TUNNEL_KEY)
128 			return key == p->i_key;
129 		else
130 			/* key expected, none present */
131 			return false;
132 	} else
133 		return !(flags & TUNNEL_KEY);
134 }
135 
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
/* Find the tunnel that should receive a packet with the given
 * @flags/@key, sent from @remote to @local and arriving on ifindex
 * @link.  Matching goes from most to least specific; a tunnel that
 * matches except for the link is remembered as a candidate and used if
 * no exact-link match exists.  Falls back to the per-netns fallback
 * device if it is up.  Returns NULL when nothing matches.
 * Must be called under rcu_read_lock().
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact (local, remote) address match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches; tunnel's local address is a wildcard. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Remaining passes search the bucket for tunnels with no remote. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: local matches the tunnel's local address, or local is a
	 * multicast group configured as the tunnel's destination.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: match on the key alone, addresses wildcarded. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	/* Last resort: the per-netns fallback device, if present and up. */
	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);


	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
232 
233 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
234 				    struct ip_tunnel_parm *parms)
235 {
236 	unsigned int h;
237 	__be32 remote;
238 	__be32 i_key = parms->i_key;
239 
240 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
241 		remote = parms->iph.daddr;
242 	else
243 		remote = 0;
244 
245 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
246 		i_key = 0;
247 
248 	h = ip_tunnel_hash(i_key, remote);
249 	return &itn->tunnels[h];
250 }
251 
/* Insert tunnel @t into its hash bucket; immediately visible to RCU
 * readers.  Caller must hold RTNL.
 */
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}
258 
/* Unhash tunnel @t; concurrent RCU readers may still see it until a
 * grace period elapses.
 */
static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}
263 
264 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
265 					struct ip_tunnel_parm *parms,
266 					int type)
267 {
268 	__be32 remote = parms->iph.daddr;
269 	__be32 local = parms->iph.saddr;
270 	__be32 key = parms->i_key;
271 	int link = parms->link;
272 	struct ip_tunnel *t = NULL;
273 	struct hlist_head *head = ip_bucket(itn, parms);
274 
275 	hlist_for_each_entry_rcu(t, head, hash_node) {
276 		if (local == t->parms.iph.saddr &&
277 		    remote == t->parms.iph.daddr &&
278 		    key == t->parms.i_key &&
279 		    link == t->parms.link &&
280 		    type == t->dev->type)
281 			break;
282 	}
283 	return t;
284 }
285 
/* Allocate and register a tunnel net_device for @parms using @ops.
 * When @parms->name is empty the name is built from @ops->kind plus a
 * "%d" template (e.g. "gre%d") for automatic numbering by the core.
 * Returns the device or an ERR_PTR().  Caller must hold RTNL.
 */
static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		/* need room for kind + "%d" + terminating NUL */
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}
331 
/* Build an IPv4 flow key for a tunnel route lookup.  The whole struct
 * is zeroed first so that every field not set below is 0; @key lands in
 * the GRE-key slot of the flow union.
 */
static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}
345 
/* Determine which underlying device the tunnel will transmit through
 * and derive the tunnel's needed_headroom and MTU from it.  Returns the
 * MTU to use for the tunnel device, clamped to at least 68 (the minimum
 * IPv4 MTU).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst);	/* prime the dst cache */
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	/* No route found: fall back to the explicitly bound link, if any. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Reserve space for the outer headers; shrink the MTU accordingly. */
	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
393 
394 static struct ip_tunnel *ip_tunnel_create(struct net *net,
395 					  struct ip_tunnel_net *itn,
396 					  struct ip_tunnel_parm *parms)
397 {
398 	struct ip_tunnel *nt, *fbt;
399 	struct net_device *dev;
400 
401 	BUG_ON(!itn->fb_tunnel_dev);
402 	fbt = netdev_priv(itn->fb_tunnel_dev);
403 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
404 	if (IS_ERR(dev))
405 		return ERR_CAST(dev);
406 
407 	dev->mtu = ip_tunnel_bind_dev(dev);
408 
409 	nt = netdev_priv(dev);
410 	ip_tunnel_add(itn, nt);
411 	return nt;
412 }
413 
/* Common receive path for decapsulated tunnel packets.  Validates the
 * tunnel's checksum/sequence expectations against @tpi, decapsulates
 * ECN, updates rx statistics and hands the packet to the tunnel's GRO
 * cells.  Consumes @skb on every path and always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Packet must carry a checksum iff the tunnel expects one. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	/* Drop packets without a sequence number or arriving out of order
	 * (signed delta handles sequence-number wraparound).
	 */
	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* err > 1 means the inner packet must be dropped. */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub state from the outer path when crossing netns boundaries. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
480 
/* Check whether @skb fits the tunnel's path MTU and propagate the MTU
 * to the inner route.  Sends an ICMP/ICMPv6 "packet too big" back to
 * the sender and returns -E2BIG when the packet cannot be forwarded;
 * returns 0 otherwise.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF set, the effective MTU is the outer route's MTU minus
	 * the encapsulation overhead; otherwise use the inner route/dev MTU.
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Pin the reduced MTU on the inner IPv6 route when it is a
		 * host route or the tunnel has a fixed unicast destination.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
528 
/* Common transmit path: encapsulate @skb in an IPv4 header built from
 * @tnl_params and transmit it along the tunnel's route.  Handles NBMA
 * tunnels (no fixed destination), ToS/TTL inheritance from the inner
 * packet, PMTU checks and headroom expansion.  Consumes @skb; failures
 * are accounted in the device statistics.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected = true;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: no fixed destination, so derive the outer
		 * destination from the inner packet's routing state.
		 */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible IPv6 addresses embed a usable
			 * IPv4 destination in their low 32 bits.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		/* Destination varies per packet: don't use the dst cache. */
		connected = false;
	}

	/* ToS bit 0 set means "inherit ToS from the inner packet". */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	/* Try the per-cpu cached route first; only point-to-point tunnels
	 * with fixed parameters may use (and refill) the cache.
	 */
	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	/* A route back through ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Replay a recently received ICMP error back to the sender for a
	 * limited time window.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	/* TTL 0 means "inherit from the inner packet" (or the route). */
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	/* Propagate the inner DF bit to the outer header for IPv4. */
	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
688 
/* Apply new parameters @p to tunnel @t.  The tunnel is unhashed and
 * re-added because addresses/keys determine its bucket; a link change
 * triggers a rebind to the underlying device (and optionally a new
 * MTU).  The per-cpu dst cache is always invalidated afterwards.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* dev_addr/broadcast mirror the tunnel endpoints. */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
721 
/* Legacy ioctl interface for tunnel management (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 * @p is the decoded user parameter block; for GET it is filled with the
 * resolved tunnel's parameters.  Returns 0 or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		/* On the fallback device, look up by the given parms;
		 * otherwise report this device's own parameters.
		 */
		t = NULL;
		if (dev == itn->fb_tunnel_dev)
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* Normalize user input: DF implied by an explicit TTL;
		 * keys are only meaningful with TUNNEL_KEY set.
		 */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags&TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags&TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL)) {
			t = ip_tunnel_create(net, itn, p);
			if (IS_ERR(t)) {
				err = PTR_ERR(t);
				break;
			}
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* parms must not collide with another device */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Changing parms must not flip the device
				 * between broadcast and point-to-point mode.
				 */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			/* Resolve the target by parms; the fallback device
			 * itself must never be deleted this way.
			 */
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
820 
821 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
822 {
823 	struct ip_tunnel *tunnel = netdev_priv(dev);
824 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
825 
826 	if (new_mtu < 68 ||
827 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
828 		return -EINVAL;
829 	dev->mtu = new_mtu;
830 	return 0;
831 }
832 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
833 
/* netdev destructor: release everything ip_tunnel_init() allocated,
 * then the device itself.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
843 
/* rtnl dellink handler: unhash @dev and queue it for unregistration on
 * @head.  The per-netns fallback device is never removed here; it is
 * torn down only at netns exit.
 */
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
857 
/* Per-netns initialization for a tunnel type: set up the hash table
 * and, when @ops is given, create the netns-local fallback device named
 * @devname.  The fallback device is pinned to its netns.  Returns 0 on
 * success or a negative errno from fallback device creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		/* no fallback device for this tunnel type */
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
891 
/* Queue for unregistration every device of @ops in the fallback
 * device's netns, plus any tunnel hashed in @itn whose device lives in
 * a different netns.  Caller holds RTNL and later passes @head to
 * unregister_netdevice_many().
 */
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}
916 
/* Tear down all tunnels of @ops tracked in @itn at netns exit, batching
 * the unregistrations under a single RTNL hold.
 */
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
927 
/* rtnl newlink handler: register the new tunnel device described by @p,
 * bind it to the underlying device for MTU/headroom (unless IFLA_MTU
 * was supplied) and insert it into the hash table.  Returns 0, or
 * -EEXIST when an identical tunnel already exists, or the
 * register_netdevice() error.
 */
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
962 
/* rtnl changelink handler: apply parameters @p to @dev.  Rejects
 * changes to the fallback device, collisions with a different existing
 * tunnel, and parameter sets that would flip a non-Ethernet device
 * between broadcast and point-to-point mode.  The MTU is re-derived
 * unless IFLA_MTU was supplied.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1000 
1001 int ip_tunnel_init(struct net_device *dev)
1002 {
1003 	struct ip_tunnel *tunnel = netdev_priv(dev);
1004 	struct iphdr *iph = &tunnel->parms.iph;
1005 	int err;
1006 
1007 	dev->destructor	= ip_tunnel_dev_free;
1008 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1009 	if (!dev->tstats)
1010 		return -ENOMEM;
1011 
1012 	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1013 	if (!tunnel->dst_cache) {
1014 		free_percpu(dev->tstats);
1015 		return -ENOMEM;
1016 	}
1017 
1018 	err = gro_cells_init(&tunnel->gro_cells, dev);
1019 	if (err) {
1020 		free_percpu(tunnel->dst_cache);
1021 		free_percpu(dev->tstats);
1022 		return err;
1023 	}
1024 
1025 	tunnel->dev = dev;
1026 	tunnel->net = dev_net(dev);
1027 	strcpy(tunnel->parms.name, dev->name);
1028 	iph->version		= 4;
1029 	iph->ihl		= 5;
1030 
1031 	return 0;
1032 }
1033 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1034 
/* ndo_uninit: unhash the tunnel (except the per-netns fallback device)
 * and drop all cached routes.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1049 
/* Do least required initialization, rest of init is done in tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	/* remember which per-netns tunnel table this device belongs to */
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1057 
1058 MODULE_LICENSE("GPL");
1059