xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision af958a38)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

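/* Hash on (key, remote address); IP_TNL_HASH_BITS bounds the table size,
 * so keyed and keyless tunnels to the same destination generally end up
 * in different buckets.
 */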
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

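/* Update one per-cpu cache slot. The xchg() publishes the new dst
 * atomically, so a concurrent reader sees either the old or the new
 * entry; the old entry's reference is dropped only after it has been
 * unhooked.
 */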
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst, __be32 saddr)
{
	struct dst_entry *old_dst;

	dst_clone(dst);
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);
	idst->saddr = saddr;
}

static void tunnel_dst_set(struct ip_tunnel *t,
			   struct dst_entry *dst, __be32 saddr)
{
	__tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst, saddr);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
	tunnel_dst_set(t, NULL, 0);
}

void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
{
	int i;

	for_each_possible_cpu(i)
		__tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
}
EXPORT_SYMBOL(ip_tunnel_dst_reset_all);

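/* Fetch this CPU's cached route under RCU. atomic_inc_not_zero() refuses
 * to resurrect a dst whose refcount has already hit zero, and an entry
 * that fails the dst->obsolete / ->check() validation is dropped so the
 * caller falls back to a fresh route lookup.
 */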
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
					u32 cookie, __be32 *saddr)
{
	struct ip_tunnel_dst *idst;
	struct dst_entry *dst;

	rcu_read_lock();
	idst = this_cpu_ptr(t->dst_cache);
	dst = rcu_dereference(idst->dst);
	if (dst && !atomic_inc_not_zero(&dst->__refcnt))
		dst = NULL;
	if (dst) {
		if (!dst->obsolete || dst->ops->check(dst, cookie)) {
			*saddr = idst->saddr;
		} else {
			tunnel_dst_reset(t);
			dst_release(dst);
			dst = NULL;
		}
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}

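/* A keyed tunnel matches only packets that carry the same key; a keyless
 * tunnel matches only packets that carry no key at all.
 */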
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if no key is present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
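/* The four lookup passes below, in decreasing order of specificity:
 *   1. exact (remote, local) match in the hash(key, remote) bucket;
 *   2. remote-only match (wildcard source);
 *   3. local-only, or local-multicast, match in the hash(key, 0) bucket;
 *   4. keyed wildcard match (key only, no addresses).
 * A match on a different underlay link is remembered as a candidate and
 * used only if no same-link match exists anywhere.
 */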
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

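/* Pick the bucket a tunnel is chained into; this must mirror the hashing
 * done by ip_tunnel_lookup(). Multicast or unset destinations hash with
 * remote == 0, and VTI tunnels ignore i_key unless TUNNEL_KEY is set.
 */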
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}

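/* Exact-match lookup on (saddr, daddr, link, type, key), used by the
 * management paths (ioctl and netlink) rather than the data path. The
 * bucket is walked with RCU list primitives; callers in this file hold
 * RTNL.
 */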
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
}

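/* Guess the underlay device by routing a probe towards the tunnel
 * destination, then derive needed_headroom and a tentative MTU from it.
 * The 68-byte floor is the minimum IPv4 MTU (RFC 791).
 */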
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	ip_tunnel_add(itn, nt);
	return nt;
}

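/* Common receive path for decapsulated packets: enforce the tunnel's
 * checksum and sequence-number policy, undo ECN encapsulation, bump the
 * per-cpu stats and hand the skb to the GRO cells. The skb is consumed
 * in all cases, so 0 is always returned.
 */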
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

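/* Check the packet against the path MTU of the outer route and propagate
 * any reduction to the inner flow: ICMP_FRAG_NEEDED for IPv4 with DF set,
 * ICMPV6_PKT_TOOBIG for IPv6. Returns -E2BIG when the packet cannot be
 * sent.
 */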
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

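/* Common transmit path: resolve the outer destination (for NBMA tunnels
 * it is recovered from the inner headers or the neighbour entry), reuse
 * the per-cpu route cache for connected tunnels, apply TOS/TTL/DF
 * inheritance and hand the packet to iptunnel_xmit().
 */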
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

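/* Re-key a live tunnel: unhash it, rewrite the endpoint parameters,
 * re-add it to the (possibly different) bucket and invalidate all cached
 * routes.
 */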
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}

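/* Legacy ioctl-based management, as driven by the "ip tunnel" command.
 * A minimal userspace sketch (illustrative only; error handling and the
 * AF_INET socket setup are omitted):
 *
 *	struct ip_tunnel_parm p = { .iph = { .version = 4, .ihl = 5 } };
 *	struct ifreq ifr;
 *
 *	strncpy(ifr.ifr_name, "tunl0", IFNAMSIZ);
 *	ifr.ifr_ifru.ifru_data = (void *)&p;
 *	ioctl(fd, SIOCADDTUNNEL, &ifr);
 */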
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL)) {
			t = ip_tunnel_create(net, itn, p);
			err = PTR_ERR_OR_ZERO(t);
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

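/* 0xFFF8 is the maximum IP datagram size (0xFFFF) rounded down to an
 * 8-byte fragment boundary; the usable tunnel MTU is that minus the link
 * and tunnel header overhead, with the usual 68-byte IPv4 floor.
 */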
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	if (new_mtu < 68 ||
	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

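/* Per-netns setup: initialize the hash table and, when ops is given,
 * create the fallback catch-all device. That device is pinned to its
 * netns with NETIF_F_NETNS_LOCAL, preserving the invariant of exactly
 * one fallback device per namespace.
 */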
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

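/* The rtnl_link_ops (netlink) counterparts of the ioctl paths above. */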
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (ip_tunnel_find(itn, p, dev->type))
		return -EEXIST;

	nt->net = net;
	nt->parms = *p;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);

out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

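/* ndo_init: allocate the per-cpu stats and the per-cpu route cache and
 * set up the GRO cells, unwinding on failure; ip_tunnel_dev_free() is
 * the matching destructor.
 */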
int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
	if (!tunnel->dst_cache) {
		free_percpu(dev->tstats);
		return -ENOMEM;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		free_percpu(tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(netdev_priv(dev));

	ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");