xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 078a55fc)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 
58 #if IS_ENABLED(CONFIG_IPV6)
59 #include <net/ipv6.h>
60 #include <net/ip6_fib.h>
61 #include <net/ip6_route.h>
62 #endif
63 
64 static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
65 				   __be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
71 /* Often modified stats are per cpu, other are shared (netdev->stats) */
72 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
73 						struct rtnl_link_stats64 *tot)
74 {
75 	int i;
76 
77 	for_each_possible_cpu(i) {
78 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
79 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
80 		unsigned int start;
81 
82 		do {
83 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
84 			rx_packets = tstats->rx_packets;
85 			tx_packets = tstats->tx_packets;
86 			rx_bytes = tstats->rx_bytes;
87 			tx_bytes = tstats->tx_bytes;
88 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
89 
90 		tot->rx_packets += rx_packets;
91 		tot->tx_packets += tx_packets;
92 		tot->rx_bytes   += rx_bytes;
93 		tot->tx_bytes   += tx_bytes;
94 	}
95 
96 	tot->multicast = dev->stats.multicast;
97 
98 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
99 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
100 	tot->rx_length_errors = dev->stats.rx_length_errors;
101 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
102 	tot->rx_errors = dev->stats.rx_errors;
103 
104 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
105 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
106 	tot->tx_dropped = dev->stats.tx_dropped;
107 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
108 	tot->tx_errors = dev->stats.tx_errors;
109 
110 	tot->collisions  = dev->stats.collisions;
111 
112 	return tot;
113 }
114 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
115 
116 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
117 				__be16 flags, __be32 key)
118 {
119 	if (p->i_flags & TUNNEL_KEY) {
120 		if (flags & TUNNEL_KEY)
121 			return key == p->i_key;
122 		else
123 			/* key expected, none present */
124 			return false;
125 	} else
126 		return !(flags & TUNNEL_KEY);
127 }
128 
129 /* Fallback tunnel: no source, no destination, no key, no options
130 
131    Tunnel hash table:
132    We require exact key match i.e. if a key is present in packet
133    it will match only tunnel with the same key; if it is not present,
134    it will match only keyless tunnel.
135 
136    All keysless packets, if not matched configured keyless tunnels
137    will match fallback tunnel.
138    Given src, dst and key, find appropriate for input tunnel.
139 */
140 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
141 				   int link, __be16 flags,
142 				   __be32 remote, __be32 local,
143 				   __be32 key)
144 {
145 	unsigned int hash;
146 	struct ip_tunnel *t, *cand = NULL;
147 	struct hlist_head *head;
148 
149 	hash = ip_tunnel_hash(itn, key, remote);
150 	head = &itn->tunnels[hash];
151 
152 	hlist_for_each_entry_rcu(t, head, hash_node) {
153 		if (local != t->parms.iph.saddr ||
154 		    remote != t->parms.iph.daddr ||
155 		    !(t->dev->flags & IFF_UP))
156 			continue;
157 
158 		if (!ip_tunnel_key_match(&t->parms, flags, key))
159 			continue;
160 
161 		if (t->parms.link == link)
162 			return t;
163 		else
164 			cand = t;
165 	}
166 
167 	hlist_for_each_entry_rcu(t, head, hash_node) {
168 		if (remote != t->parms.iph.daddr ||
169 		    !(t->dev->flags & IFF_UP))
170 			continue;
171 
172 		if (!ip_tunnel_key_match(&t->parms, flags, key))
173 			continue;
174 
175 		if (t->parms.link == link)
176 			return t;
177 		else if (!cand)
178 			cand = t;
179 	}
180 
181 	hash = ip_tunnel_hash(itn, key, 0);
182 	head = &itn->tunnels[hash];
183 
184 	hlist_for_each_entry_rcu(t, head, hash_node) {
185 		if ((local != t->parms.iph.saddr &&
186 		     (local != t->parms.iph.daddr ||
187 		      !ipv4_is_multicast(local))) ||
188 		    !(t->dev->flags & IFF_UP))
189 			continue;
190 
191 		if (!ip_tunnel_key_match(&t->parms, flags, key))
192 			continue;
193 
194 		if (t->parms.link == link)
195 			return t;
196 		else if (!cand)
197 			cand = t;
198 	}
199 
200 	if (flags & TUNNEL_NO_KEY)
201 		goto skip_key_lookup;
202 
203 	hlist_for_each_entry_rcu(t, head, hash_node) {
204 		if (t->parms.i_key != key ||
205 		    !(t->dev->flags & IFF_UP))
206 			continue;
207 
208 		if (t->parms.link == link)
209 			return t;
210 		else if (!cand)
211 			cand = t;
212 	}
213 
214 skip_key_lookup:
215 	if (cand)
216 		return cand;
217 
218 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
219 		return netdev_priv(itn->fb_tunnel_dev);
220 
221 
222 	return NULL;
223 }
224 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
225 
226 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
227 				    struct ip_tunnel_parm *parms)
228 {
229 	unsigned int h;
230 	__be32 remote;
231 
232 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
233 		remote = parms->iph.daddr;
234 	else
235 		remote = 0;
236 
237 	h = ip_tunnel_hash(itn, parms->i_key, remote);
238 	return &itn->tunnels[h];
239 }
240 
241 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
242 {
243 	struct hlist_head *head = ip_bucket(itn, &t->parms);
244 
245 	hlist_add_head_rcu(&t->hash_node, head);
246 }
247 
248 static void ip_tunnel_del(struct ip_tunnel *t)
249 {
250 	hlist_del_init_rcu(&t->hash_node);
251 }
252 
253 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
254 					struct ip_tunnel_parm *parms,
255 					int type)
256 {
257 	__be32 remote = parms->iph.daddr;
258 	__be32 local = parms->iph.saddr;
259 	__be32 key = parms->i_key;
260 	int link = parms->link;
261 	struct ip_tunnel *t = NULL;
262 	struct hlist_head *head = ip_bucket(itn, parms);
263 
264 	hlist_for_each_entry_rcu(t, head, hash_node) {
265 		if (local == t->parms.iph.saddr &&
266 		    remote == t->parms.iph.daddr &&
267 		    key == t->parms.i_key &&
268 		    link == t->parms.link &&
269 		    type == t->dev->type)
270 			break;
271 	}
272 	return t;
273 }
274 
275 static struct net_device *__ip_tunnel_create(struct net *net,
276 					     const struct rtnl_link_ops *ops,
277 					     struct ip_tunnel_parm *parms)
278 {
279 	int err;
280 	struct ip_tunnel *tunnel;
281 	struct net_device *dev;
282 	char name[IFNAMSIZ];
283 
284 	if (parms->name[0])
285 		strlcpy(name, parms->name, IFNAMSIZ);
286 	else {
287 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
288 			err = -E2BIG;
289 			goto failed;
290 		}
291 		strlcpy(name, ops->kind, IFNAMSIZ);
292 		strncat(name, "%d", 2);
293 	}
294 
295 	ASSERT_RTNL();
296 	dev = alloc_netdev(ops->priv_size, name, ops->setup);
297 	if (!dev) {
298 		err = -ENOMEM;
299 		goto failed;
300 	}
301 	dev_net_set(dev, net);
302 
303 	dev->rtnl_link_ops = ops;
304 
305 	tunnel = netdev_priv(dev);
306 	tunnel->parms = *parms;
307 	tunnel->net = net;
308 
309 	err = register_netdevice(dev);
310 	if (err)
311 		goto failed_free;
312 
313 	return dev;
314 
315 failed_free:
316 	free_netdev(dev);
317 failed:
318 	return ERR_PTR(err);
319 }
320 
321 static inline struct rtable *ip_route_output_tunnel(struct net *net,
322 						    struct flowi4 *fl4,
323 						    int proto,
324 						    __be32 daddr, __be32 saddr,
325 						    __be32 key, __u8 tos, int oif)
326 {
327 	memset(fl4, 0, sizeof(*fl4));
328 	fl4->flowi4_oif = oif;
329 	fl4->daddr = daddr;
330 	fl4->saddr = saddr;
331 	fl4->flowi4_tos = tos;
332 	fl4->flowi4_proto = proto;
333 	fl4->fl4_gre_key = key;
334 	return ip_route_output_key(net, fl4);
335 }
336 
337 static int ip_tunnel_bind_dev(struct net_device *dev)
338 {
339 	struct net_device *tdev = NULL;
340 	struct ip_tunnel *tunnel = netdev_priv(dev);
341 	const struct iphdr *iph;
342 	int hlen = LL_MAX_HEADER;
343 	int mtu = ETH_DATA_LEN;
344 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
345 
346 	iph = &tunnel->parms.iph;
347 
348 	/* Guess output device to choose reasonable mtu and needed_headroom */
349 	if (iph->daddr) {
350 		struct flowi4 fl4;
351 		struct rtable *rt;
352 
353 		rt = ip_route_output_tunnel(dev_net(dev), &fl4,
354 					    tunnel->parms.iph.protocol,
355 					    iph->daddr, iph->saddr,
356 					    tunnel->parms.o_key,
357 					    RT_TOS(iph->tos),
358 					    tunnel->parms.link);
359 		if (!IS_ERR(rt)) {
360 			tdev = rt->dst.dev;
361 			ip_rt_put(rt);
362 		}
363 		if (dev->type != ARPHRD_ETHER)
364 			dev->flags |= IFF_POINTOPOINT;
365 	}
366 
367 	if (!tdev && tunnel->parms.link)
368 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
369 
370 	if (tdev) {
371 		hlen = tdev->hard_header_len + tdev->needed_headroom;
372 		mtu = tdev->mtu;
373 	}
374 	dev->iflink = tunnel->parms.link;
375 
376 	dev->needed_headroom = t_hlen + hlen;
377 	mtu -= (dev->hard_header_len + t_hlen);
378 
379 	if (mtu < 68)
380 		mtu = 68;
381 
382 	return mtu;
383 }
384 
385 static struct ip_tunnel *ip_tunnel_create(struct net *net,
386 					  struct ip_tunnel_net *itn,
387 					  struct ip_tunnel_parm *parms)
388 {
389 	struct ip_tunnel *nt, *fbt;
390 	struct net_device *dev;
391 
392 	BUG_ON(!itn->fb_tunnel_dev);
393 	fbt = netdev_priv(itn->fb_tunnel_dev);
394 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
395 	if (IS_ERR(dev))
396 		return NULL;
397 
398 	dev->mtu = ip_tunnel_bind_dev(dev);
399 
400 	nt = netdev_priv(dev);
401 	ip_tunnel_add(itn, nt);
402 	return nt;
403 }
404 
405 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
406 		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
407 {
408 	struct pcpu_tstats *tstats;
409 	const struct iphdr *iph = ip_hdr(skb);
410 	int err;
411 
412 #ifdef CONFIG_NET_IPGRE_BROADCAST
413 	if (ipv4_is_multicast(iph->daddr)) {
414 		/* Looped back packet, drop it! */
415 		if (rt_is_output_route(skb_rtable(skb)))
416 			goto drop;
417 		tunnel->dev->stats.multicast++;
418 		skb->pkt_type = PACKET_BROADCAST;
419 	}
420 #endif
421 
422 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
423 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
424 		tunnel->dev->stats.rx_crc_errors++;
425 		tunnel->dev->stats.rx_errors++;
426 		goto drop;
427 	}
428 
429 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
430 		if (!(tpi->flags&TUNNEL_SEQ) ||
431 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
432 			tunnel->dev->stats.rx_fifo_errors++;
433 			tunnel->dev->stats.rx_errors++;
434 			goto drop;
435 		}
436 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
437 	}
438 
439 	err = IP_ECN_decapsulate(iph, skb);
440 	if (unlikely(err)) {
441 		if (log_ecn_error)
442 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
443 					&iph->saddr, iph->tos);
444 		if (err > 1) {
445 			++tunnel->dev->stats.rx_frame_errors;
446 			++tunnel->dev->stats.rx_errors;
447 			goto drop;
448 		}
449 	}
450 
451 	tstats = this_cpu_ptr(tunnel->dev->tstats);
452 	u64_stats_update_begin(&tstats->syncp);
453 	tstats->rx_packets++;
454 	tstats->rx_bytes += skb->len;
455 	u64_stats_update_end(&tstats->syncp);
456 
457 	if (tunnel->net != dev_net(tunnel->dev))
458 		skb_scrub_packet(skb);
459 
460 	if (tunnel->dev->type == ARPHRD_ETHER) {
461 		skb->protocol = eth_type_trans(skb, tunnel->dev);
462 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
463 	} else {
464 		skb->dev = tunnel->dev;
465 	}
466 	gro_cells_receive(&tunnel->gro_cells, skb);
467 	return 0;
468 
469 drop:
470 	kfree_skb(skb);
471 	return 0;
472 }
473 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
474 
475 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
476 			    struct rtable *rt, __be16 df)
477 {
478 	struct ip_tunnel *tunnel = netdev_priv(dev);
479 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
480 	int mtu;
481 
482 	if (df)
483 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
484 					- sizeof(struct iphdr) - tunnel->hlen;
485 	else
486 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
487 
488 	if (skb_dst(skb))
489 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
490 
491 	if (skb->protocol == htons(ETH_P_IP)) {
492 		if (!skb_is_gso(skb) &&
493 		    (df & htons(IP_DF)) && mtu < pkt_size) {
494 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
495 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
496 			return -E2BIG;
497 		}
498 	}
499 #if IS_ENABLED(CONFIG_IPV6)
500 	else if (skb->protocol == htons(ETH_P_IPV6)) {
501 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
502 
503 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
504 			   mtu >= IPV6_MIN_MTU) {
505 			if ((tunnel->parms.iph.daddr &&
506 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
507 			    rt6->rt6i_dst.plen == 128) {
508 				rt6->rt6i_flags |= RTF_MODIFIED;
509 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
510 			}
511 		}
512 
513 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
514 					mtu < pkt_size) {
515 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
516 			return -E2BIG;
517 		}
518 	}
519 #endif
520 	return 0;
521 }
522 
523 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
524 		    const struct iphdr *tnl_params, const u8 protocol)
525 {
526 	struct ip_tunnel *tunnel = netdev_priv(dev);
527 	const struct iphdr *inner_iph;
528 	struct flowi4 fl4;
529 	u8     tos, ttl;
530 	__be16 df;
531 	struct rtable *rt;		/* Route to the other host */
532 	unsigned int max_headroom;	/* The extra header space needed */
533 	__be32 dst;
534 	int err;
535 
536 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
537 
538 	dst = tnl_params->daddr;
539 	if (dst == 0) {
540 		/* NBMA tunnel */
541 
542 		if (skb_dst(skb) == NULL) {
543 			dev->stats.tx_fifo_errors++;
544 			goto tx_error;
545 		}
546 
547 		if (skb->protocol == htons(ETH_P_IP)) {
548 			rt = skb_rtable(skb);
549 			dst = rt_nexthop(rt, inner_iph->daddr);
550 		}
551 #if IS_ENABLED(CONFIG_IPV6)
552 		else if (skb->protocol == htons(ETH_P_IPV6)) {
553 			const struct in6_addr *addr6;
554 			struct neighbour *neigh;
555 			bool do_tx_error_icmp;
556 			int addr_type;
557 
558 			neigh = dst_neigh_lookup(skb_dst(skb),
559 						 &ipv6_hdr(skb)->daddr);
560 			if (neigh == NULL)
561 				goto tx_error;
562 
563 			addr6 = (const struct in6_addr *)&neigh->primary_key;
564 			addr_type = ipv6_addr_type(addr6);
565 
566 			if (addr_type == IPV6_ADDR_ANY) {
567 				addr6 = &ipv6_hdr(skb)->daddr;
568 				addr_type = ipv6_addr_type(addr6);
569 			}
570 
571 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
572 				do_tx_error_icmp = true;
573 			else {
574 				do_tx_error_icmp = false;
575 				dst = addr6->s6_addr32[3];
576 			}
577 			neigh_release(neigh);
578 			if (do_tx_error_icmp)
579 				goto tx_error_icmp;
580 		}
581 #endif
582 		else
583 			goto tx_error;
584 	}
585 
586 	tos = tnl_params->tos;
587 	if (tos & 0x1) {
588 		tos &= ~0x1;
589 		if (skb->protocol == htons(ETH_P_IP))
590 			tos = inner_iph->tos;
591 		else if (skb->protocol == htons(ETH_P_IPV6))
592 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
593 	}
594 
595 	rt = ip_route_output_tunnel(tunnel->net, &fl4,
596 				    protocol,
597 				    dst, tnl_params->saddr,
598 				    tunnel->parms.o_key,
599 				    RT_TOS(tos),
600 				    tunnel->parms.link);
601 	if (IS_ERR(rt)) {
602 		dev->stats.tx_carrier_errors++;
603 		goto tx_error;
604 	}
605 	if (rt->dst.dev == dev) {
606 		ip_rt_put(rt);
607 		dev->stats.collisions++;
608 		goto tx_error;
609 	}
610 
611 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
612 		ip_rt_put(rt);
613 		goto tx_error;
614 	}
615 
616 	if (tunnel->net != dev_net(dev))
617 		skb_scrub_packet(skb);
618 
619 	if (tunnel->err_count > 0) {
620 		if (time_before(jiffies,
621 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
622 			tunnel->err_count--;
623 
624 			dst_link_failure(skb);
625 		} else
626 			tunnel->err_count = 0;
627 	}
628 
629 	ttl = tnl_params->ttl;
630 	if (ttl == 0) {
631 		if (skb->protocol == htons(ETH_P_IP))
632 			ttl = inner_iph->ttl;
633 #if IS_ENABLED(CONFIG_IPV6)
634 		else if (skb->protocol == htons(ETH_P_IPV6))
635 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
636 #endif
637 		else
638 			ttl = ip4_dst_hoplimit(&rt->dst);
639 	}
640 
641 	df = tnl_params->frag_off;
642 	if (skb->protocol == htons(ETH_P_IP))
643 		df |= (inner_iph->frag_off&htons(IP_DF));
644 
645 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
646 			+ rt->dst.header_len;
647 	if (max_headroom > dev->needed_headroom) {
648 		dev->needed_headroom = max_headroom;
649 		if (skb_cow_head(skb, dev->needed_headroom)) {
650 			dev->stats.tx_dropped++;
651 			dev_kfree_skb(skb);
652 			return;
653 		}
654 	}
655 
656 	err = iptunnel_xmit(dev_net(dev), rt, skb,
657 			    fl4.saddr, fl4.daddr, protocol,
658 			    ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
659 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
660 
661 	return;
662 
663 #if IS_ENABLED(CONFIG_IPV6)
664 tx_error_icmp:
665 	dst_link_failure(skb);
666 #endif
667 tx_error:
668 	dev->stats.tx_errors++;
669 	dev_kfree_skb(skb);
670 }
671 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
672 
673 static void ip_tunnel_update(struct ip_tunnel_net *itn,
674 			     struct ip_tunnel *t,
675 			     struct net_device *dev,
676 			     struct ip_tunnel_parm *p,
677 			     bool set_mtu)
678 {
679 	ip_tunnel_del(t);
680 	t->parms.iph.saddr = p->iph.saddr;
681 	t->parms.iph.daddr = p->iph.daddr;
682 	t->parms.i_key = p->i_key;
683 	t->parms.o_key = p->o_key;
684 	if (dev->type != ARPHRD_ETHER) {
685 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
686 		memcpy(dev->broadcast, &p->iph.daddr, 4);
687 	}
688 	ip_tunnel_add(itn, t);
689 
690 	t->parms.iph.ttl = p->iph.ttl;
691 	t->parms.iph.tos = p->iph.tos;
692 	t->parms.iph.frag_off = p->iph.frag_off;
693 
694 	if (t->parms.link != p->link) {
695 		int mtu;
696 
697 		t->parms.link = p->link;
698 		mtu = ip_tunnel_bind_dev(dev);
699 		if (set_mtu)
700 			dev->mtu = mtu;
701 	}
702 	netdev_state_change(dev);
703 }
704 
705 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
706 {
707 	int err = 0;
708 	struct ip_tunnel *t;
709 	struct net *net = dev_net(dev);
710 	struct ip_tunnel *tunnel = netdev_priv(dev);
711 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
712 
713 	BUG_ON(!itn->fb_tunnel_dev);
714 	switch (cmd) {
715 	case SIOCGETTUNNEL:
716 		t = NULL;
717 		if (dev == itn->fb_tunnel_dev)
718 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
719 		if (t == NULL)
720 			t = netdev_priv(dev);
721 		memcpy(p, &t->parms, sizeof(*p));
722 		break;
723 
724 	case SIOCADDTUNNEL:
725 	case SIOCCHGTUNNEL:
726 		err = -EPERM;
727 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
728 			goto done;
729 		if (p->iph.ttl)
730 			p->iph.frag_off |= htons(IP_DF);
731 		if (!(p->i_flags&TUNNEL_KEY))
732 			p->i_key = 0;
733 		if (!(p->o_flags&TUNNEL_KEY))
734 			p->o_key = 0;
735 
736 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
737 
738 		if (!t && (cmd == SIOCADDTUNNEL))
739 			t = ip_tunnel_create(net, itn, p);
740 
741 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
742 			if (t != NULL) {
743 				if (t->dev != dev) {
744 					err = -EEXIST;
745 					break;
746 				}
747 			} else {
748 				unsigned int nflags = 0;
749 
750 				if (ipv4_is_multicast(p->iph.daddr))
751 					nflags = IFF_BROADCAST;
752 				else if (p->iph.daddr)
753 					nflags = IFF_POINTOPOINT;
754 
755 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
756 					err = -EINVAL;
757 					break;
758 				}
759 
760 				t = netdev_priv(dev);
761 			}
762 		}
763 
764 		if (t) {
765 			err = 0;
766 			ip_tunnel_update(itn, t, dev, p, true);
767 		} else
768 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
769 		break;
770 
771 	case SIOCDELTUNNEL:
772 		err = -EPERM;
773 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
774 			goto done;
775 
776 		if (dev == itn->fb_tunnel_dev) {
777 			err = -ENOENT;
778 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
779 			if (t == NULL)
780 				goto done;
781 			err = -EPERM;
782 			if (t == netdev_priv(itn->fb_tunnel_dev))
783 				goto done;
784 			dev = t->dev;
785 		}
786 		unregister_netdevice(dev);
787 		err = 0;
788 		break;
789 
790 	default:
791 		err = -EINVAL;
792 	}
793 
794 done:
795 	return err;
796 }
797 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
798 
799 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
800 {
801 	struct ip_tunnel *tunnel = netdev_priv(dev);
802 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
803 
804 	if (new_mtu < 68 ||
805 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
806 		return -EINVAL;
807 	dev->mtu = new_mtu;
808 	return 0;
809 }
810 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
811 
812 static void ip_tunnel_dev_free(struct net_device *dev)
813 {
814 	struct ip_tunnel *tunnel = netdev_priv(dev);
815 
816 	gro_cells_destroy(&tunnel->gro_cells);
817 	free_percpu(dev->tstats);
818 	free_netdev(dev);
819 }
820 
821 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
822 {
823 	struct net *net = dev_net(dev);
824 	struct ip_tunnel *tunnel = netdev_priv(dev);
825 	struct ip_tunnel_net *itn;
826 
827 	itn = net_generic(net, tunnel->ip_tnl_net_id);
828 
829 	if (itn->fb_tunnel_dev != dev) {
830 		ip_tunnel_del(netdev_priv(dev));
831 		unregister_netdevice_queue(dev, head);
832 	}
833 }
834 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
835 
836 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
837 				  struct rtnl_link_ops *ops, char *devname)
838 {
839 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
840 	struct ip_tunnel_parm parms;
841 
842 	itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
843 	if (!itn->tunnels)
844 		return -ENOMEM;
845 
846 	if (!ops) {
847 		itn->fb_tunnel_dev = NULL;
848 		return 0;
849 	}
850 	memset(&parms, 0, sizeof(parms));
851 	if (devname)
852 		strlcpy(parms.name, devname, IFNAMSIZ);
853 
854 	rtnl_lock();
855 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
856 	rtnl_unlock();
857 	if (IS_ERR(itn->fb_tunnel_dev)) {
858 		kfree(itn->tunnels);
859 		return PTR_ERR(itn->fb_tunnel_dev);
860 	}
861 
862 	return 0;
863 }
864 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
865 
866 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
867 {
868 	int h;
869 
870 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
871 		struct ip_tunnel *t;
872 		struct hlist_node *n;
873 		struct hlist_head *thead = &itn->tunnels[h];
874 
875 		hlist_for_each_entry_safe(t, n, thead, hash_node)
876 			unregister_netdevice_queue(t->dev, head);
877 	}
878 	if (itn->fb_tunnel_dev)
879 		unregister_netdevice_queue(itn->fb_tunnel_dev, head);
880 }
881 
882 void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
883 {
884 	LIST_HEAD(list);
885 
886 	rtnl_lock();
887 	ip_tunnel_destroy(itn, &list);
888 	unregister_netdevice_many(&list);
889 	rtnl_unlock();
890 	kfree(itn->tunnels);
891 }
892 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
893 
894 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
895 		      struct ip_tunnel_parm *p)
896 {
897 	struct ip_tunnel *nt;
898 	struct net *net = dev_net(dev);
899 	struct ip_tunnel_net *itn;
900 	int mtu;
901 	int err;
902 
903 	nt = netdev_priv(dev);
904 	itn = net_generic(net, nt->ip_tnl_net_id);
905 
906 	if (ip_tunnel_find(itn, p, dev->type))
907 		return -EEXIST;
908 
909 	nt->net = net;
910 	nt->parms = *p;
911 	err = register_netdevice(dev);
912 	if (err)
913 		goto out;
914 
915 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
916 		eth_hw_addr_random(dev);
917 
918 	mtu = ip_tunnel_bind_dev(dev);
919 	if (!tb[IFLA_MTU])
920 		dev->mtu = mtu;
921 
922 	ip_tunnel_add(itn, nt);
923 
924 out:
925 	return err;
926 }
927 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
928 
929 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
930 			 struct ip_tunnel_parm *p)
931 {
932 	struct ip_tunnel *t, *nt;
933 	struct net *net = dev_net(dev);
934 	struct ip_tunnel *tunnel = netdev_priv(dev);
935 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
936 
937 	if (dev == itn->fb_tunnel_dev)
938 		return -EINVAL;
939 
940 	nt = netdev_priv(dev);
941 
942 	t = ip_tunnel_find(itn, p, dev->type);
943 
944 	if (t) {
945 		if (t->dev != dev)
946 			return -EEXIST;
947 	} else {
948 		t = nt;
949 
950 		if (dev->type != ARPHRD_ETHER) {
951 			unsigned int nflags = 0;
952 
953 			if (ipv4_is_multicast(p->iph.daddr))
954 				nflags = IFF_BROADCAST;
955 			else if (p->iph.daddr)
956 				nflags = IFF_POINTOPOINT;
957 
958 			if ((dev->flags ^ nflags) &
959 			    (IFF_POINTOPOINT | IFF_BROADCAST))
960 				return -EINVAL;
961 		}
962 	}
963 
964 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
965 	return 0;
966 }
967 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
968 
969 int ip_tunnel_init(struct net_device *dev)
970 {
971 	struct ip_tunnel *tunnel = netdev_priv(dev);
972 	struct iphdr *iph = &tunnel->parms.iph;
973 	int err;
974 
975 	dev->destructor	= ip_tunnel_dev_free;
976 	dev->tstats = alloc_percpu(struct pcpu_tstats);
977 	if (!dev->tstats)
978 		return -ENOMEM;
979 
980 	err = gro_cells_init(&tunnel->gro_cells, dev);
981 	if (err) {
982 		free_percpu(dev->tstats);
983 		return err;
984 	}
985 
986 	tunnel->dev = dev;
987 	strcpy(tunnel->parms.name, dev->name);
988 	iph->version		= 4;
989 	iph->ihl		= 5;
990 
991 	return 0;
992 }
993 EXPORT_SYMBOL_GPL(ip_tunnel_init);
994 
995 void ip_tunnel_uninit(struct net_device *dev)
996 {
997 	struct net *net = dev_net(dev);
998 	struct ip_tunnel *tunnel = netdev_priv(dev);
999 	struct ip_tunnel_net *itn;
1000 
1001 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1002 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1003 	if (itn->fb_tunnel_dev != dev)
1004 		ip_tunnel_del(netdev_priv(dev));
1005 }
1006 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1007 
1008 /* Do least required initialization, rest of init is done in tunnel_init call */
1009 void ip_tunnel_setup(struct net_device *dev, int net_id)
1010 {
1011 	struct ip_tunnel *tunnel = netdev_priv(dev);
1012 	tunnel->ip_tnl_net_id = net_id;
1013 }
1014 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1015 
1016 MODULE_LICENSE("GPL");
1017