xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision e23feb16)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 
58 #if IS_ENABLED(CONFIG_IPV6)
59 #include <net/ipv6.h>
60 #include <net/ip6_fib.h>
61 #include <net/ip6_route.h>
62 #endif
63 
64 static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
65 				   __be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
71 /* Often modified stats are per cpu, other are shared (netdev->stats) */
72 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
73 						struct rtnl_link_stats64 *tot)
74 {
75 	int i;
76 
77 	for_each_possible_cpu(i) {
78 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
79 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
80 		unsigned int start;
81 
82 		do {
83 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
84 			rx_packets = tstats->rx_packets;
85 			tx_packets = tstats->tx_packets;
86 			rx_bytes = tstats->rx_bytes;
87 			tx_bytes = tstats->tx_bytes;
88 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
89 
90 		tot->rx_packets += rx_packets;
91 		tot->tx_packets += tx_packets;
92 		tot->rx_bytes   += rx_bytes;
93 		tot->tx_bytes   += tx_bytes;
94 	}
95 
96 	tot->multicast = dev->stats.multicast;
97 
98 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
99 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
100 	tot->rx_length_errors = dev->stats.rx_length_errors;
101 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
102 	tot->rx_errors = dev->stats.rx_errors;
103 
104 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
105 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
106 	tot->tx_dropped = dev->stats.tx_dropped;
107 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
108 	tot->tx_errors = dev->stats.tx_errors;
109 
110 	tot->collisions  = dev->stats.collisions;
111 
112 	return tot;
113 }
114 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
115 
116 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
117 				__be16 flags, __be32 key)
118 {
119 	if (p->i_flags & TUNNEL_KEY) {
120 		if (flags & TUNNEL_KEY)
121 			return key == p->i_key;
122 		else
123 			/* key expected, none present */
124 			return false;
125 	} else
126 		return !(flags & TUNNEL_KEY);
127 }
128 
129 /* Fallback tunnel: no source, no destination, no key, no options
130 
131    Tunnel hash table:
132    We require exact key match i.e. if a key is present in packet
133    it will match only tunnel with the same key; if it is not present,
134    it will match only keyless tunnel.
135 
136    All keysless packets, if not matched configured keyless tunnels
137    will match fallback tunnel.
138    Given src, dst and key, find appropriate for input tunnel.
139 */
140 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
141 				   int link, __be16 flags,
142 				   __be32 remote, __be32 local,
143 				   __be32 key)
144 {
145 	unsigned int hash;
146 	struct ip_tunnel *t, *cand = NULL;
147 	struct hlist_head *head;
148 
149 	hash = ip_tunnel_hash(itn, key, remote);
150 	head = &itn->tunnels[hash];
151 
152 	hlist_for_each_entry_rcu(t, head, hash_node) {
153 		if (local != t->parms.iph.saddr ||
154 		    remote != t->parms.iph.daddr ||
155 		    !(t->dev->flags & IFF_UP))
156 			continue;
157 
158 		if (!ip_tunnel_key_match(&t->parms, flags, key))
159 			continue;
160 
161 		if (t->parms.link == link)
162 			return t;
163 		else
164 			cand = t;
165 	}
166 
167 	hlist_for_each_entry_rcu(t, head, hash_node) {
168 		if (remote != t->parms.iph.daddr ||
169 		    !(t->dev->flags & IFF_UP))
170 			continue;
171 
172 		if (!ip_tunnel_key_match(&t->parms, flags, key))
173 			continue;
174 
175 		if (t->parms.link == link)
176 			return t;
177 		else if (!cand)
178 			cand = t;
179 	}
180 
181 	hash = ip_tunnel_hash(itn, key, 0);
182 	head = &itn->tunnels[hash];
183 
184 	hlist_for_each_entry_rcu(t, head, hash_node) {
185 		if ((local != t->parms.iph.saddr &&
186 		     (local != t->parms.iph.daddr ||
187 		      !ipv4_is_multicast(local))) ||
188 		    !(t->dev->flags & IFF_UP))
189 			continue;
190 
191 		if (!ip_tunnel_key_match(&t->parms, flags, key))
192 			continue;
193 
194 		if (t->parms.link == link)
195 			return t;
196 		else if (!cand)
197 			cand = t;
198 	}
199 
200 	if (flags & TUNNEL_NO_KEY)
201 		goto skip_key_lookup;
202 
203 	hlist_for_each_entry_rcu(t, head, hash_node) {
204 		if (t->parms.i_key != key ||
205 		    !(t->dev->flags & IFF_UP))
206 			continue;
207 
208 		if (t->parms.link == link)
209 			return t;
210 		else if (!cand)
211 			cand = t;
212 	}
213 
214 skip_key_lookup:
215 	if (cand)
216 		return cand;
217 
218 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
219 		return netdev_priv(itn->fb_tunnel_dev);
220 
221 
222 	return NULL;
223 }
224 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
225 
226 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
227 				    struct ip_tunnel_parm *parms)
228 {
229 	unsigned int h;
230 	__be32 remote;
231 
232 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
233 		remote = parms->iph.daddr;
234 	else
235 		remote = 0;
236 
237 	h = ip_tunnel_hash(itn, parms->i_key, remote);
238 	return &itn->tunnels[h];
239 }
240 
241 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
242 {
243 	struct hlist_head *head = ip_bucket(itn, &t->parms);
244 
245 	hlist_add_head_rcu(&t->hash_node, head);
246 }
247 
248 static void ip_tunnel_del(struct ip_tunnel *t)
249 {
250 	hlist_del_init_rcu(&t->hash_node);
251 }
252 
253 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
254 					struct ip_tunnel_parm *parms,
255 					int type)
256 {
257 	__be32 remote = parms->iph.daddr;
258 	__be32 local = parms->iph.saddr;
259 	__be32 key = parms->i_key;
260 	int link = parms->link;
261 	struct ip_tunnel *t = NULL;
262 	struct hlist_head *head = ip_bucket(itn, parms);
263 
264 	hlist_for_each_entry_rcu(t, head, hash_node) {
265 		if (local == t->parms.iph.saddr &&
266 		    remote == t->parms.iph.daddr &&
267 		    key == t->parms.i_key &&
268 		    link == t->parms.link &&
269 		    type == t->dev->type)
270 			break;
271 	}
272 	return t;
273 }
274 
275 static struct net_device *__ip_tunnel_create(struct net *net,
276 					     const struct rtnl_link_ops *ops,
277 					     struct ip_tunnel_parm *parms)
278 {
279 	int err;
280 	struct ip_tunnel *tunnel;
281 	struct net_device *dev;
282 	char name[IFNAMSIZ];
283 
284 	if (parms->name[0])
285 		strlcpy(name, parms->name, IFNAMSIZ);
286 	else {
287 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
288 			err = -E2BIG;
289 			goto failed;
290 		}
291 		strlcpy(name, ops->kind, IFNAMSIZ);
292 		strncat(name, "%d", 2);
293 	}
294 
295 	ASSERT_RTNL();
296 	dev = alloc_netdev(ops->priv_size, name, ops->setup);
297 	if (!dev) {
298 		err = -ENOMEM;
299 		goto failed;
300 	}
301 	dev_net_set(dev, net);
302 
303 	dev->rtnl_link_ops = ops;
304 
305 	tunnel = netdev_priv(dev);
306 	tunnel->parms = *parms;
307 	tunnel->net = net;
308 
309 	err = register_netdevice(dev);
310 	if (err)
311 		goto failed_free;
312 
313 	return dev;
314 
315 failed_free:
316 	free_netdev(dev);
317 failed:
318 	return ERR_PTR(err);
319 }
320 
321 static inline struct rtable *ip_route_output_tunnel(struct net *net,
322 						    struct flowi4 *fl4,
323 						    int proto,
324 						    __be32 daddr, __be32 saddr,
325 						    __be32 key, __u8 tos, int oif)
326 {
327 	memset(fl4, 0, sizeof(*fl4));
328 	fl4->flowi4_oif = oif;
329 	fl4->daddr = daddr;
330 	fl4->saddr = saddr;
331 	fl4->flowi4_tos = tos;
332 	fl4->flowi4_proto = proto;
333 	fl4->fl4_gre_key = key;
334 	return ip_route_output_key(net, fl4);
335 }
336 
337 static int ip_tunnel_bind_dev(struct net_device *dev)
338 {
339 	struct net_device *tdev = NULL;
340 	struct ip_tunnel *tunnel = netdev_priv(dev);
341 	const struct iphdr *iph;
342 	int hlen = LL_MAX_HEADER;
343 	int mtu = ETH_DATA_LEN;
344 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
345 
346 	iph = &tunnel->parms.iph;
347 
348 	/* Guess output device to choose reasonable mtu and needed_headroom */
349 	if (iph->daddr) {
350 		struct flowi4 fl4;
351 		struct rtable *rt;
352 
353 		rt = ip_route_output_tunnel(tunnel->net, &fl4,
354 					    tunnel->parms.iph.protocol,
355 					    iph->daddr, iph->saddr,
356 					    tunnel->parms.o_key,
357 					    RT_TOS(iph->tos),
358 					    tunnel->parms.link);
359 		if (!IS_ERR(rt)) {
360 			tdev = rt->dst.dev;
361 			ip_rt_put(rt);
362 		}
363 		if (dev->type != ARPHRD_ETHER)
364 			dev->flags |= IFF_POINTOPOINT;
365 	}
366 
367 	if (!tdev && tunnel->parms.link)
368 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
369 
370 	if (tdev) {
371 		hlen = tdev->hard_header_len + tdev->needed_headroom;
372 		mtu = tdev->mtu;
373 	}
374 	dev->iflink = tunnel->parms.link;
375 
376 	dev->needed_headroom = t_hlen + hlen;
377 	mtu -= (dev->hard_header_len + t_hlen);
378 
379 	if (mtu < 68)
380 		mtu = 68;
381 
382 	return mtu;
383 }
384 
385 static struct ip_tunnel *ip_tunnel_create(struct net *net,
386 					  struct ip_tunnel_net *itn,
387 					  struct ip_tunnel_parm *parms)
388 {
389 	struct ip_tunnel *nt, *fbt;
390 	struct net_device *dev;
391 
392 	BUG_ON(!itn->fb_tunnel_dev);
393 	fbt = netdev_priv(itn->fb_tunnel_dev);
394 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
395 	if (IS_ERR(dev))
396 		return NULL;
397 
398 	dev->mtu = ip_tunnel_bind_dev(dev);
399 
400 	nt = netdev_priv(dev);
401 	ip_tunnel_add(itn, nt);
402 	return nt;
403 }
404 
405 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
406 		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
407 {
408 	struct pcpu_tstats *tstats;
409 	const struct iphdr *iph = ip_hdr(skb);
410 	int err;
411 
412 #ifdef CONFIG_NET_IPGRE_BROADCAST
413 	if (ipv4_is_multicast(iph->daddr)) {
414 		/* Looped back packet, drop it! */
415 		if (rt_is_output_route(skb_rtable(skb)))
416 			goto drop;
417 		tunnel->dev->stats.multicast++;
418 		skb->pkt_type = PACKET_BROADCAST;
419 	}
420 #endif
421 
422 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
423 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
424 		tunnel->dev->stats.rx_crc_errors++;
425 		tunnel->dev->stats.rx_errors++;
426 		goto drop;
427 	}
428 
429 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
430 		if (!(tpi->flags&TUNNEL_SEQ) ||
431 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
432 			tunnel->dev->stats.rx_fifo_errors++;
433 			tunnel->dev->stats.rx_errors++;
434 			goto drop;
435 		}
436 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
437 	}
438 
439 	err = IP_ECN_decapsulate(iph, skb);
440 	if (unlikely(err)) {
441 		if (log_ecn_error)
442 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
443 					&iph->saddr, iph->tos);
444 		if (err > 1) {
445 			++tunnel->dev->stats.rx_frame_errors;
446 			++tunnel->dev->stats.rx_errors;
447 			goto drop;
448 		}
449 	}
450 
451 	tstats = this_cpu_ptr(tunnel->dev->tstats);
452 	u64_stats_update_begin(&tstats->syncp);
453 	tstats->rx_packets++;
454 	tstats->rx_bytes += skb->len;
455 	u64_stats_update_end(&tstats->syncp);
456 
457 	if (tunnel->dev->type == ARPHRD_ETHER) {
458 		skb->protocol = eth_type_trans(skb, tunnel->dev);
459 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
460 	} else {
461 		skb->dev = tunnel->dev;
462 	}
463 
464 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
465 
466 	gro_cells_receive(&tunnel->gro_cells, skb);
467 	return 0;
468 
469 drop:
470 	kfree_skb(skb);
471 	return 0;
472 }
473 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
474 
475 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
476 			    struct rtable *rt, __be16 df)
477 {
478 	struct ip_tunnel *tunnel = netdev_priv(dev);
479 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
480 	int mtu;
481 
482 	if (df)
483 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
484 					- sizeof(struct iphdr) - tunnel->hlen;
485 	else
486 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
487 
488 	if (skb_dst(skb))
489 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
490 
491 	if (skb->protocol == htons(ETH_P_IP)) {
492 		if (!skb_is_gso(skb) &&
493 		    (df & htons(IP_DF)) && mtu < pkt_size) {
494 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
495 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
496 			return -E2BIG;
497 		}
498 	}
499 #if IS_ENABLED(CONFIG_IPV6)
500 	else if (skb->protocol == htons(ETH_P_IPV6)) {
501 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
502 
503 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
504 			   mtu >= IPV6_MIN_MTU) {
505 			if ((tunnel->parms.iph.daddr &&
506 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
507 			    rt6->rt6i_dst.plen == 128) {
508 				rt6->rt6i_flags |= RTF_MODIFIED;
509 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
510 			}
511 		}
512 
513 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
514 					mtu < pkt_size) {
515 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
516 			return -E2BIG;
517 		}
518 	}
519 #endif
520 	return 0;
521 }
522 
523 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
524 		    const struct iphdr *tnl_params, const u8 protocol)
525 {
526 	struct ip_tunnel *tunnel = netdev_priv(dev);
527 	const struct iphdr *inner_iph;
528 	struct flowi4 fl4;
529 	u8     tos, ttl;
530 	__be16 df;
531 	struct rtable *rt;		/* Route to the other host */
532 	unsigned int max_headroom;	/* The extra header space needed */
533 	__be32 dst;
534 	int err;
535 
536 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
537 
538 	dst = tnl_params->daddr;
539 	if (dst == 0) {
540 		/* NBMA tunnel */
541 
542 		if (skb_dst(skb) == NULL) {
543 			dev->stats.tx_fifo_errors++;
544 			goto tx_error;
545 		}
546 
547 		if (skb->protocol == htons(ETH_P_IP)) {
548 			rt = skb_rtable(skb);
549 			dst = rt_nexthop(rt, inner_iph->daddr);
550 		}
551 #if IS_ENABLED(CONFIG_IPV6)
552 		else if (skb->protocol == htons(ETH_P_IPV6)) {
553 			const struct in6_addr *addr6;
554 			struct neighbour *neigh;
555 			bool do_tx_error_icmp;
556 			int addr_type;
557 
558 			neigh = dst_neigh_lookup(skb_dst(skb),
559 						 &ipv6_hdr(skb)->daddr);
560 			if (neigh == NULL)
561 				goto tx_error;
562 
563 			addr6 = (const struct in6_addr *)&neigh->primary_key;
564 			addr_type = ipv6_addr_type(addr6);
565 
566 			if (addr_type == IPV6_ADDR_ANY) {
567 				addr6 = &ipv6_hdr(skb)->daddr;
568 				addr_type = ipv6_addr_type(addr6);
569 			}
570 
571 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
572 				do_tx_error_icmp = true;
573 			else {
574 				do_tx_error_icmp = false;
575 				dst = addr6->s6_addr32[3];
576 			}
577 			neigh_release(neigh);
578 			if (do_tx_error_icmp)
579 				goto tx_error_icmp;
580 		}
581 #endif
582 		else
583 			goto tx_error;
584 	}
585 
586 	tos = tnl_params->tos;
587 	if (tos & 0x1) {
588 		tos &= ~0x1;
589 		if (skb->protocol == htons(ETH_P_IP))
590 			tos = inner_iph->tos;
591 		else if (skb->protocol == htons(ETH_P_IPV6))
592 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
593 	}
594 
595 	rt = ip_route_output_tunnel(tunnel->net, &fl4,
596 				    protocol,
597 				    dst, tnl_params->saddr,
598 				    tunnel->parms.o_key,
599 				    RT_TOS(tos),
600 				    tunnel->parms.link);
601 	if (IS_ERR(rt)) {
602 		dev->stats.tx_carrier_errors++;
603 		goto tx_error;
604 	}
605 	if (rt->dst.dev == dev) {
606 		ip_rt_put(rt);
607 		dev->stats.collisions++;
608 		goto tx_error;
609 	}
610 
611 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
612 		ip_rt_put(rt);
613 		goto tx_error;
614 	}
615 
616 	if (tunnel->err_count > 0) {
617 		if (time_before(jiffies,
618 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
619 			tunnel->err_count--;
620 
621 			dst_link_failure(skb);
622 		} else
623 			tunnel->err_count = 0;
624 	}
625 
626 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
627 	ttl = tnl_params->ttl;
628 	if (ttl == 0) {
629 		if (skb->protocol == htons(ETH_P_IP))
630 			ttl = inner_iph->ttl;
631 #if IS_ENABLED(CONFIG_IPV6)
632 		else if (skb->protocol == htons(ETH_P_IPV6))
633 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
634 #endif
635 		else
636 			ttl = ip4_dst_hoplimit(&rt->dst);
637 	}
638 
639 	df = tnl_params->frag_off;
640 	if (skb->protocol == htons(ETH_P_IP))
641 		df |= (inner_iph->frag_off&htons(IP_DF));
642 
643 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
644 			+ rt->dst.header_len;
645 	if (max_headroom > dev->needed_headroom)
646 		dev->needed_headroom = max_headroom;
647 
648 	if (skb_cow_head(skb, dev->needed_headroom)) {
649 		dev->stats.tx_dropped++;
650 		dev_kfree_skb(skb);
651 		return;
652 	}
653 
654 	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
655 			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
656 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
657 
658 	return;
659 
660 #if IS_ENABLED(CONFIG_IPV6)
661 tx_error_icmp:
662 	dst_link_failure(skb);
663 #endif
664 tx_error:
665 	dev->stats.tx_errors++;
666 	dev_kfree_skb(skb);
667 }
668 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
669 
670 static void ip_tunnel_update(struct ip_tunnel_net *itn,
671 			     struct ip_tunnel *t,
672 			     struct net_device *dev,
673 			     struct ip_tunnel_parm *p,
674 			     bool set_mtu)
675 {
676 	ip_tunnel_del(t);
677 	t->parms.iph.saddr = p->iph.saddr;
678 	t->parms.iph.daddr = p->iph.daddr;
679 	t->parms.i_key = p->i_key;
680 	t->parms.o_key = p->o_key;
681 	if (dev->type != ARPHRD_ETHER) {
682 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
683 		memcpy(dev->broadcast, &p->iph.daddr, 4);
684 	}
685 	ip_tunnel_add(itn, t);
686 
687 	t->parms.iph.ttl = p->iph.ttl;
688 	t->parms.iph.tos = p->iph.tos;
689 	t->parms.iph.frag_off = p->iph.frag_off;
690 
691 	if (t->parms.link != p->link) {
692 		int mtu;
693 
694 		t->parms.link = p->link;
695 		mtu = ip_tunnel_bind_dev(dev);
696 		if (set_mtu)
697 			dev->mtu = mtu;
698 	}
699 	netdev_state_change(dev);
700 }
701 
702 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
703 {
704 	int err = 0;
705 	struct ip_tunnel *t;
706 	struct net *net = dev_net(dev);
707 	struct ip_tunnel *tunnel = netdev_priv(dev);
708 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
709 
710 	BUG_ON(!itn->fb_tunnel_dev);
711 	switch (cmd) {
712 	case SIOCGETTUNNEL:
713 		t = NULL;
714 		if (dev == itn->fb_tunnel_dev)
715 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
716 		if (t == NULL)
717 			t = netdev_priv(dev);
718 		memcpy(p, &t->parms, sizeof(*p));
719 		break;
720 
721 	case SIOCADDTUNNEL:
722 	case SIOCCHGTUNNEL:
723 		err = -EPERM;
724 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
725 			goto done;
726 		if (p->iph.ttl)
727 			p->iph.frag_off |= htons(IP_DF);
728 		if (!(p->i_flags&TUNNEL_KEY))
729 			p->i_key = 0;
730 		if (!(p->o_flags&TUNNEL_KEY))
731 			p->o_key = 0;
732 
733 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
734 
735 		if (!t && (cmd == SIOCADDTUNNEL))
736 			t = ip_tunnel_create(net, itn, p);
737 
738 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
739 			if (t != NULL) {
740 				if (t->dev != dev) {
741 					err = -EEXIST;
742 					break;
743 				}
744 			} else {
745 				unsigned int nflags = 0;
746 
747 				if (ipv4_is_multicast(p->iph.daddr))
748 					nflags = IFF_BROADCAST;
749 				else if (p->iph.daddr)
750 					nflags = IFF_POINTOPOINT;
751 
752 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
753 					err = -EINVAL;
754 					break;
755 				}
756 
757 				t = netdev_priv(dev);
758 			}
759 		}
760 
761 		if (t) {
762 			err = 0;
763 			ip_tunnel_update(itn, t, dev, p, true);
764 		} else
765 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
766 		break;
767 
768 	case SIOCDELTUNNEL:
769 		err = -EPERM;
770 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
771 			goto done;
772 
773 		if (dev == itn->fb_tunnel_dev) {
774 			err = -ENOENT;
775 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
776 			if (t == NULL)
777 				goto done;
778 			err = -EPERM;
779 			if (t == netdev_priv(itn->fb_tunnel_dev))
780 				goto done;
781 			dev = t->dev;
782 		}
783 		unregister_netdevice(dev);
784 		err = 0;
785 		break;
786 
787 	default:
788 		err = -EINVAL;
789 	}
790 
791 done:
792 	return err;
793 }
794 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
795 
796 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
797 {
798 	struct ip_tunnel *tunnel = netdev_priv(dev);
799 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
800 
801 	if (new_mtu < 68 ||
802 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
803 		return -EINVAL;
804 	dev->mtu = new_mtu;
805 	return 0;
806 }
807 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
808 
809 static void ip_tunnel_dev_free(struct net_device *dev)
810 {
811 	struct ip_tunnel *tunnel = netdev_priv(dev);
812 
813 	gro_cells_destroy(&tunnel->gro_cells);
814 	free_percpu(dev->tstats);
815 	free_netdev(dev);
816 }
817 
818 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
819 {
820 	struct ip_tunnel *tunnel = netdev_priv(dev);
821 	struct ip_tunnel_net *itn;
822 
823 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
824 
825 	if (itn->fb_tunnel_dev != dev) {
826 		ip_tunnel_del(netdev_priv(dev));
827 		unregister_netdevice_queue(dev, head);
828 	}
829 }
830 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
831 
832 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
833 				  struct rtnl_link_ops *ops, char *devname)
834 {
835 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
836 	struct ip_tunnel_parm parms;
837 	unsigned int i;
838 
839 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
840 		INIT_HLIST_HEAD(&itn->tunnels[i]);
841 
842 	if (!ops) {
843 		itn->fb_tunnel_dev = NULL;
844 		return 0;
845 	}
846 
847 	memset(&parms, 0, sizeof(parms));
848 	if (devname)
849 		strlcpy(parms.name, devname, IFNAMSIZ);
850 
851 	rtnl_lock();
852 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
853 	/* FB netdevice is special: we have one, and only one per netns.
854 	 * Allowing to move it to another netns is clearly unsafe.
855 	 */
856 	if (!IS_ERR(itn->fb_tunnel_dev)) {
857 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
858 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
859 	}
860 	rtnl_unlock();
861 
862 	return PTR_RET(itn->fb_tunnel_dev);
863 }
864 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
865 
866 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
867 			      struct rtnl_link_ops *ops)
868 {
869 	struct net *net = dev_net(itn->fb_tunnel_dev);
870 	struct net_device *dev, *aux;
871 	int h;
872 
873 	for_each_netdev_safe(net, dev, aux)
874 		if (dev->rtnl_link_ops == ops)
875 			unregister_netdevice_queue(dev, head);
876 
877 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
878 		struct ip_tunnel *t;
879 		struct hlist_node *n;
880 		struct hlist_head *thead = &itn->tunnels[h];
881 
882 		hlist_for_each_entry_safe(t, n, thead, hash_node)
883 			/* If dev is in the same netns, it has already
884 			 * been added to the list by the previous loop.
885 			 */
886 			if (!net_eq(dev_net(t->dev), net))
887 				unregister_netdevice_queue(t->dev, head);
888 	}
889 }
890 
891 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
892 {
893 	LIST_HEAD(list);
894 
895 	rtnl_lock();
896 	ip_tunnel_destroy(itn, &list, ops);
897 	unregister_netdevice_many(&list);
898 	rtnl_unlock();
899 }
900 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
901 
902 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
903 		      struct ip_tunnel_parm *p)
904 {
905 	struct ip_tunnel *nt;
906 	struct net *net = dev_net(dev);
907 	struct ip_tunnel_net *itn;
908 	int mtu;
909 	int err;
910 
911 	nt = netdev_priv(dev);
912 	itn = net_generic(net, nt->ip_tnl_net_id);
913 
914 	if (ip_tunnel_find(itn, p, dev->type))
915 		return -EEXIST;
916 
917 	nt->net = net;
918 	nt->parms = *p;
919 	err = register_netdevice(dev);
920 	if (err)
921 		goto out;
922 
923 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
924 		eth_hw_addr_random(dev);
925 
926 	mtu = ip_tunnel_bind_dev(dev);
927 	if (!tb[IFLA_MTU])
928 		dev->mtu = mtu;
929 
930 	ip_tunnel_add(itn, nt);
931 
932 out:
933 	return err;
934 }
935 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
936 
937 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
938 			 struct ip_tunnel_parm *p)
939 {
940 	struct ip_tunnel *t;
941 	struct ip_tunnel *tunnel = netdev_priv(dev);
942 	struct net *net = tunnel->net;
943 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
944 
945 	if (dev == itn->fb_tunnel_dev)
946 		return -EINVAL;
947 
948 	t = ip_tunnel_find(itn, p, dev->type);
949 
950 	if (t) {
951 		if (t->dev != dev)
952 			return -EEXIST;
953 	} else {
954 		t = tunnel;
955 
956 		if (dev->type != ARPHRD_ETHER) {
957 			unsigned int nflags = 0;
958 
959 			if (ipv4_is_multicast(p->iph.daddr))
960 				nflags = IFF_BROADCAST;
961 			else if (p->iph.daddr)
962 				nflags = IFF_POINTOPOINT;
963 
964 			if ((dev->flags ^ nflags) &
965 			    (IFF_POINTOPOINT | IFF_BROADCAST))
966 				return -EINVAL;
967 		}
968 	}
969 
970 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
971 	return 0;
972 }
973 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
974 
975 int ip_tunnel_init(struct net_device *dev)
976 {
977 	struct ip_tunnel *tunnel = netdev_priv(dev);
978 	struct iphdr *iph = &tunnel->parms.iph;
979 	int err;
980 
981 	dev->destructor	= ip_tunnel_dev_free;
982 	dev->tstats = alloc_percpu(struct pcpu_tstats);
983 	if (!dev->tstats)
984 		return -ENOMEM;
985 
986 	err = gro_cells_init(&tunnel->gro_cells, dev);
987 	if (err) {
988 		free_percpu(dev->tstats);
989 		return err;
990 	}
991 
992 	tunnel->dev = dev;
993 	tunnel->net = dev_net(dev);
994 	strcpy(tunnel->parms.name, dev->name);
995 	iph->version		= 4;
996 	iph->ihl		= 5;
997 
998 	return 0;
999 }
1000 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1001 
1002 void ip_tunnel_uninit(struct net_device *dev)
1003 {
1004 	struct ip_tunnel *tunnel = netdev_priv(dev);
1005 	struct net *net = tunnel->net;
1006 	struct ip_tunnel_net *itn;
1007 
1008 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1009 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1010 	if (itn->fb_tunnel_dev != dev)
1011 		ip_tunnel_del(netdev_priv(dev));
1012 }
1013 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1014 
1015 /* Do least required initialization, rest of init is done in tunnel_init call */
1016 void ip_tunnel_setup(struct net_device *dev, int net_id)
1017 {
1018 	struct ip_tunnel *tunnel = netdev_priv(dev);
1019 	tunnel->ip_tnl_net_id = net_id;
1020 }
1021 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1022 
1023 MODULE_LICENSE("GPL");
1024