xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 7b6d864b)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 
58 #if IS_ENABLED(CONFIG_IPV6)
59 #include <net/ipv6.h>
60 #include <net/ip6_fib.h>
61 #include <net/ip6_route.h>
62 #endif
63 
64 static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
65 				   __be32 key, __be32 remote)
66 {
67 	return hash_32((__force u32)key ^ (__force u32)remote,
68 			 IP_TNL_HASH_BITS);
69 }
70 
71 /* Often modified stats are per cpu, other are shared (netdev->stats) */
72 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
73 						struct rtnl_link_stats64 *tot)
74 {
75 	int i;
76 
77 	for_each_possible_cpu(i) {
78 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
79 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
80 		unsigned int start;
81 
82 		do {
83 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
84 			rx_packets = tstats->rx_packets;
85 			tx_packets = tstats->tx_packets;
86 			rx_bytes = tstats->rx_bytes;
87 			tx_bytes = tstats->tx_bytes;
88 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
89 
90 		tot->rx_packets += rx_packets;
91 		tot->tx_packets += tx_packets;
92 		tot->rx_bytes   += rx_bytes;
93 		tot->tx_bytes   += tx_bytes;
94 	}
95 
96 	tot->multicast = dev->stats.multicast;
97 
98 	tot->rx_crc_errors = dev->stats.rx_crc_errors;
99 	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
100 	tot->rx_length_errors = dev->stats.rx_length_errors;
101 	tot->rx_frame_errors = dev->stats.rx_frame_errors;
102 	tot->rx_errors = dev->stats.rx_errors;
103 
104 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
105 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
106 	tot->tx_dropped = dev->stats.tx_dropped;
107 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
108 	tot->tx_errors = dev->stats.tx_errors;
109 
110 	tot->collisions  = dev->stats.collisions;
111 
112 	return tot;
113 }
114 EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
115 
116 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
117 				__be16 flags, __be32 key)
118 {
119 	if (p->i_flags & TUNNEL_KEY) {
120 		if (flags & TUNNEL_KEY)
121 			return key == p->i_key;
122 		else
123 			/* key expected, none present */
124 			return false;
125 	} else
126 		return !(flags & TUNNEL_KEY);
127 }
128 
129 /* Fallback tunnel: no source, no destination, no key, no options
130 
131    Tunnel hash table:
132    We require exact key match i.e. if a key is present in packet
133    it will match only tunnel with the same key; if it is not present,
134    it will match only keyless tunnel.
135 
136    All keysless packets, if not matched configured keyless tunnels
137    will match fallback tunnel.
138    Given src, dst and key, find appropriate for input tunnel.
139 */
140 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
141 				   int link, __be16 flags,
142 				   __be32 remote, __be32 local,
143 				   __be32 key)
144 {
145 	unsigned int hash;
146 	struct ip_tunnel *t, *cand = NULL;
147 	struct hlist_head *head;
148 
149 	hash = ip_tunnel_hash(itn, key, remote);
150 	head = &itn->tunnels[hash];
151 
152 	hlist_for_each_entry_rcu(t, head, hash_node) {
153 		if (local != t->parms.iph.saddr ||
154 		    remote != t->parms.iph.daddr ||
155 		    !(t->dev->flags & IFF_UP))
156 			continue;
157 
158 		if (!ip_tunnel_key_match(&t->parms, flags, key))
159 			continue;
160 
161 		if (t->parms.link == link)
162 			return t;
163 		else
164 			cand = t;
165 	}
166 
167 	hlist_for_each_entry_rcu(t, head, hash_node) {
168 		if (remote != t->parms.iph.daddr ||
169 		    !(t->dev->flags & IFF_UP))
170 			continue;
171 
172 		if (!ip_tunnel_key_match(&t->parms, flags, key))
173 			continue;
174 
175 		if (t->parms.link == link)
176 			return t;
177 		else if (!cand)
178 			cand = t;
179 	}
180 
181 	hash = ip_tunnel_hash(itn, key, 0);
182 	head = &itn->tunnels[hash];
183 
184 	hlist_for_each_entry_rcu(t, head, hash_node) {
185 		if ((local != t->parms.iph.saddr &&
186 		     (local != t->parms.iph.daddr ||
187 		      !ipv4_is_multicast(local))) ||
188 		    !(t->dev->flags & IFF_UP))
189 			continue;
190 
191 		if (!ip_tunnel_key_match(&t->parms, flags, key))
192 			continue;
193 
194 		if (t->parms.link == link)
195 			return t;
196 		else if (!cand)
197 			cand = t;
198 	}
199 
200 	if (flags & TUNNEL_NO_KEY)
201 		goto skip_key_lookup;
202 
203 	hlist_for_each_entry_rcu(t, head, hash_node) {
204 		if (t->parms.i_key != key ||
205 		    !(t->dev->flags & IFF_UP))
206 			continue;
207 
208 		if (t->parms.link == link)
209 			return t;
210 		else if (!cand)
211 			cand = t;
212 	}
213 
214 skip_key_lookup:
215 	if (cand)
216 		return cand;
217 
218 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
219 		return netdev_priv(itn->fb_tunnel_dev);
220 
221 
222 	return NULL;
223 }
224 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
225 
226 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
227 				    struct ip_tunnel_parm *parms)
228 {
229 	unsigned int h;
230 	__be32 remote;
231 
232 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
233 		remote = parms->iph.daddr;
234 	else
235 		remote = 0;
236 
237 	h = ip_tunnel_hash(itn, parms->i_key, remote);
238 	return &itn->tunnels[h];
239 }
240 
241 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
242 {
243 	struct hlist_head *head = ip_bucket(itn, &t->parms);
244 
245 	hlist_add_head_rcu(&t->hash_node, head);
246 }
247 
248 static void ip_tunnel_del(struct ip_tunnel *t)
249 {
250 	hlist_del_init_rcu(&t->hash_node);
251 }
252 
253 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
254 					struct ip_tunnel_parm *parms,
255 					int type)
256 {
257 	__be32 remote = parms->iph.daddr;
258 	__be32 local = parms->iph.saddr;
259 	__be32 key = parms->i_key;
260 	int link = parms->link;
261 	struct ip_tunnel *t = NULL;
262 	struct hlist_head *head = ip_bucket(itn, parms);
263 
264 	hlist_for_each_entry_rcu(t, head, hash_node) {
265 		if (local == t->parms.iph.saddr &&
266 		    remote == t->parms.iph.daddr &&
267 		    key == t->parms.i_key &&
268 		    link == t->parms.link &&
269 		    type == t->dev->type)
270 			break;
271 	}
272 	return t;
273 }
274 
275 static struct net_device *__ip_tunnel_create(struct net *net,
276 					     const struct rtnl_link_ops *ops,
277 					     struct ip_tunnel_parm *parms)
278 {
279 	int err;
280 	struct ip_tunnel *tunnel;
281 	struct net_device *dev;
282 	char name[IFNAMSIZ];
283 
284 	if (parms->name[0])
285 		strlcpy(name, parms->name, IFNAMSIZ);
286 	else {
287 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
288 			err = -E2BIG;
289 			goto failed;
290 		}
291 		strlcpy(name, ops->kind, IFNAMSIZ);
292 		strncat(name, "%d", 2);
293 	}
294 
295 	ASSERT_RTNL();
296 	dev = alloc_netdev(ops->priv_size, name, ops->setup);
297 	if (!dev) {
298 		err = -ENOMEM;
299 		goto failed;
300 	}
301 	dev_net_set(dev, net);
302 
303 	dev->rtnl_link_ops = ops;
304 
305 	tunnel = netdev_priv(dev);
306 	tunnel->parms = *parms;
307 
308 	err = register_netdevice(dev);
309 	if (err)
310 		goto failed_free;
311 
312 	return dev;
313 
314 failed_free:
315 	free_netdev(dev);
316 failed:
317 	return ERR_PTR(err);
318 }
319 
320 static inline struct rtable *ip_route_output_tunnel(struct net *net,
321 						    struct flowi4 *fl4,
322 						    int proto,
323 						    __be32 daddr, __be32 saddr,
324 						    __be32 key, __u8 tos, int oif)
325 {
326 	memset(fl4, 0, sizeof(*fl4));
327 	fl4->flowi4_oif = oif;
328 	fl4->daddr = daddr;
329 	fl4->saddr = saddr;
330 	fl4->flowi4_tos = tos;
331 	fl4->flowi4_proto = proto;
332 	fl4->fl4_gre_key = key;
333 	return ip_route_output_key(net, fl4);
334 }
335 
336 static int ip_tunnel_bind_dev(struct net_device *dev)
337 {
338 	struct net_device *tdev = NULL;
339 	struct ip_tunnel *tunnel = netdev_priv(dev);
340 	const struct iphdr *iph;
341 	int hlen = LL_MAX_HEADER;
342 	int mtu = ETH_DATA_LEN;
343 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
344 
345 	iph = &tunnel->parms.iph;
346 
347 	/* Guess output device to choose reasonable mtu and needed_headroom */
348 	if (iph->daddr) {
349 		struct flowi4 fl4;
350 		struct rtable *rt;
351 
352 		rt = ip_route_output_tunnel(dev_net(dev), &fl4,
353 					    tunnel->parms.iph.protocol,
354 					    iph->daddr, iph->saddr,
355 					    tunnel->parms.o_key,
356 					    RT_TOS(iph->tos),
357 					    tunnel->parms.link);
358 		if (!IS_ERR(rt)) {
359 			tdev = rt->dst.dev;
360 			ip_rt_put(rt);
361 		}
362 		if (dev->type != ARPHRD_ETHER)
363 			dev->flags |= IFF_POINTOPOINT;
364 	}
365 
366 	if (!tdev && tunnel->parms.link)
367 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
368 
369 	if (tdev) {
370 		hlen = tdev->hard_header_len + tdev->needed_headroom;
371 		mtu = tdev->mtu;
372 	}
373 	dev->iflink = tunnel->parms.link;
374 
375 	dev->needed_headroom = t_hlen + hlen;
376 	mtu -= (dev->hard_header_len + t_hlen);
377 
378 	if (mtu < 68)
379 		mtu = 68;
380 
381 	return mtu;
382 }
383 
384 static struct ip_tunnel *ip_tunnel_create(struct net *net,
385 					  struct ip_tunnel_net *itn,
386 					  struct ip_tunnel_parm *parms)
387 {
388 	struct ip_tunnel *nt, *fbt;
389 	struct net_device *dev;
390 
391 	BUG_ON(!itn->fb_tunnel_dev);
392 	fbt = netdev_priv(itn->fb_tunnel_dev);
393 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
394 	if (IS_ERR(dev))
395 		return NULL;
396 
397 	dev->mtu = ip_tunnel_bind_dev(dev);
398 
399 	nt = netdev_priv(dev);
400 	ip_tunnel_add(itn, nt);
401 	return nt;
402 }
403 
404 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
405 		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
406 {
407 	struct pcpu_tstats *tstats;
408 	const struct iphdr *iph = ip_hdr(skb);
409 	int err;
410 
411 	secpath_reset(skb);
412 
413 	skb->protocol = tpi->proto;
414 
415 	skb->mac_header = skb->network_header;
416 	__pskb_pull(skb, tunnel->hlen);
417 	skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
418 #ifdef CONFIG_NET_IPGRE_BROADCAST
419 	if (ipv4_is_multicast(iph->daddr)) {
420 		/* Looped back packet, drop it! */
421 		if (rt_is_output_route(skb_rtable(skb)))
422 			goto drop;
423 		tunnel->dev->stats.multicast++;
424 		skb->pkt_type = PACKET_BROADCAST;
425 	}
426 #endif
427 
428 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
429 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
430 		tunnel->dev->stats.rx_crc_errors++;
431 		tunnel->dev->stats.rx_errors++;
432 		goto drop;
433 	}
434 
435 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
436 		if (!(tpi->flags&TUNNEL_SEQ) ||
437 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
438 			tunnel->dev->stats.rx_fifo_errors++;
439 			tunnel->dev->stats.rx_errors++;
440 			goto drop;
441 		}
442 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
443 	}
444 
445 	/* Warning: All skb pointers will be invalidated! */
446 	if (tunnel->dev->type == ARPHRD_ETHER) {
447 		if (!pskb_may_pull(skb, ETH_HLEN)) {
448 			tunnel->dev->stats.rx_length_errors++;
449 			tunnel->dev->stats.rx_errors++;
450 			goto drop;
451 		}
452 
453 		iph = ip_hdr(skb);
454 		skb->protocol = eth_type_trans(skb, tunnel->dev);
455 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
456 	}
457 
458 	skb->pkt_type = PACKET_HOST;
459 	__skb_tunnel_rx(skb, tunnel->dev);
460 
461 	skb_reset_network_header(skb);
462 	err = IP_ECN_decapsulate(iph, skb);
463 	if (unlikely(err)) {
464 		if (log_ecn_error)
465 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
466 					&iph->saddr, iph->tos);
467 		if (err > 1) {
468 			++tunnel->dev->stats.rx_frame_errors;
469 			++tunnel->dev->stats.rx_errors;
470 			goto drop;
471 		}
472 	}
473 
474 	tstats = this_cpu_ptr(tunnel->dev->tstats);
475 	u64_stats_update_begin(&tstats->syncp);
476 	tstats->rx_packets++;
477 	tstats->rx_bytes += skb->len;
478 	u64_stats_update_end(&tstats->syncp);
479 
480 	gro_cells_receive(&tunnel->gro_cells, skb);
481 	return 0;
482 
483 drop:
484 	kfree_skb(skb);
485 	return 0;
486 }
487 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
488 
489 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
490 		    const struct iphdr *tnl_params)
491 {
492 	struct ip_tunnel *tunnel = netdev_priv(dev);
493 	const struct iphdr *inner_iph;
494 	struct iphdr *iph;
495 	struct flowi4 fl4;
496 	u8     tos, ttl;
497 	__be16 df;
498 	struct rtable *rt;		/* Route to the other host */
499 	struct net_device *tdev;	/* Device to other host */
500 	unsigned int max_headroom;	/* The extra header space needed */
501 	__be32 dst;
502 	int mtu;
503 
504 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
505 
506 	memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
507 	dst = tnl_params->daddr;
508 	if (dst == 0) {
509 		/* NBMA tunnel */
510 
511 		if (skb_dst(skb) == NULL) {
512 			dev->stats.tx_fifo_errors++;
513 			goto tx_error;
514 		}
515 
516 		if (skb->protocol == htons(ETH_P_IP)) {
517 			rt = skb_rtable(skb);
518 			dst = rt_nexthop(rt, inner_iph->daddr);
519 		}
520 #if IS_ENABLED(CONFIG_IPV6)
521 		else if (skb->protocol == htons(ETH_P_IPV6)) {
522 			const struct in6_addr *addr6;
523 			struct neighbour *neigh;
524 			bool do_tx_error_icmp;
525 			int addr_type;
526 
527 			neigh = dst_neigh_lookup(skb_dst(skb),
528 						 &ipv6_hdr(skb)->daddr);
529 			if (neigh == NULL)
530 				goto tx_error;
531 
532 			addr6 = (const struct in6_addr *)&neigh->primary_key;
533 			addr_type = ipv6_addr_type(addr6);
534 
535 			if (addr_type == IPV6_ADDR_ANY) {
536 				addr6 = &ipv6_hdr(skb)->daddr;
537 				addr_type = ipv6_addr_type(addr6);
538 			}
539 
540 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
541 				do_tx_error_icmp = true;
542 			else {
543 				do_tx_error_icmp = false;
544 				dst = addr6->s6_addr32[3];
545 			}
546 			neigh_release(neigh);
547 			if (do_tx_error_icmp)
548 				goto tx_error_icmp;
549 		}
550 #endif
551 		else
552 			goto tx_error;
553 	}
554 
555 	tos = tnl_params->tos;
556 	if (tos & 0x1) {
557 		tos &= ~0x1;
558 		if (skb->protocol == htons(ETH_P_IP))
559 			tos = inner_iph->tos;
560 		else if (skb->protocol == htons(ETH_P_IPV6))
561 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
562 	}
563 
564 	rt = ip_route_output_tunnel(dev_net(dev), &fl4,
565 				    tunnel->parms.iph.protocol,
566 				    dst, tnl_params->saddr,
567 				    tunnel->parms.o_key,
568 				    RT_TOS(tos),
569 				    tunnel->parms.link);
570 	if (IS_ERR(rt)) {
571 		dev->stats.tx_carrier_errors++;
572 		goto tx_error;
573 	}
574 	tdev = rt->dst.dev;
575 
576 	if (tdev == dev) {
577 		ip_rt_put(rt);
578 		dev->stats.collisions++;
579 		goto tx_error;
580 	}
581 
582 	df = tnl_params->frag_off;
583 
584 	if (df)
585 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
586 					- sizeof(struct iphdr);
587 	else
588 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
589 
590 	if (skb_dst(skb))
591 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
592 
593 	if (skb->protocol == htons(ETH_P_IP)) {
594 		df |= (inner_iph->frag_off&htons(IP_DF));
595 
596 		if (!skb_is_gso(skb) &&
597 		    (inner_iph->frag_off&htons(IP_DF)) &&
598 		     mtu < ntohs(inner_iph->tot_len)) {
599 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
600 			ip_rt_put(rt);
601 			goto tx_error;
602 		}
603 	}
604 #if IS_ENABLED(CONFIG_IPV6)
605 	else if (skb->protocol == htons(ETH_P_IPV6)) {
606 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
607 
608 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
609 		    mtu >= IPV6_MIN_MTU) {
610 			if ((tunnel->parms.iph.daddr &&
611 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
612 			    rt6->rt6i_dst.plen == 128) {
613 				rt6->rt6i_flags |= RTF_MODIFIED;
614 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
615 			}
616 		}
617 
618 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
619 		    mtu < skb->len) {
620 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
621 			ip_rt_put(rt);
622 			goto tx_error;
623 		}
624 	}
625 #endif
626 
627 	if (tunnel->err_count > 0) {
628 		if (time_before(jiffies,
629 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
630 			tunnel->err_count--;
631 
632 			dst_link_failure(skb);
633 		} else
634 			tunnel->err_count = 0;
635 	}
636 
637 	ttl = tnl_params->ttl;
638 	if (ttl == 0) {
639 		if (skb->protocol == htons(ETH_P_IP))
640 			ttl = inner_iph->ttl;
641 #if IS_ENABLED(CONFIG_IPV6)
642 		else if (skb->protocol == htons(ETH_P_IPV6))
643 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
644 #endif
645 		else
646 			ttl = ip4_dst_hoplimit(&rt->dst);
647 	}
648 
649 	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
650 					       + rt->dst.header_len;
651 	if (max_headroom > dev->needed_headroom) {
652 		dev->needed_headroom = max_headroom;
653 		if (skb_cow_head(skb, dev->needed_headroom)) {
654 			dev->stats.tx_dropped++;
655 			dev_kfree_skb(skb);
656 			return;
657 		}
658 	}
659 
660 	skb_dst_drop(skb);
661 	skb_dst_set(skb, &rt->dst);
662 
663 	/* Push down and install the IP header. */
664 	skb_push(skb, sizeof(struct iphdr));
665 	skb_reset_network_header(skb);
666 
667 	iph = ip_hdr(skb);
668 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
669 
670 	iph->version	=	4;
671 	iph->ihl	=	sizeof(struct iphdr) >> 2;
672 	iph->frag_off	=	df;
673 	iph->protocol	=	tnl_params->protocol;
674 	iph->tos	=	ip_tunnel_ecn_encap(tos, inner_iph, skb);
675 	iph->daddr	=	fl4.daddr;
676 	iph->saddr	=	fl4.saddr;
677 	iph->ttl	=	ttl;
678 	tunnel_ip_select_ident(skb, inner_iph, &rt->dst);
679 
680 	iptunnel_xmit(skb, dev);
681 	return;
682 
683 #if IS_ENABLED(CONFIG_IPV6)
684 tx_error_icmp:
685 	dst_link_failure(skb);
686 #endif
687 tx_error:
688 	dev->stats.tx_errors++;
689 	dev_kfree_skb(skb);
690 }
691 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
692 
693 static void ip_tunnel_update(struct ip_tunnel_net *itn,
694 			     struct ip_tunnel *t,
695 			     struct net_device *dev,
696 			     struct ip_tunnel_parm *p,
697 			     bool set_mtu)
698 {
699 	ip_tunnel_del(t);
700 	t->parms.iph.saddr = p->iph.saddr;
701 	t->parms.iph.daddr = p->iph.daddr;
702 	t->parms.i_key = p->i_key;
703 	t->parms.o_key = p->o_key;
704 	if (dev->type != ARPHRD_ETHER) {
705 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
706 		memcpy(dev->broadcast, &p->iph.daddr, 4);
707 	}
708 	ip_tunnel_add(itn, t);
709 
710 	t->parms.iph.ttl = p->iph.ttl;
711 	t->parms.iph.tos = p->iph.tos;
712 	t->parms.iph.frag_off = p->iph.frag_off;
713 
714 	if (t->parms.link != p->link) {
715 		int mtu;
716 
717 		t->parms.link = p->link;
718 		mtu = ip_tunnel_bind_dev(dev);
719 		if (set_mtu)
720 			dev->mtu = mtu;
721 	}
722 	netdev_state_change(dev);
723 }
724 
725 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
726 {
727 	int err = 0;
728 	struct ip_tunnel *t;
729 	struct net *net = dev_net(dev);
730 	struct ip_tunnel *tunnel = netdev_priv(dev);
731 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
732 
733 	BUG_ON(!itn->fb_tunnel_dev);
734 	switch (cmd) {
735 	case SIOCGETTUNNEL:
736 		t = NULL;
737 		if (dev == itn->fb_tunnel_dev)
738 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
739 		if (t == NULL)
740 			t = netdev_priv(dev);
741 		memcpy(p, &t->parms, sizeof(*p));
742 		break;
743 
744 	case SIOCADDTUNNEL:
745 	case SIOCCHGTUNNEL:
746 		err = -EPERM;
747 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
748 			goto done;
749 		if (p->iph.ttl)
750 			p->iph.frag_off |= htons(IP_DF);
751 		if (!(p->i_flags&TUNNEL_KEY))
752 			p->i_key = 0;
753 		if (!(p->o_flags&TUNNEL_KEY))
754 			p->o_key = 0;
755 
756 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
757 
758 		if (!t && (cmd == SIOCADDTUNNEL))
759 			t = ip_tunnel_create(net, itn, p);
760 
761 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
762 			if (t != NULL) {
763 				if (t->dev != dev) {
764 					err = -EEXIST;
765 					break;
766 				}
767 			} else {
768 				unsigned int nflags = 0;
769 
770 				if (ipv4_is_multicast(p->iph.daddr))
771 					nflags = IFF_BROADCAST;
772 				else if (p->iph.daddr)
773 					nflags = IFF_POINTOPOINT;
774 
775 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
776 					err = -EINVAL;
777 					break;
778 				}
779 
780 				t = netdev_priv(dev);
781 			}
782 		}
783 
784 		if (t) {
785 			err = 0;
786 			ip_tunnel_update(itn, t, dev, p, true);
787 		} else
788 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
789 		break;
790 
791 	case SIOCDELTUNNEL:
792 		err = -EPERM;
793 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
794 			goto done;
795 
796 		if (dev == itn->fb_tunnel_dev) {
797 			err = -ENOENT;
798 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
799 			if (t == NULL)
800 				goto done;
801 			err = -EPERM;
802 			if (t == netdev_priv(itn->fb_tunnel_dev))
803 				goto done;
804 			dev = t->dev;
805 		}
806 		unregister_netdevice(dev);
807 		err = 0;
808 		break;
809 
810 	default:
811 		err = -EINVAL;
812 	}
813 
814 done:
815 	return err;
816 }
817 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
818 
819 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
820 {
821 	struct ip_tunnel *tunnel = netdev_priv(dev);
822 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
823 
824 	if (new_mtu < 68 ||
825 	    new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
826 		return -EINVAL;
827 	dev->mtu = new_mtu;
828 	return 0;
829 }
830 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
831 
832 static void ip_tunnel_dev_free(struct net_device *dev)
833 {
834 	struct ip_tunnel *tunnel = netdev_priv(dev);
835 
836 	gro_cells_destroy(&tunnel->gro_cells);
837 	free_percpu(dev->tstats);
838 	free_netdev(dev);
839 }
840 
841 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
842 {
843 	struct net *net = dev_net(dev);
844 	struct ip_tunnel *tunnel = netdev_priv(dev);
845 	struct ip_tunnel_net *itn;
846 
847 	itn = net_generic(net, tunnel->ip_tnl_net_id);
848 
849 	if (itn->fb_tunnel_dev != dev) {
850 		ip_tunnel_del(netdev_priv(dev));
851 		unregister_netdevice_queue(dev, head);
852 	}
853 }
854 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
855 
856 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
857 				  struct rtnl_link_ops *ops, char *devname)
858 {
859 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
860 	struct ip_tunnel_parm parms;
861 
862 	itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
863 	if (!itn->tunnels)
864 		return -ENOMEM;
865 
866 	if (!ops) {
867 		itn->fb_tunnel_dev = NULL;
868 		return 0;
869 	}
870 	memset(&parms, 0, sizeof(parms));
871 	if (devname)
872 		strlcpy(parms.name, devname, IFNAMSIZ);
873 
874 	rtnl_lock();
875 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
876 	rtnl_unlock();
877 	if (IS_ERR(itn->fb_tunnel_dev)) {
878 		kfree(itn->tunnels);
879 		return PTR_ERR(itn->fb_tunnel_dev);
880 	}
881 
882 	return 0;
883 }
884 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
885 
886 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
887 {
888 	int h;
889 
890 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
891 		struct ip_tunnel *t;
892 		struct hlist_node *n;
893 		struct hlist_head *thead = &itn->tunnels[h];
894 
895 		hlist_for_each_entry_safe(t, n, thead, hash_node)
896 			unregister_netdevice_queue(t->dev, head);
897 	}
898 	if (itn->fb_tunnel_dev)
899 		unregister_netdevice_queue(itn->fb_tunnel_dev, head);
900 }
901 
902 void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
903 {
904 	LIST_HEAD(list);
905 
906 	rtnl_lock();
907 	ip_tunnel_destroy(itn, &list);
908 	unregister_netdevice_many(&list);
909 	rtnl_unlock();
910 	kfree(itn->tunnels);
911 }
912 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
913 
914 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
915 		      struct ip_tunnel_parm *p)
916 {
917 	struct ip_tunnel *nt;
918 	struct net *net = dev_net(dev);
919 	struct ip_tunnel_net *itn;
920 	int mtu;
921 	int err;
922 
923 	nt = netdev_priv(dev);
924 	itn = net_generic(net, nt->ip_tnl_net_id);
925 
926 	if (ip_tunnel_find(itn, p, dev->type))
927 		return -EEXIST;
928 
929 	nt->parms = *p;
930 	err = register_netdevice(dev);
931 	if (err)
932 		goto out;
933 
934 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
935 		eth_hw_addr_random(dev);
936 
937 	mtu = ip_tunnel_bind_dev(dev);
938 	if (!tb[IFLA_MTU])
939 		dev->mtu = mtu;
940 
941 	ip_tunnel_add(itn, nt);
942 
943 out:
944 	return err;
945 }
946 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
947 
948 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
949 			 struct ip_tunnel_parm *p)
950 {
951 	struct ip_tunnel *t, *nt;
952 	struct net *net = dev_net(dev);
953 	struct ip_tunnel *tunnel = netdev_priv(dev);
954 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
955 
956 	if (dev == itn->fb_tunnel_dev)
957 		return -EINVAL;
958 
959 	nt = netdev_priv(dev);
960 
961 	t = ip_tunnel_find(itn, p, dev->type);
962 
963 	if (t) {
964 		if (t->dev != dev)
965 			return -EEXIST;
966 	} else {
967 		t = nt;
968 
969 		if (dev->type != ARPHRD_ETHER) {
970 			unsigned int nflags = 0;
971 
972 			if (ipv4_is_multicast(p->iph.daddr))
973 				nflags = IFF_BROADCAST;
974 			else if (p->iph.daddr)
975 				nflags = IFF_POINTOPOINT;
976 
977 			if ((dev->flags ^ nflags) &
978 			    (IFF_POINTOPOINT | IFF_BROADCAST))
979 				return -EINVAL;
980 		}
981 	}
982 
983 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
984 	return 0;
985 }
986 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
987 
988 int ip_tunnel_init(struct net_device *dev)
989 {
990 	struct ip_tunnel *tunnel = netdev_priv(dev);
991 	struct iphdr *iph = &tunnel->parms.iph;
992 	int err;
993 
994 	dev->destructor	= ip_tunnel_dev_free;
995 	dev->tstats = alloc_percpu(struct pcpu_tstats);
996 	if (!dev->tstats)
997 		return -ENOMEM;
998 
999 	err = gro_cells_init(&tunnel->gro_cells, dev);
1000 	if (err) {
1001 		free_percpu(dev->tstats);
1002 		return err;
1003 	}
1004 
1005 	tunnel->dev = dev;
1006 	strcpy(tunnel->parms.name, dev->name);
1007 	iph->version		= 4;
1008 	iph->ihl		= 5;
1009 
1010 	return 0;
1011 }
1012 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1013 
1014 void ip_tunnel_uninit(struct net_device *dev)
1015 {
1016 	struct net *net = dev_net(dev);
1017 	struct ip_tunnel *tunnel = netdev_priv(dev);
1018 	struct ip_tunnel_net *itn;
1019 
1020 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1021 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1022 	if (itn->fb_tunnel_dev != dev)
1023 		ip_tunnel_del(netdev_priv(dev));
1024 }
1025 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1026 
1027 /* Do least required initialization, rest of init is done in tunnel_init call */
1028 void ip_tunnel_setup(struct net_device *dev, int net_id)
1029 {
1030 	struct ip_tunnel *tunnel = netdev_priv(dev);
1031 	tunnel->ip_tnl_net_id = net_id;
1032 }
1033 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1034 
1035 MODULE_LICENSE("GPL");
1036