xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 4da722ca)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59 
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65 
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68 	return hash_32((__force u32)key ^ (__force u32)remote,
69 			 IP_TNL_HASH_BITS);
70 }
71 
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73 				__be16 flags, __be32 key)
74 {
75 	if (p->i_flags & TUNNEL_KEY) {
76 		if (flags & TUNNEL_KEY)
77 			return key == p->i_key;
78 		else
79 			/* key expected, none present */
80 			return false;
81 	} else
82 		return !(flags & TUNNEL_KEY);
83 }
84 
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
96 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
97 				   int link, __be16 flags,
98 				   __be32 remote, __be32 local,
99 				   __be32 key)
100 {
101 	unsigned int hash;
102 	struct ip_tunnel *t, *cand = NULL;
103 	struct hlist_head *head;
104 
105 	hash = ip_tunnel_hash(key, remote);
106 	head = &itn->tunnels[hash];
107 
108 	hlist_for_each_entry_rcu(t, head, hash_node) {
109 		if (local != t->parms.iph.saddr ||
110 		    remote != t->parms.iph.daddr ||
111 		    !(t->dev->flags & IFF_UP))
112 			continue;
113 
114 		if (!ip_tunnel_key_match(&t->parms, flags, key))
115 			continue;
116 
117 		if (t->parms.link == link)
118 			return t;
119 		else
120 			cand = t;
121 	}
122 
123 	hlist_for_each_entry_rcu(t, head, hash_node) {
124 		if (remote != t->parms.iph.daddr ||
125 		    t->parms.iph.saddr != 0 ||
126 		    !(t->dev->flags & IFF_UP))
127 			continue;
128 
129 		if (!ip_tunnel_key_match(&t->parms, flags, key))
130 			continue;
131 
132 		if (t->parms.link == link)
133 			return t;
134 		else if (!cand)
135 			cand = t;
136 	}
137 
138 	hash = ip_tunnel_hash(key, 0);
139 	head = &itn->tunnels[hash];
140 
141 	hlist_for_each_entry_rcu(t, head, hash_node) {
142 		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
143 		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
144 			continue;
145 
146 		if (!(t->dev->flags & IFF_UP))
147 			continue;
148 
149 		if (!ip_tunnel_key_match(&t->parms, flags, key))
150 			continue;
151 
152 		if (t->parms.link == link)
153 			return t;
154 		else if (!cand)
155 			cand = t;
156 	}
157 
158 	if (flags & TUNNEL_NO_KEY)
159 		goto skip_key_lookup;
160 
161 	hlist_for_each_entry_rcu(t, head, hash_node) {
162 		if (t->parms.i_key != key ||
163 		    t->parms.iph.saddr != 0 ||
164 		    t->parms.iph.daddr != 0 ||
165 		    !(t->dev->flags & IFF_UP))
166 			continue;
167 
168 		if (t->parms.link == link)
169 			return t;
170 		else if (!cand)
171 			cand = t;
172 	}
173 
174 skip_key_lookup:
175 	if (cand)
176 		return cand;
177 
178 	t = rcu_dereference(itn->collect_md_tun);
179 	if (t)
180 		return t;
181 
182 	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
183 		return netdev_priv(itn->fb_tunnel_dev);
184 
185 	return NULL;
186 }
187 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188 
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190 				    struct ip_tunnel_parm *parms)
191 {
192 	unsigned int h;
193 	__be32 remote;
194 	__be32 i_key = parms->i_key;
195 
196 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197 		remote = parms->iph.daddr;
198 	else
199 		remote = 0;
200 
201 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202 		i_key = 0;
203 
204 	h = ip_tunnel_hash(i_key, remote);
205 	return &itn->tunnels[h];
206 }
207 
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210 	struct hlist_head *head = ip_bucket(itn, &t->parms);
211 
212 	if (t->collect_md)
213 		rcu_assign_pointer(itn->collect_md_tun, t);
214 	hlist_add_head_rcu(&t->hash_node, head);
215 }
216 
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219 	if (t->collect_md)
220 		rcu_assign_pointer(itn->collect_md_tun, NULL);
221 	hlist_del_init_rcu(&t->hash_node);
222 }
223 
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225 					struct ip_tunnel_parm *parms,
226 					int type)
227 {
228 	__be32 remote = parms->iph.daddr;
229 	__be32 local = parms->iph.saddr;
230 	__be32 key = parms->i_key;
231 	__be16 flags = parms->i_flags;
232 	int link = parms->link;
233 	struct ip_tunnel *t = NULL;
234 	struct hlist_head *head = ip_bucket(itn, parms);
235 
236 	hlist_for_each_entry_rcu(t, head, hash_node) {
237 		if (local == t->parms.iph.saddr &&
238 		    remote == t->parms.iph.daddr &&
239 		    link == t->parms.link &&
240 		    type == t->dev->type &&
241 		    ip_tunnel_key_match(&t->parms, flags, key))
242 			break;
243 	}
244 	return t;
245 }
246 
247 static struct net_device *__ip_tunnel_create(struct net *net,
248 					     const struct rtnl_link_ops *ops,
249 					     struct ip_tunnel_parm *parms)
250 {
251 	int err;
252 	struct ip_tunnel *tunnel;
253 	struct net_device *dev;
254 	char name[IFNAMSIZ];
255 
256 	if (parms->name[0])
257 		strlcpy(name, parms->name, IFNAMSIZ);
258 	else {
259 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
260 			err = -E2BIG;
261 			goto failed;
262 		}
263 		strlcpy(name, ops->kind, IFNAMSIZ);
264 		strncat(name, "%d", 2);
265 	}
266 
267 	ASSERT_RTNL();
268 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
269 	if (!dev) {
270 		err = -ENOMEM;
271 		goto failed;
272 	}
273 	dev_net_set(dev, net);
274 
275 	dev->rtnl_link_ops = ops;
276 
277 	tunnel = netdev_priv(dev);
278 	tunnel->parms = *parms;
279 	tunnel->net = net;
280 
281 	err = register_netdevice(dev);
282 	if (err)
283 		goto failed_free;
284 
285 	return dev;
286 
287 failed_free:
288 	free_netdev(dev);
289 failed:
290 	return ERR_PTR(err);
291 }
292 
293 static inline void init_tunnel_flow(struct flowi4 *fl4,
294 				    int proto,
295 				    __be32 daddr, __be32 saddr,
296 				    __be32 key, __u8 tos, int oif,
297 				    __u32 mark)
298 {
299 	memset(fl4, 0, sizeof(*fl4));
300 	fl4->flowi4_oif = oif;
301 	fl4->daddr = daddr;
302 	fl4->saddr = saddr;
303 	fl4->flowi4_tos = tos;
304 	fl4->flowi4_proto = proto;
305 	fl4->fl4_gre_key = key;
306 	fl4->flowi4_mark = mark;
307 }
308 
309 static int ip_tunnel_bind_dev(struct net_device *dev)
310 {
311 	struct net_device *tdev = NULL;
312 	struct ip_tunnel *tunnel = netdev_priv(dev);
313 	const struct iphdr *iph;
314 	int hlen = LL_MAX_HEADER;
315 	int mtu = ETH_DATA_LEN;
316 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
317 
318 	iph = &tunnel->parms.iph;
319 
320 	/* Guess output device to choose reasonable mtu and needed_headroom */
321 	if (iph->daddr) {
322 		struct flowi4 fl4;
323 		struct rtable *rt;
324 
325 		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
326 				 iph->saddr, tunnel->parms.o_key,
327 				 RT_TOS(iph->tos), tunnel->parms.link,
328 				 tunnel->fwmark);
329 		rt = ip_route_output_key(tunnel->net, &fl4);
330 
331 		if (!IS_ERR(rt)) {
332 			tdev = rt->dst.dev;
333 			ip_rt_put(rt);
334 		}
335 		if (dev->type != ARPHRD_ETHER)
336 			dev->flags |= IFF_POINTOPOINT;
337 
338 		dst_cache_reset(&tunnel->dst_cache);
339 	}
340 
341 	if (!tdev && tunnel->parms.link)
342 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
343 
344 	if (tdev) {
345 		hlen = tdev->hard_header_len + tdev->needed_headroom;
346 		mtu = tdev->mtu;
347 	}
348 
349 	dev->needed_headroom = t_hlen + hlen;
350 	mtu -= (dev->hard_header_len + t_hlen);
351 
352 	if (mtu < 68)
353 		mtu = 68;
354 
355 	return mtu;
356 }
357 
358 static struct ip_tunnel *ip_tunnel_create(struct net *net,
359 					  struct ip_tunnel_net *itn,
360 					  struct ip_tunnel_parm *parms)
361 {
362 	struct ip_tunnel *nt;
363 	struct net_device *dev;
364 	int t_hlen;
365 
366 	BUG_ON(!itn->fb_tunnel_dev);
367 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
368 	if (IS_ERR(dev))
369 		return ERR_CAST(dev);
370 
371 	dev->mtu = ip_tunnel_bind_dev(dev);
372 
373 	nt = netdev_priv(dev);
374 	t_hlen = nt->hlen + sizeof(struct iphdr);
375 	dev->min_mtu = ETH_MIN_MTU;
376 	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
377 	ip_tunnel_add(itn, nt);
378 	return nt;
379 }
380 
381 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
382 		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
383 		  bool log_ecn_error)
384 {
385 	struct pcpu_sw_netstats *tstats;
386 	const struct iphdr *iph = ip_hdr(skb);
387 	int err;
388 
389 #ifdef CONFIG_NET_IPGRE_BROADCAST
390 	if (ipv4_is_multicast(iph->daddr)) {
391 		tunnel->dev->stats.multicast++;
392 		skb->pkt_type = PACKET_BROADCAST;
393 	}
394 #endif
395 
396 	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
397 	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
398 		tunnel->dev->stats.rx_crc_errors++;
399 		tunnel->dev->stats.rx_errors++;
400 		goto drop;
401 	}
402 
403 	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
404 		if (!(tpi->flags&TUNNEL_SEQ) ||
405 		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
406 			tunnel->dev->stats.rx_fifo_errors++;
407 			tunnel->dev->stats.rx_errors++;
408 			goto drop;
409 		}
410 		tunnel->i_seqno = ntohl(tpi->seq) + 1;
411 	}
412 
413 	skb_reset_network_header(skb);
414 
415 	err = IP_ECN_decapsulate(iph, skb);
416 	if (unlikely(err)) {
417 		if (log_ecn_error)
418 			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
419 					&iph->saddr, iph->tos);
420 		if (err > 1) {
421 			++tunnel->dev->stats.rx_frame_errors;
422 			++tunnel->dev->stats.rx_errors;
423 			goto drop;
424 		}
425 	}
426 
427 	tstats = this_cpu_ptr(tunnel->dev->tstats);
428 	u64_stats_update_begin(&tstats->syncp);
429 	tstats->rx_packets++;
430 	tstats->rx_bytes += skb->len;
431 	u64_stats_update_end(&tstats->syncp);
432 
433 	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
434 
435 	if (tunnel->dev->type == ARPHRD_ETHER) {
436 		skb->protocol = eth_type_trans(skb, tunnel->dev);
437 		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
438 	} else {
439 		skb->dev = tunnel->dev;
440 	}
441 
442 	if (tun_dst)
443 		skb_dst_set(skb, (struct dst_entry *)tun_dst);
444 
445 	gro_cells_receive(&tunnel->gro_cells, skb);
446 	return 0;
447 
448 drop:
449 	if (tun_dst)
450 		dst_release((struct dst_entry *)tun_dst);
451 	kfree_skb(skb);
452 	return 0;
453 }
454 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
455 
456 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
457 			    unsigned int num)
458 {
459 	if (num >= MAX_IPTUN_ENCAP_OPS)
460 		return -ERANGE;
461 
462 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
463 			&iptun_encaps[num],
464 			NULL, ops) ? 0 : -1;
465 }
466 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
467 
468 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
469 			    unsigned int num)
470 {
471 	int ret;
472 
473 	if (num >= MAX_IPTUN_ENCAP_OPS)
474 		return -ERANGE;
475 
476 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
477 		       &iptun_encaps[num],
478 		       ops, NULL) == ops) ? 0 : -1;
479 
480 	synchronize_net();
481 
482 	return ret;
483 }
484 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
485 
486 int ip_tunnel_encap_setup(struct ip_tunnel *t,
487 			  struct ip_tunnel_encap *ipencap)
488 {
489 	int hlen;
490 
491 	memset(&t->encap, 0, sizeof(t->encap));
492 
493 	hlen = ip_encap_hlen(ipencap);
494 	if (hlen < 0)
495 		return hlen;
496 
497 	t->encap.type = ipencap->type;
498 	t->encap.sport = ipencap->sport;
499 	t->encap.dport = ipencap->dport;
500 	t->encap.flags = ipencap->flags;
501 
502 	t->encap_hlen = hlen;
503 	t->hlen = t->encap_hlen + t->tun_hlen;
504 
505 	return 0;
506 }
507 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
508 
509 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
510 			    struct rtable *rt, __be16 df,
511 			    const struct iphdr *inner_iph)
512 {
513 	struct ip_tunnel *tunnel = netdev_priv(dev);
514 	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
515 	int mtu;
516 
517 	if (df)
518 		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
519 					- sizeof(struct iphdr) - tunnel->hlen;
520 	else
521 		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
522 
523 	if (skb_dst(skb))
524 		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
525 
526 	if (skb->protocol == htons(ETH_P_IP)) {
527 		if (!skb_is_gso(skb) &&
528 		    (inner_iph->frag_off & htons(IP_DF)) &&
529 		    mtu < pkt_size) {
530 			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
531 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
532 			return -E2BIG;
533 		}
534 	}
535 #if IS_ENABLED(CONFIG_IPV6)
536 	else if (skb->protocol == htons(ETH_P_IPV6)) {
537 		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
538 
539 		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
540 			   mtu >= IPV6_MIN_MTU) {
541 			if ((tunnel->parms.iph.daddr &&
542 			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
543 			    rt6->rt6i_dst.plen == 128) {
544 				rt6->rt6i_flags |= RTF_MODIFIED;
545 				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
546 			}
547 		}
548 
549 		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
550 					mtu < pkt_size) {
551 			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
552 			return -E2BIG;
553 		}
554 	}
555 #endif
556 	return 0;
557 }
558 
559 void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
560 {
561 	struct ip_tunnel *tunnel = netdev_priv(dev);
562 	u32 headroom = sizeof(struct iphdr);
563 	struct ip_tunnel_info *tun_info;
564 	const struct ip_tunnel_key *key;
565 	const struct iphdr *inner_iph;
566 	struct rtable *rt;
567 	struct flowi4 fl4;
568 	__be16 df = 0;
569 	u8 tos, ttl;
570 
571 	tun_info = skb_tunnel_info(skb);
572 	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
573 		     ip_tunnel_info_af(tun_info) != AF_INET))
574 		goto tx_error;
575 	key = &tun_info->key;
576 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
577 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
578 	tos = key->tos;
579 	if (tos == 1) {
580 		if (skb->protocol == htons(ETH_P_IP))
581 			tos = inner_iph->tos;
582 		else if (skb->protocol == htons(ETH_P_IPV6))
583 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
584 	}
585 	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
586 			 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
587 	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
588 		goto tx_error;
589 	rt = ip_route_output_key(tunnel->net, &fl4);
590 	if (IS_ERR(rt)) {
591 		dev->stats.tx_carrier_errors++;
592 		goto tx_error;
593 	}
594 	if (rt->dst.dev == dev) {
595 		ip_rt_put(rt);
596 		dev->stats.collisions++;
597 		goto tx_error;
598 	}
599 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
600 	ttl = key->ttl;
601 	if (ttl == 0) {
602 		if (skb->protocol == htons(ETH_P_IP))
603 			ttl = inner_iph->ttl;
604 		else if (skb->protocol == htons(ETH_P_IPV6))
605 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
606 		else
607 			ttl = ip4_dst_hoplimit(&rt->dst);
608 	}
609 	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
610 		df = htons(IP_DF);
611 	else if (skb->protocol == htons(ETH_P_IP))
612 		df = inner_iph->frag_off & htons(IP_DF);
613 	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
614 	if (headroom > dev->needed_headroom)
615 		dev->needed_headroom = headroom;
616 
617 	if (skb_cow_head(skb, dev->needed_headroom)) {
618 		ip_rt_put(rt);
619 		goto tx_dropped;
620 	}
621 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
622 		      key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
623 	return;
624 tx_error:
625 	dev->stats.tx_errors++;
626 	goto kfree;
627 tx_dropped:
628 	dev->stats.tx_dropped++;
629 kfree:
630 	kfree_skb(skb);
631 }
632 EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
633 
634 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
635 		    const struct iphdr *tnl_params, u8 protocol)
636 {
637 	struct ip_tunnel *tunnel = netdev_priv(dev);
638 	const struct iphdr *inner_iph;
639 	struct flowi4 fl4;
640 	u8     tos, ttl;
641 	__be16 df;
642 	struct rtable *rt;		/* Route to the other host */
643 	unsigned int max_headroom;	/* The extra header space needed */
644 	__be32 dst;
645 	bool connected;
646 
647 	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
648 	connected = (tunnel->parms.iph.daddr != 0);
649 
650 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
651 
652 	dst = tnl_params->daddr;
653 	if (dst == 0) {
654 		/* NBMA tunnel */
655 
656 		if (!skb_dst(skb)) {
657 			dev->stats.tx_fifo_errors++;
658 			goto tx_error;
659 		}
660 
661 		if (skb->protocol == htons(ETH_P_IP)) {
662 			rt = skb_rtable(skb);
663 			dst = rt_nexthop(rt, inner_iph->daddr);
664 		}
665 #if IS_ENABLED(CONFIG_IPV6)
666 		else if (skb->protocol == htons(ETH_P_IPV6)) {
667 			const struct in6_addr *addr6;
668 			struct neighbour *neigh;
669 			bool do_tx_error_icmp;
670 			int addr_type;
671 
672 			neigh = dst_neigh_lookup(skb_dst(skb),
673 						 &ipv6_hdr(skb)->daddr);
674 			if (!neigh)
675 				goto tx_error;
676 
677 			addr6 = (const struct in6_addr *)&neigh->primary_key;
678 			addr_type = ipv6_addr_type(addr6);
679 
680 			if (addr_type == IPV6_ADDR_ANY) {
681 				addr6 = &ipv6_hdr(skb)->daddr;
682 				addr_type = ipv6_addr_type(addr6);
683 			}
684 
685 			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
686 				do_tx_error_icmp = true;
687 			else {
688 				do_tx_error_icmp = false;
689 				dst = addr6->s6_addr32[3];
690 			}
691 			neigh_release(neigh);
692 			if (do_tx_error_icmp)
693 				goto tx_error_icmp;
694 		}
695 #endif
696 		else
697 			goto tx_error;
698 
699 		connected = false;
700 	}
701 
702 	tos = tnl_params->tos;
703 	if (tos & 0x1) {
704 		tos &= ~0x1;
705 		if (skb->protocol == htons(ETH_P_IP)) {
706 			tos = inner_iph->tos;
707 			connected = false;
708 		} else if (skb->protocol == htons(ETH_P_IPV6)) {
709 			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
710 			connected = false;
711 		}
712 	}
713 
714 	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
715 			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
716 			 tunnel->fwmark);
717 
718 	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
719 		goto tx_error;
720 
721 	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
722 			 NULL;
723 
724 	if (!rt) {
725 		rt = ip_route_output_key(tunnel->net, &fl4);
726 
727 		if (IS_ERR(rt)) {
728 			dev->stats.tx_carrier_errors++;
729 			goto tx_error;
730 		}
731 		if (connected)
732 			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
733 					  fl4.saddr);
734 	}
735 
736 	if (rt->dst.dev == dev) {
737 		ip_rt_put(rt);
738 		dev->stats.collisions++;
739 		goto tx_error;
740 	}
741 
742 	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
743 		ip_rt_put(rt);
744 		goto tx_error;
745 	}
746 
747 	if (tunnel->err_count > 0) {
748 		if (time_before(jiffies,
749 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
750 			tunnel->err_count--;
751 
752 			dst_link_failure(skb);
753 		} else
754 			tunnel->err_count = 0;
755 	}
756 
757 	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
758 	ttl = tnl_params->ttl;
759 	if (ttl == 0) {
760 		if (skb->protocol == htons(ETH_P_IP))
761 			ttl = inner_iph->ttl;
762 #if IS_ENABLED(CONFIG_IPV6)
763 		else if (skb->protocol == htons(ETH_P_IPV6))
764 			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
765 #endif
766 		else
767 			ttl = ip4_dst_hoplimit(&rt->dst);
768 	}
769 
770 	df = tnl_params->frag_off;
771 	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
772 		df |= (inner_iph->frag_off&htons(IP_DF));
773 
774 	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
775 			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
776 	if (max_headroom > dev->needed_headroom)
777 		dev->needed_headroom = max_headroom;
778 
779 	if (skb_cow_head(skb, dev->needed_headroom)) {
780 		ip_rt_put(rt);
781 		dev->stats.tx_dropped++;
782 		kfree_skb(skb);
783 		return;
784 	}
785 
786 	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
787 		      df, !net_eq(tunnel->net, dev_net(dev)));
788 	return;
789 
790 #if IS_ENABLED(CONFIG_IPV6)
791 tx_error_icmp:
792 	dst_link_failure(skb);
793 #endif
794 tx_error:
795 	dev->stats.tx_errors++;
796 	kfree_skb(skb);
797 }
798 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
799 
800 static void ip_tunnel_update(struct ip_tunnel_net *itn,
801 			     struct ip_tunnel *t,
802 			     struct net_device *dev,
803 			     struct ip_tunnel_parm *p,
804 			     bool set_mtu,
805 			     __u32 fwmark)
806 {
807 	ip_tunnel_del(itn, t);
808 	t->parms.iph.saddr = p->iph.saddr;
809 	t->parms.iph.daddr = p->iph.daddr;
810 	t->parms.i_key = p->i_key;
811 	t->parms.o_key = p->o_key;
812 	if (dev->type != ARPHRD_ETHER) {
813 		memcpy(dev->dev_addr, &p->iph.saddr, 4);
814 		memcpy(dev->broadcast, &p->iph.daddr, 4);
815 	}
816 	ip_tunnel_add(itn, t);
817 
818 	t->parms.iph.ttl = p->iph.ttl;
819 	t->parms.iph.tos = p->iph.tos;
820 	t->parms.iph.frag_off = p->iph.frag_off;
821 
822 	if (t->parms.link != p->link || t->fwmark != fwmark) {
823 		int mtu;
824 
825 		t->parms.link = p->link;
826 		t->fwmark = fwmark;
827 		mtu = ip_tunnel_bind_dev(dev);
828 		if (set_mtu)
829 			dev->mtu = mtu;
830 	}
831 	dst_cache_reset(&t->dst_cache);
832 	netdev_state_change(dev);
833 }
834 
835 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
836 {
837 	int err = 0;
838 	struct ip_tunnel *t = netdev_priv(dev);
839 	struct net *net = t->net;
840 	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
841 
842 	BUG_ON(!itn->fb_tunnel_dev);
843 	switch (cmd) {
844 	case SIOCGETTUNNEL:
845 		if (dev == itn->fb_tunnel_dev) {
846 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
847 			if (!t)
848 				t = netdev_priv(dev);
849 		}
850 		memcpy(p, &t->parms, sizeof(*p));
851 		break;
852 
853 	case SIOCADDTUNNEL:
854 	case SIOCCHGTUNNEL:
855 		err = -EPERM;
856 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
857 			goto done;
858 		if (p->iph.ttl)
859 			p->iph.frag_off |= htons(IP_DF);
860 		if (!(p->i_flags & VTI_ISVTI)) {
861 			if (!(p->i_flags & TUNNEL_KEY))
862 				p->i_key = 0;
863 			if (!(p->o_flags & TUNNEL_KEY))
864 				p->o_key = 0;
865 		}
866 
867 		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
868 
869 		if (cmd == SIOCADDTUNNEL) {
870 			if (!t) {
871 				t = ip_tunnel_create(net, itn, p);
872 				err = PTR_ERR_OR_ZERO(t);
873 				break;
874 			}
875 
876 			err = -EEXIST;
877 			break;
878 		}
879 		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
880 			if (t) {
881 				if (t->dev != dev) {
882 					err = -EEXIST;
883 					break;
884 				}
885 			} else {
886 				unsigned int nflags = 0;
887 
888 				if (ipv4_is_multicast(p->iph.daddr))
889 					nflags = IFF_BROADCAST;
890 				else if (p->iph.daddr)
891 					nflags = IFF_POINTOPOINT;
892 
893 				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
894 					err = -EINVAL;
895 					break;
896 				}
897 
898 				t = netdev_priv(dev);
899 			}
900 		}
901 
902 		if (t) {
903 			err = 0;
904 			ip_tunnel_update(itn, t, dev, p, true, 0);
905 		} else {
906 			err = -ENOENT;
907 		}
908 		break;
909 
910 	case SIOCDELTUNNEL:
911 		err = -EPERM;
912 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
913 			goto done;
914 
915 		if (dev == itn->fb_tunnel_dev) {
916 			err = -ENOENT;
917 			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
918 			if (!t)
919 				goto done;
920 			err = -EPERM;
921 			if (t == netdev_priv(itn->fb_tunnel_dev))
922 				goto done;
923 			dev = t->dev;
924 		}
925 		unregister_netdevice(dev);
926 		err = 0;
927 		break;
928 
929 	default:
930 		err = -EINVAL;
931 	}
932 
933 done:
934 	return err;
935 }
936 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
937 
938 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
939 {
940 	struct ip_tunnel *tunnel = netdev_priv(dev);
941 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
942 	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
943 
944 	if (new_mtu < ETH_MIN_MTU)
945 		return -EINVAL;
946 
947 	if (new_mtu > max_mtu) {
948 		if (strict)
949 			return -EINVAL;
950 
951 		new_mtu = max_mtu;
952 	}
953 
954 	dev->mtu = new_mtu;
955 	return 0;
956 }
957 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
958 
959 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
960 {
961 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
962 }
963 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
964 
965 static void ip_tunnel_dev_free(struct net_device *dev)
966 {
967 	struct ip_tunnel *tunnel = netdev_priv(dev);
968 
969 	gro_cells_destroy(&tunnel->gro_cells);
970 	dst_cache_destroy(&tunnel->dst_cache);
971 	free_percpu(dev->tstats);
972 }
973 
974 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
975 {
976 	struct ip_tunnel *tunnel = netdev_priv(dev);
977 	struct ip_tunnel_net *itn;
978 
979 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
980 
981 	if (itn->fb_tunnel_dev != dev) {
982 		ip_tunnel_del(itn, netdev_priv(dev));
983 		unregister_netdevice_queue(dev, head);
984 	}
985 }
986 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
987 
988 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
989 {
990 	struct ip_tunnel *tunnel = netdev_priv(dev);
991 
992 	return tunnel->net;
993 }
994 EXPORT_SYMBOL(ip_tunnel_get_link_net);
995 
996 int ip_tunnel_get_iflink(const struct net_device *dev)
997 {
998 	struct ip_tunnel *tunnel = netdev_priv(dev);
999 
1000 	return tunnel->parms.link;
1001 }
1002 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1003 
1004 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1005 				  struct rtnl_link_ops *ops, char *devname)
1006 {
1007 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1008 	struct ip_tunnel_parm parms;
1009 	unsigned int i;
1010 
1011 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1012 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1013 
1014 	if (!ops) {
1015 		itn->fb_tunnel_dev = NULL;
1016 		return 0;
1017 	}
1018 
1019 	memset(&parms, 0, sizeof(parms));
1020 	if (devname)
1021 		strlcpy(parms.name, devname, IFNAMSIZ);
1022 
1023 	rtnl_lock();
1024 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1025 	/* FB netdevice is special: we have one, and only one per netns.
1026 	 * Allowing to move it to another netns is clearly unsafe.
1027 	 */
1028 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1029 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1030 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1031 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1032 	}
1033 	rtnl_unlock();
1034 
1035 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1036 }
1037 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1038 
1039 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1040 			      struct rtnl_link_ops *ops)
1041 {
1042 	struct net *net = dev_net(itn->fb_tunnel_dev);
1043 	struct net_device *dev, *aux;
1044 	int h;
1045 
1046 	for_each_netdev_safe(net, dev, aux)
1047 		if (dev->rtnl_link_ops == ops)
1048 			unregister_netdevice_queue(dev, head);
1049 
1050 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1051 		struct ip_tunnel *t;
1052 		struct hlist_node *n;
1053 		struct hlist_head *thead = &itn->tunnels[h];
1054 
1055 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1056 			/* If dev is in the same netns, it has already
1057 			 * been added to the list by the previous loop.
1058 			 */
1059 			if (!net_eq(dev_net(t->dev), net))
1060 				unregister_netdevice_queue(t->dev, head);
1061 	}
1062 }
1063 
1064 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1065 {
1066 	LIST_HEAD(list);
1067 
1068 	rtnl_lock();
1069 	ip_tunnel_destroy(itn, &list, ops);
1070 	unregister_netdevice_many(&list);
1071 	rtnl_unlock();
1072 }
1073 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1074 
1075 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1076 		      struct ip_tunnel_parm *p, __u32 fwmark)
1077 {
1078 	struct ip_tunnel *nt;
1079 	struct net *net = dev_net(dev);
1080 	struct ip_tunnel_net *itn;
1081 	int mtu;
1082 	int err;
1083 
1084 	nt = netdev_priv(dev);
1085 	itn = net_generic(net, nt->ip_tnl_net_id);
1086 
1087 	if (nt->collect_md) {
1088 		if (rtnl_dereference(itn->collect_md_tun))
1089 			return -EEXIST;
1090 	} else {
1091 		if (ip_tunnel_find(itn, p, dev->type))
1092 			return -EEXIST;
1093 	}
1094 
1095 	nt->net = net;
1096 	nt->parms = *p;
1097 	nt->fwmark = fwmark;
1098 	err = register_netdevice(dev);
1099 	if (err)
1100 		goto out;
1101 
1102 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1103 		eth_hw_addr_random(dev);
1104 
1105 	mtu = ip_tunnel_bind_dev(dev);
1106 	if (!tb[IFLA_MTU])
1107 		dev->mtu = mtu;
1108 
1109 	ip_tunnel_add(itn, nt);
1110 out:
1111 	return err;
1112 }
1113 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1114 
1115 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1116 			 struct ip_tunnel_parm *p, __u32 fwmark)
1117 {
1118 	struct ip_tunnel *t;
1119 	struct ip_tunnel *tunnel = netdev_priv(dev);
1120 	struct net *net = tunnel->net;
1121 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1122 
1123 	if (dev == itn->fb_tunnel_dev)
1124 		return -EINVAL;
1125 
1126 	t = ip_tunnel_find(itn, p, dev->type);
1127 
1128 	if (t) {
1129 		if (t->dev != dev)
1130 			return -EEXIST;
1131 	} else {
1132 		t = tunnel;
1133 
1134 		if (dev->type != ARPHRD_ETHER) {
1135 			unsigned int nflags = 0;
1136 
1137 			if (ipv4_is_multicast(p->iph.daddr))
1138 				nflags = IFF_BROADCAST;
1139 			else if (p->iph.daddr)
1140 				nflags = IFF_POINTOPOINT;
1141 
1142 			if ((dev->flags ^ nflags) &
1143 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1144 				return -EINVAL;
1145 		}
1146 	}
1147 
1148 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1149 	return 0;
1150 }
1151 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1152 
1153 int ip_tunnel_init(struct net_device *dev)
1154 {
1155 	struct ip_tunnel *tunnel = netdev_priv(dev);
1156 	struct iphdr *iph = &tunnel->parms.iph;
1157 	int err;
1158 
1159 	dev->needs_free_netdev = true;
1160 	dev->priv_destructor = ip_tunnel_dev_free;
1161 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1162 	if (!dev->tstats)
1163 		return -ENOMEM;
1164 
1165 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1166 	if (err) {
1167 		free_percpu(dev->tstats);
1168 		return err;
1169 	}
1170 
1171 	err = gro_cells_init(&tunnel->gro_cells, dev);
1172 	if (err) {
1173 		dst_cache_destroy(&tunnel->dst_cache);
1174 		free_percpu(dev->tstats);
1175 		return err;
1176 	}
1177 
1178 	tunnel->dev = dev;
1179 	tunnel->net = dev_net(dev);
1180 	strcpy(tunnel->parms.name, dev->name);
1181 	iph->version		= 4;
1182 	iph->ihl		= 5;
1183 
1184 	if (tunnel->collect_md) {
1185 		dev->features |= NETIF_F_NETNS_LOCAL;
1186 		netif_keep_dst(dev);
1187 	}
1188 	return 0;
1189 }
1190 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1191 
1192 void ip_tunnel_uninit(struct net_device *dev)
1193 {
1194 	struct ip_tunnel *tunnel = netdev_priv(dev);
1195 	struct net *net = tunnel->net;
1196 	struct ip_tunnel_net *itn;
1197 
1198 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1199 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1200 	if (itn->fb_tunnel_dev != dev)
1201 		ip_tunnel_del(itn, netdev_priv(dev));
1202 
1203 	dst_cache_reset(&tunnel->dst_cache);
1204 }
1205 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1206 
1207 /* Do least required initialization, rest of init is done in tunnel_init call */
1208 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1209 {
1210 	struct ip_tunnel *tunnel = netdev_priv(dev);
1211 	tunnel->ip_tnl_net_id = net_id;
1212 }
1213 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1214 
1215 MODULE_LICENSE("GPL");
1216