xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision a34a3ed7)
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18 
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20 
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43 
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58 #include <net/dst_metadata.h>
59 
60 #if IS_ENABLED(CONFIG_IPV6)
61 #include <net/ipv6.h>
62 #include <net/ip6_fib.h>
63 #include <net/ip6_route.h>
64 #endif
65 
66 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
67 {
68 	return hash_32((__force u32)key ^ (__force u32)remote,
69 			 IP_TNL_HASH_BITS);
70 }
71 
72 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
73 				__be16 flags, __be32 key)
74 {
75 	if (p->i_flags & TUNNEL_KEY) {
76 		if (flags & TUNNEL_KEY)
77 			return key == p->i_key;
78 		else
79 			/* key expected, none present */
80 			return false;
81 	} else
82 		return !(flags & TUNNEL_KEY);
83 }
84 
/* Find the tunnel that should receive a packet.
 *
 * Fallback tunnel: no source, no destination, no key, no options.
 *
 * Tunnel hash table:
 * An exact key match is required, i.e. if a key is present in the packet
 * it matches only a tunnel with the same key; if no key is present, it
 * matches only a keyless tunnel.
 *
 * Given @remote, @local, @key and @flags taken from the packet, the
 * lookup runs the following passes, from most to least specific:
 *
 *   1. bucket hash(key, remote): tunnels whose saddr == local and
 *      daddr == remote;
 *   2. same bucket: tunnels whose daddr == remote and saddr is 0;
 *   3. bucket hash(key, 0): tunnels whose saddr == local and daddr is 0,
 *      or whose daddr == local when local is a multicast address;
 *   4. same bucket, skipped when TUNNEL_NO_KEY is set in @flags: tunnels
 *      with i_key == key and both addresses 0.
 *
 * Only devices that are IFF_UP are considered.  In every pass a tunnel
 * whose parms.link equals @link is returned immediately; otherwise it is
 * remembered as a candidate (the first pass overwrites the candidate,
 * later passes only fill it when it is still empty).
 *
 * If no pass returned, the candidate is returned if there is one, then
 * the collect_md tunnel if it is up, then the fallback tunnel if it
 * exists and is up, and finally NULL.
 *
 * The hash chains are walked with hlist_for_each_entry_rcu() and the
 * collect_md pointer is read with rcu_dereference().
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: both local and remote addresses match exactly. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote matches, tunnel has no local address configured. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Remaining passes use the bucket for tunnels hashed with remote 0. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	/* Pass 3: local-only tunnels, or tunnels whose destination is the
	 * multicast address the packet was sent to.
	 */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	/* Pass 4: tunnels identified by key only (no addresses). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	/* A match on everything but the link is better than nothing. */
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
188 
189 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
190 				    struct ip_tunnel_parm *parms)
191 {
192 	unsigned int h;
193 	__be32 remote;
194 	__be32 i_key = parms->i_key;
195 
196 	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
197 		remote = parms->iph.daddr;
198 	else
199 		remote = 0;
200 
201 	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
202 		i_key = 0;
203 
204 	h = ip_tunnel_hash(i_key, remote);
205 	return &itn->tunnels[h];
206 }
207 
208 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
209 {
210 	struct hlist_head *head = ip_bucket(itn, &t->parms);
211 
212 	if (t->collect_md)
213 		rcu_assign_pointer(itn->collect_md_tun, t);
214 	hlist_add_head_rcu(&t->hash_node, head);
215 }
216 
217 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
218 {
219 	if (t->collect_md)
220 		rcu_assign_pointer(itn->collect_md_tun, NULL);
221 	hlist_del_init_rcu(&t->hash_node);
222 }
223 
224 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
225 					struct ip_tunnel_parm *parms,
226 					int type)
227 {
228 	__be32 remote = parms->iph.daddr;
229 	__be32 local = parms->iph.saddr;
230 	__be32 key = parms->i_key;
231 	__be16 flags = parms->i_flags;
232 	int link = parms->link;
233 	struct ip_tunnel *t = NULL;
234 	struct hlist_head *head = ip_bucket(itn, parms);
235 
236 	hlist_for_each_entry_rcu(t, head, hash_node) {
237 		if (local == t->parms.iph.saddr &&
238 		    remote == t->parms.iph.daddr &&
239 		    link == t->parms.link &&
240 		    type == t->dev->type &&
241 		    ip_tunnel_key_match(&t->parms, flags, key))
242 			break;
243 	}
244 	return t;
245 }
246 
247 static struct net_device *__ip_tunnel_create(struct net *net,
248 					     const struct rtnl_link_ops *ops,
249 					     struct ip_tunnel_parm *parms)
250 {
251 	int err;
252 	struct ip_tunnel *tunnel;
253 	struct net_device *dev;
254 	char name[IFNAMSIZ];
255 
256 	if (parms->name[0])
257 		strlcpy(name, parms->name, IFNAMSIZ);
258 	else {
259 		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
260 			err = -E2BIG;
261 			goto failed;
262 		}
263 		strlcpy(name, ops->kind, IFNAMSIZ);
264 		strncat(name, "%d", 2);
265 	}
266 
267 	ASSERT_RTNL();
268 	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
269 	if (!dev) {
270 		err = -ENOMEM;
271 		goto failed;
272 	}
273 	dev_net_set(dev, net);
274 
275 	dev->rtnl_link_ops = ops;
276 
277 	tunnel = netdev_priv(dev);
278 	tunnel->parms = *parms;
279 	tunnel->net = net;
280 
281 	err = register_netdevice(dev);
282 	if (err)
283 		goto failed_free;
284 
285 	return dev;
286 
287 failed_free:
288 	free_netdev(dev);
289 failed:
290 	return ERR_PTR(err);
291 }
292 
293 static inline void init_tunnel_flow(struct flowi4 *fl4,
294 				    int proto,
295 				    __be32 daddr, __be32 saddr,
296 				    __be32 key, __u8 tos, int oif,
297 				    __u32 mark)
298 {
299 	memset(fl4, 0, sizeof(*fl4));
300 	fl4->flowi4_oif = oif;
301 	fl4->daddr = daddr;
302 	fl4->saddr = saddr;
303 	fl4->flowi4_tos = tos;
304 	fl4->flowi4_proto = proto;
305 	fl4->fl4_gre_key = key;
306 	fl4->flowi4_mark = mark;
307 }
308 
309 static int ip_tunnel_bind_dev(struct net_device *dev)
310 {
311 	struct net_device *tdev = NULL;
312 	struct ip_tunnel *tunnel = netdev_priv(dev);
313 	const struct iphdr *iph;
314 	int hlen = LL_MAX_HEADER;
315 	int mtu = ETH_DATA_LEN;
316 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
317 
318 	iph = &tunnel->parms.iph;
319 
320 	/* Guess output device to choose reasonable mtu and needed_headroom */
321 	if (iph->daddr) {
322 		struct flowi4 fl4;
323 		struct rtable *rt;
324 
325 		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
326 				 iph->saddr, tunnel->parms.o_key,
327 				 RT_TOS(iph->tos), tunnel->parms.link,
328 				 tunnel->fwmark);
329 		rt = ip_route_output_key(tunnel->net, &fl4);
330 
331 		if (!IS_ERR(rt)) {
332 			tdev = rt->dst.dev;
333 			ip_rt_put(rt);
334 		}
335 		if (dev->type != ARPHRD_ETHER)
336 			dev->flags |= IFF_POINTOPOINT;
337 
338 		dst_cache_reset(&tunnel->dst_cache);
339 	}
340 
341 	if (!tdev && tunnel->parms.link)
342 		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
343 
344 	if (tdev) {
345 		hlen = tdev->hard_header_len + tdev->needed_headroom;
346 		mtu = tdev->mtu;
347 	}
348 
349 	dev->needed_headroom = t_hlen + hlen;
350 	mtu -= (dev->hard_header_len + t_hlen);
351 
352 	if (mtu < IPV4_MIN_MTU)
353 		mtu = IPV4_MIN_MTU;
354 
355 	return mtu;
356 }
357 
358 static struct ip_tunnel *ip_tunnel_create(struct net *net,
359 					  struct ip_tunnel_net *itn,
360 					  struct ip_tunnel_parm *parms)
361 {
362 	struct ip_tunnel *nt;
363 	struct net_device *dev;
364 	int t_hlen;
365 
366 	BUG_ON(!itn->fb_tunnel_dev);
367 	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
368 	if (IS_ERR(dev))
369 		return ERR_CAST(dev);
370 
371 	dev->mtu = ip_tunnel_bind_dev(dev);
372 
373 	nt = netdev_priv(dev);
374 	t_hlen = nt->hlen + sizeof(struct iphdr);
375 	dev->min_mtu = ETH_MIN_MTU;
376 	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
377 	ip_tunnel_add(itn, nt);
378 	return nt;
379 }
380 
/* Deliver a decapsulated packet to tunnel device @tunnel.
 *
 * @tpi carries the flags and sequence number parsed from the tunnel
 * header, @tun_dst is an optional metadata dst to attach to the skb, and
 * @log_ecn_error selects whether ECN decapsulation problems are logged.
 *
 * The packet is dropped (and the matching error counters bumped) when
 * the checksum flag of the packet and of the tunnel disagree, when
 * sequencing is enabled and the packet is unsequenced or older than
 * expected, or when IP_ECN_decapsulate() returns a value above 1.
 *
 * Always consumes @skb and @tun_dst and always returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	/* Header pointer taken before the network header is reset below;
	 * it is what the ECN decapsulation and the log message use.
	 */
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Checksum presence must be the same on the packet and the tunnel. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		/* Reject unsequenced packets and, once a sequence number is
		 * expected, packets that are behind it (signed comparison of
		 * the difference).
		 */
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		/* Only results above 1 cause the packet to be dropped. */
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Scrub across netns only when the tunnel's netns differs from the
	 * device's.
	 */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	/* The metadata dst was never attached to the skb: release it here. */
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
455 
456 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
457 			    unsigned int num)
458 {
459 	if (num >= MAX_IPTUN_ENCAP_OPS)
460 		return -ERANGE;
461 
462 	return !cmpxchg((const struct ip_tunnel_encap_ops **)
463 			&iptun_encaps[num],
464 			NULL, ops) ? 0 : -1;
465 }
466 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
467 
468 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
469 			    unsigned int num)
470 {
471 	int ret;
472 
473 	if (num >= MAX_IPTUN_ENCAP_OPS)
474 		return -ERANGE;
475 
476 	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
477 		       &iptun_encaps[num],
478 		       ops, NULL) == ops) ? 0 : -1;
479 
480 	synchronize_net();
481 
482 	return ret;
483 }
484 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
485 
486 int ip_tunnel_encap_setup(struct ip_tunnel *t,
487 			  struct ip_tunnel_encap *ipencap)
488 {
489 	int hlen;
490 
491 	memset(&t->encap, 0, sizeof(t->encap));
492 
493 	hlen = ip_encap_hlen(ipencap);
494 	if (hlen < 0)
495 		return hlen;
496 
497 	t->encap.type = ipencap->type;
498 	t->encap.sport = ipencap->sport;
499 	t->encap.dport = ipencap->dport;
500 	t->encap.flags = ipencap->flags;
501 
502 	t->encap_hlen = hlen;
503 	t->hlen = t->encap_hlen + t->tun_hlen;
504 
505 	return 0;
506 }
507 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
508 
/* Propagate the path MTU of the tunnel route to the inner packet's dst
 * and tell the sender when the packet is too big.
 *
 * @rt is the route chosen for the outer packet, @df the DF setting of
 * the tunnel and @inner_iph the inner IPv4 header (only read for IPv4
 * payloads).
 *
 * Returns 0 when the packet may be sent, or -E2BIG after an ICMP
 * "fragmentation needed" / ICMPv6 "packet too big" has been sent.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	/* Payload size once tunnel and link-layer headers are discounted. */
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	/* With DF set the payload must fit the outer route's MTU minus the
	 * encapsulation overhead; otherwise use the inner dst's MTU, or the
	 * device MTU when the skb has no dst.
	 */
	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		/* Non-GSO IPv4 packet with DF that does not fit. */
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the smaller MTU on the IPv6 route, but only for
		 * tunnels with a unicast destination or for host (/128)
		 * routes.
		 */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
557 
/* Transmit @skb through tunnel device @dev using the per-packet tunnel
 * metadata attached to the skb, with outer IP protocol @proto.
 *
 * The skb must carry TX tunnel info for AF_INET; otherwise, or when the
 * tunnel has an outer encapsulation configured, the route lookup fails,
 * or the route points back at @dev, the packet is counted as a transmit
 * error and freed.  A failure to make room for the headers is counted as
 * a drop.  The skb is always consumed.
 */
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	/* A key TOS of exactly 1 means: take the TOS from the inner packet. */
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	/* Outer encapsulation is rejected on this path. */
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	/* Routing back into ourselves would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	/* TTL 0 in the key: inherit from the inner packet, or use the
	 * route's hop limit for non-IP payloads.
	 */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	/* needed_headroom only ever grows. */
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);
632 
/* Transmit @skb through tunnel device @dev.
 *
 * @tnl_params is the outer IPv4 header template (addresses, TOS, TTL,
 * DF) and @protocol the outer IP protocol number.
 *
 * Steps: pick the outer destination (derived from the inner packet's
 * next hop / neighbour for NBMA tunnels with no configured destination),
 * pick the TOS, apply any outer encapsulation, find a route (using the
 * tunnel's dst cache when the tunnel is "connected"), update the path
 * MTU, choose TTL and DF, ensure enough headroom and hand the packet to
 * iptunnel_xmit().
 *
 * The skb is always consumed; failures bump the device's error counters
 * and free it.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	/* True while the cached route may be used: the tunnel has a fixed
	 * destination and nothing per-packet influences the lookup.
	 */
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			/* Use the inner route's next hop as outer destination. */
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			/* Unspecified neighbour address: fall back to the
			 * packet's own destination.
			 */
			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only IPv4-compatible addresses carry an IPv4
			 * destination in their last 32 bits.
			 */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			/* Drop the neighbour reference before any error exit. */
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	/* Low bit set in the configured TOS: inherit it from the inner
	 * packet, which makes the route lookup per-packet.
	 */
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	/* The tunnel's fwmark, when set, takes precedence over skb->mark. */
	if (tunnel->fwmark) {
		init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
				 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
				 tunnel->fwmark);
	}
	else {
		init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
				 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
				 skb->mark);
	}

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	/* Routing back into ourselves would loop. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* While recent errors are outstanding, report a link failure for
	 * this packet as well; forget them once the timeout has passed.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	/* TTL 0 configured: inherit from the inner packet, or use the
	 * route's hop limit for non-IP payloads.
	 */
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	/* Also honour the inner packet's DF unless told to ignore it. */
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	/* needed_headroom only ever grows. */
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
805 
/* Apply new parameters @p (and @fwmark) to existing tunnel @t / @dev.
 *
 * The tunnel is removed from the hash table before the fields that
 * determine its bucket (addresses and input key) are changed, and
 * re-added afterwards.  When the link or the fwmark changes, the
 * underlying device is re-evaluated and, if @set_mtu is true, the
 * device MTU is updated from it.  Finally the dst cache is reset and a
 * netdev state change is signalled.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	/* Unhash first: the fields below select the hash bucket. */
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	/* Non-Ethernet tunnel devices mirror the endpoints in their
	 * hardware and broadcast addresses.
	 */
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}
840 
/* Handle the tunnel ioctls issued on @dev with parameters @p.
 *
 * SIOCGETTUNNEL: copy a tunnel's parameters into @p.  On the fallback
 *	device the tunnel is looked up from @p, defaulting to the fallback
 *	tunnel itself.
 * SIOCADDTUNNEL: create a tunnel described by @p; -EEXIST if one with
 *	the same parameters already exists.
 * SIOCCHGTUNNEL: update a tunnel; -EEXIST if @p describes a different
 *	existing tunnel, -EINVAL if the change would alter the
 *	point-to-point/broadcast nature of @dev, -ENOENT if there is
 *	nothing to change.
 * SIOCDELTUNNEL: unregister a tunnel device.  On the fallback device the
 *	victim is looked up from @p (-ENOENT if absent); the fallback
 *	tunnel itself cannot be deleted (-EPERM).
 *
 * Add, change and delete require CAP_NET_ADMIN in the user namespace of
 * the tunnel's netns (-EPERM otherwise).  Unknown commands yield
 * -EINVAL.  Returns 0 on success.  The fallback device must exist.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A fixed TTL forces DF on the outer header. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* For non-VTI tunnels a key is only meaningful together
		 * with TUNNEL_KEY.
		 */
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				/* The new parameters collide with another
				 * tunnel.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* The device's POINTOPOINT/BROADCAST flags
				 * must stay as they are.
				 */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
943 
944 int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
945 {
946 	struct ip_tunnel *tunnel = netdev_priv(dev);
947 	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
948 	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
949 
950 	if (new_mtu < ETH_MIN_MTU)
951 		return -EINVAL;
952 
953 	if (new_mtu > max_mtu) {
954 		if (strict)
955 			return -EINVAL;
956 
957 		new_mtu = max_mtu;
958 	}
959 
960 	dev->mtu = new_mtu;
961 	return 0;
962 }
963 EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
964 
965 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
966 {
967 	return __ip_tunnel_change_mtu(dev, new_mtu, true);
968 }
969 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
970 
971 static void ip_tunnel_dev_free(struct net_device *dev)
972 {
973 	struct ip_tunnel *tunnel = netdev_priv(dev);
974 
975 	gro_cells_destroy(&tunnel->gro_cells);
976 	dst_cache_destroy(&tunnel->dst_cache);
977 	free_percpu(dev->tstats);
978 }
979 
980 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
981 {
982 	struct ip_tunnel *tunnel = netdev_priv(dev);
983 	struct ip_tunnel_net *itn;
984 
985 	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
986 
987 	if (itn->fb_tunnel_dev != dev) {
988 		ip_tunnel_del(itn, netdev_priv(dev));
989 		unregister_netdevice_queue(dev, head);
990 	}
991 }
992 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
993 
994 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
995 {
996 	struct ip_tunnel *tunnel = netdev_priv(dev);
997 
998 	return tunnel->net;
999 }
1000 EXPORT_SYMBOL(ip_tunnel_get_link_net);
1001 
1002 int ip_tunnel_get_iflink(const struct net_device *dev)
1003 {
1004 	struct ip_tunnel *tunnel = netdev_priv(dev);
1005 
1006 	return tunnel->parms.link;
1007 }
1008 EXPORT_SYMBOL(ip_tunnel_get_iflink);
1009 
1010 int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
1011 				  struct rtnl_link_ops *ops, char *devname)
1012 {
1013 	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
1014 	struct ip_tunnel_parm parms;
1015 	unsigned int i;
1016 
1017 	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
1018 		INIT_HLIST_HEAD(&itn->tunnels[i]);
1019 
1020 	if (!ops) {
1021 		itn->fb_tunnel_dev = NULL;
1022 		return 0;
1023 	}
1024 
1025 	memset(&parms, 0, sizeof(parms));
1026 	if (devname)
1027 		strlcpy(parms.name, devname, IFNAMSIZ);
1028 
1029 	rtnl_lock();
1030 	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
1031 	/* FB netdevice is special: we have one, and only one per netns.
1032 	 * Allowing to move it to another netns is clearly unsafe.
1033 	 */
1034 	if (!IS_ERR(itn->fb_tunnel_dev)) {
1035 		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
1036 		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
1037 		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1038 	}
1039 	rtnl_unlock();
1040 
1041 	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
1042 }
1043 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1044 
1045 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1046 			      struct rtnl_link_ops *ops)
1047 {
1048 	struct net *net = dev_net(itn->fb_tunnel_dev);
1049 	struct net_device *dev, *aux;
1050 	int h;
1051 
1052 	for_each_netdev_safe(net, dev, aux)
1053 		if (dev->rtnl_link_ops == ops)
1054 			unregister_netdevice_queue(dev, head);
1055 
1056 	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1057 		struct ip_tunnel *t;
1058 		struct hlist_node *n;
1059 		struct hlist_head *thead = &itn->tunnels[h];
1060 
1061 		hlist_for_each_entry_safe(t, n, thead, hash_node)
1062 			/* If dev is in the same netns, it has already
1063 			 * been added to the list by the previous loop.
1064 			 */
1065 			if (!net_eq(dev_net(t->dev), net))
1066 				unregister_netdevice_queue(t->dev, head);
1067 	}
1068 }
1069 
1070 void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
1071 			   struct rtnl_link_ops *ops)
1072 {
1073 	struct ip_tunnel_net *itn;
1074 	struct net *net;
1075 	LIST_HEAD(list);
1076 
1077 	rtnl_lock();
1078 	list_for_each_entry(net, net_list, exit_list) {
1079 		itn = net_generic(net, id);
1080 		ip_tunnel_destroy(itn, &list, ops);
1081 	}
1082 	unregister_netdevice_many(&list);
1083 	rtnl_unlock();
1084 }
1085 EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
1086 
1087 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1088 		      struct ip_tunnel_parm *p, __u32 fwmark)
1089 {
1090 	struct ip_tunnel *nt;
1091 	struct net *net = dev_net(dev);
1092 	struct ip_tunnel_net *itn;
1093 	int mtu;
1094 	int err;
1095 
1096 	nt = netdev_priv(dev);
1097 	itn = net_generic(net, nt->ip_tnl_net_id);
1098 
1099 	if (nt->collect_md) {
1100 		if (rtnl_dereference(itn->collect_md_tun))
1101 			return -EEXIST;
1102 	} else {
1103 		if (ip_tunnel_find(itn, p, dev->type))
1104 			return -EEXIST;
1105 	}
1106 
1107 	nt->net = net;
1108 	nt->parms = *p;
1109 	nt->fwmark = fwmark;
1110 	err = register_netdevice(dev);
1111 	if (err)
1112 		goto out;
1113 
1114 	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1115 		eth_hw_addr_random(dev);
1116 
1117 	mtu = ip_tunnel_bind_dev(dev);
1118 	if (!tb[IFLA_MTU])
1119 		dev->mtu = mtu;
1120 
1121 	ip_tunnel_add(itn, nt);
1122 out:
1123 	return err;
1124 }
1125 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1126 
1127 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1128 			 struct ip_tunnel_parm *p, __u32 fwmark)
1129 {
1130 	struct ip_tunnel *t;
1131 	struct ip_tunnel *tunnel = netdev_priv(dev);
1132 	struct net *net = tunnel->net;
1133 	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1134 
1135 	if (dev == itn->fb_tunnel_dev)
1136 		return -EINVAL;
1137 
1138 	t = ip_tunnel_find(itn, p, dev->type);
1139 
1140 	if (t) {
1141 		if (t->dev != dev)
1142 			return -EEXIST;
1143 	} else {
1144 		t = tunnel;
1145 
1146 		if (dev->type != ARPHRD_ETHER) {
1147 			unsigned int nflags = 0;
1148 
1149 			if (ipv4_is_multicast(p->iph.daddr))
1150 				nflags = IFF_BROADCAST;
1151 			else if (p->iph.daddr)
1152 				nflags = IFF_POINTOPOINT;
1153 
1154 			if ((dev->flags ^ nflags) &
1155 			    (IFF_POINTOPOINT | IFF_BROADCAST))
1156 				return -EINVAL;
1157 		}
1158 	}
1159 
1160 	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
1161 	return 0;
1162 }
1163 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1164 
1165 int ip_tunnel_init(struct net_device *dev)
1166 {
1167 	struct ip_tunnel *tunnel = netdev_priv(dev);
1168 	struct iphdr *iph = &tunnel->parms.iph;
1169 	int err;
1170 
1171 	dev->needs_free_netdev = true;
1172 	dev->priv_destructor = ip_tunnel_dev_free;
1173 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1174 	if (!dev->tstats)
1175 		return -ENOMEM;
1176 
1177 	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1178 	if (err) {
1179 		free_percpu(dev->tstats);
1180 		return err;
1181 	}
1182 
1183 	err = gro_cells_init(&tunnel->gro_cells, dev);
1184 	if (err) {
1185 		dst_cache_destroy(&tunnel->dst_cache);
1186 		free_percpu(dev->tstats);
1187 		return err;
1188 	}
1189 
1190 	tunnel->dev = dev;
1191 	tunnel->net = dev_net(dev);
1192 	strcpy(tunnel->parms.name, dev->name);
1193 	iph->version		= 4;
1194 	iph->ihl		= 5;
1195 
1196 	if (tunnel->collect_md) {
1197 		dev->features |= NETIF_F_NETNS_LOCAL;
1198 		netif_keep_dst(dev);
1199 	}
1200 	return 0;
1201 }
1202 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1203 
1204 void ip_tunnel_uninit(struct net_device *dev)
1205 {
1206 	struct ip_tunnel *tunnel = netdev_priv(dev);
1207 	struct net *net = tunnel->net;
1208 	struct ip_tunnel_net *itn;
1209 
1210 	itn = net_generic(net, tunnel->ip_tnl_net_id);
1211 	/* fb_tunnel_dev will be unregisted in net-exit call. */
1212 	if (itn->fb_tunnel_dev != dev)
1213 		ip_tunnel_del(itn, netdev_priv(dev));
1214 
1215 	dst_cache_reset(&tunnel->dst_cache);
1216 }
1217 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1218 
1219 /* Do least required initialization, rest of init is done in tunnel_init call */
1220 void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
1221 {
1222 	struct ip_tunnel *tunnel = netdev_priv(dev);
1223 	tunnel->ip_tnl_net_id = net_id;
1224 }
1225 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1226 
1227 MODULE_LICENSE("GPL");
1228