xref: /openbmc/linux/net/ipv4/ip_tunnel.c (revision 020c5260)
/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

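/* Hash a tunnel on (key ^ remote address), reduced to IP_TNL_HASH_BITS bits. */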
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against a configured keyless
   tunnel, will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

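/* Typical caller of ip_tunnel_lookup() (a sketch, not code from this
 * file): a receive handler such as GRE's resolves the tunnel for an
 * incoming packet and feeds it to ip_tunnel_rcv():
 *
 *	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
 *				  iph->saddr, iph->daddr, tpi->key);
 *	if (tunnel)
 *		return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
 */

/* Select the hash bucket for a tunnel's parameters. Multicast and
 * wildcard destinations hash with remote == 0, matching the wildcard
 * passes of ip_tunnel_lookup() above.
 */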
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

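/* Insertions and removals are serialized by RTNL; lookups walk the
 * chains under RCU. A collect_md tunnel is additionally published
 * through itn->collect_md_tun (at most one per netns).
 */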
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

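/* Exact-match lookup used by the management paths (ioctl and netlink):
 * unlike ip_tunnel_lookup(), this matches addresses, link, device type
 * and key literally, with no wildcard fallbacks.
 */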
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

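/* Initialize the flowi4 used to route encapsulated packets; the tunnel
 * key is carried in fl4_gre_key so keyed flows can be distinguished in
 * flow-based lookups.
 */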
static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif,
				    __u32 mark)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
	fl4->flowi4_mark = mark;
}

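/* Probe the route to the tunnel destination to find the underlying
 * device, then derive needed_headroom and a usable MTU from it.
 * Returns the MTU to assign to the tunnel device (no less than 68).
 */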
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link,
				 tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}

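/* Create and register a tunnel device of the same kind as the per-netns
 * fallback device, then link it into the hash table.
 */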
static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	dev->mtu = ip_tunnel_bind_dev(dev);

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;
}

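/* Common receive path for decapsulated packets: check checksum and
 * sequence-number expectations against the tunnel flags, decapsulate
 * ECN, update per-CPU stats and hand the skb to the GRO cells.
 */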
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

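/* Registration of extra encapsulation handlers (e.g. FOU/GUE). Slots in
 * iptun_encaps[] are claimed and released atomically with cmpxchg().
 */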
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

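/* Record the encapsulation parameters on the tunnel and recompute the
 * total header length (tunnel header plus encapsulation header).
 */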
int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

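/* Check the packet against the path MTU of the tunnel route and, if it
 * does not fit, propagate the reduced MTU to the inner flow via ICMP
 * "fragmentation needed" or ICMPv6 "packet too big".
 */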
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

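/* Transmit path for metadata-based (collect_md) tunnels: the outer
 * addresses, TOS and TTL come from the skb's tunnel metadata rather
 * than from the tunnel device configuration.
 */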
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
		      key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

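/* Transmit path for classically configured tunnels: resolve the outer
 * destination (for NBMA tunnels it is derived from the skb's route or
 * neighbour entry), route it, enforce PMTU and push the outer IP
 * header.
 */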
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			 tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

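/* Apply a changed configuration to a live tunnel: rehash it under the
 * new parameters, refresh the link-layer addresses, and rebind the
 * underlying device when the link or fwmark changed.
 */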
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

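/* Clamp the requested MTU against what the tunnel headers leave room
 * for; with strict == false the MTU is silently capped rather than
 * rejected.
 */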
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to be moved to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
	LIST_HEAD(list);

	rtnl_lock();
	ip_tunnel_destroy(itn, &list, ops);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto out;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	ip_tunnel_add(itn, nt);
out:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->destructor	= ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest of the init is
 * done in the tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");