/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

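/* Tunnels are hashed into IP_TNL_HASH_BITS-sized buckets by XORing the
 * tunnel key with the remote (outer destination) address. Keyless tunnels
 * simply hash with key == 0, which is why ip_tunnel_lookup() below retries
 * with remote == 0 once the more specific buckets come up empty.
 */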
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
	return hash_32((__force u32)key ^ (__force u32)remote,
			 IP_TNL_HASH_BITS);
}

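/* A packet only matches a tunnel when their keying agrees exactly:
 *
 *	tunnel keyed,   packet keyed   -> match iff keys are equal
 *	tunnel keyed,   packet keyless -> no match
 *	tunnel keyless, packet keyed   -> no match
 *	tunnel keyless, packet keyless -> match
 */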
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
				__be16 flags, __be32 key)
{
	if (p->i_flags & TUNNEL_KEY) {
		if (flags & TUNNEL_KEY)
			return key == p->i_key;
		else
			/* key expected, none present */
			return false;
	} else
		return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched against configured keyless tunnels,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input packet.
*/
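/* The lookup below proceeds from most to least specific, remembering the
 * best link-mismatched candidate along the way:
 *
 *   1. exact (local, remote) bucket, keys matching;
 *   2. same bucket, remote-only tunnels (saddr == 0);
 *   3. (key, 0) bucket: local-only tunnels, or local multicast;
 *   4. (key, 0) bucket: purely keyed tunnels (no addresses), skipped
 *      when the caller passed TUNNEL_NO_KEY;
 *   5. the remembered candidate, then the collect_md device, then the
 *      fallback device.
 *
 * A tunnel bound to the matching link (parms.link) always wins over a
 * candidate bound to another link.
 */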
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    t->parms.iph.saddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
		    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
			continue;

		if (!(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    t->parms.iph.saddr != 0 ||
		    t->parms.iph.daddr != 0 ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	t = rcu_dereference(itn->collect_md_tun);
	if (t && t->dev->flags & IFF_UP)
		return t;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
				    struct ip_tunnel_parm *parms)
{
	unsigned int h;
	__be32 remote;
	__be32 i_key = parms->i_key;

	if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
		remote = parms->iph.daddr;
	else
		remote = 0;

	if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
		i_key = 0;

	h = ip_tunnel_hash(i_key, remote);
	return &itn->tunnels[h];
}

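/* Insertion and removal are RCU-safe: readers walk the chains under
 * rcu_read_lock() in ip_tunnel_lookup() while writers, serialized by RTNL,
 * publish with hlist_add_head_rcu()/hlist_del_init_rcu(). Only one
 * collect_md (metadata-based) tunnel may exist per netns, so it lives in
 * a dedicated pointer rather than in the hash table.
 */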
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	struct hlist_head *head = ip_bucket(itn, &t->parms);

	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, t);
	hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
	if (t->collect_md)
		rcu_assign_pointer(itn->collect_md_tun, NULL);
	hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
					struct ip_tunnel_parm *parms,
					int type)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	__be16 flags = parms->i_flags;
	int link = parms->link;
	struct ip_tunnel *t = NULL;
	struct hlist_head *head = ip_bucket(itn, parms);

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr &&
		    link == t->parms.link &&
		    type == t->dev->type &&
		    ip_tunnel_key_match(&t->parms, flags, key))
			break;
	}
	return t;
}

static struct net_device *__ip_tunnel_create(struct net *net,
					     const struct rtnl_link_ops *ops,
					     struct ip_tunnel_parm *parms)
{
	int err;
	struct ip_tunnel *tunnel;
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else {
		if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
			err = -E2BIG;
			goto failed;
		}
		strlcpy(name, ops->kind, IFNAMSIZ);
		strncat(name, "%d", 2);
	}

	ASSERT_RTNL();
	dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
	if (!dev) {
		err = -ENOMEM;
		goto failed;
	}
	dev_net_set(dev, net);

	dev->rtnl_link_ops = ops;

	tunnel = netdev_priv(dev);
	tunnel->parms = *parms;
	tunnel->net = net;

	err = register_netdevice(dev);
	if (err)
		goto failed_free;

	return dev;

failed_free:
	free_netdev(dev);
failed:
	return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
				    int proto,
				    __be32 daddr, __be32 saddr,
				    __be32 key, __u8 tos, int oif,
				    __u32 mark)
{
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_oif = oif;
	fl4->daddr = daddr;
	fl4->saddr = saddr;
	fl4->flowi4_tos = tos;
	fl4->flowi4_proto = proto;
	fl4->fl4_gre_key = key;
	fl4->flowi4_mark = mark;
}

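/* Rough worked example for the arithmetic below, assuming a plain L3 tunnel
 * device with hard_header_len == 0: basic GRE has t_hlen = 4 (GRE header) +
 * 20 (outer IPv4) = 24, so over a 1500-byte underlay the guess comes out
 * as the classic 1476-byte tunnel MTU, while needed_headroom grows by
 * t_hlen plus whatever the underlay device itself requires.
 */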
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link,
				 tunnel->fwmark);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;

		dst_cache_reset(&tunnel->dst_cache);
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < IPV4_MIN_MTU)
		mtu = IPV4_MIN_MTU;

	return mtu;
}

static struct ip_tunnel *ip_tunnel_create(struct net *net,
					  struct ip_tunnel_net *itn,
					  struct ip_tunnel_parm *parms)
{
	struct ip_tunnel *nt;
	struct net_device *dev;
	int t_hlen;
	int mtu;
	int err;

	BUG_ON(!itn->fb_tunnel_dev);
	dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
	if (IS_ERR(dev))
		return ERR_CAST(dev);

	mtu = ip_tunnel_bind_dev(dev);
	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	nt = netdev_priv(dev);
	t_hlen = nt->hlen + sizeof(struct iphdr);
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
	ip_tunnel_add(itn, nt);
	return nt;

err_dev_set_mtu:
	unregister_netdevice(dev);
	return ERR_PTR(err);
}

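/* Common receive path for decapsulated packets. The caller (e.g. the GRE
 * or IPIP protocol handler) has already matched the tunnel and stripped
 * the outer headers; this function validates the csum/seq flags against
 * the tunnel configuration, folds outer ECN into the inner header, updates
 * per-cpu stats and hands the skb to GRO.
 */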
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
		  bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	skb_reset_network_header(skb);

	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	if (tun_dst)
		skb_dst_set(skb, (struct dst_entry *)tun_dst);

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	if (tun_dst)
		dst_release((struct dst_entry *)tun_dst);
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

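/* Secondary encapsulations (e.g. FOU/GUE wrapping the tunnel payload in
 * UDP) register an ops table in the iptun_encaps[] slot for their type.
 * Registration uses cmpxchg() so it is atomic and fails with -1 if the
 * slot is already taken; deregistration swaps the pointer back to NULL
 * and calls synchronize_net() so in-flight readers are done before the
 * ops can be freed.
 */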
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	return !cmpxchg((const struct ip_tunnel_encap_ops **)
			&iptun_encaps[num],
			NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
			    unsigned int num)
{
	int ret;

	if (num >= MAX_IPTUN_ENCAP_OPS)
		return -ERANGE;

	ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
		       &iptun_encaps[num],
		       ops, NULL) == ops) ? 0 : -1;

	synchronize_net();

	return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
			  struct ip_tunnel_encap *ipencap)
{
	int hlen;

	memset(&t->encap, 0, sizeof(t->encap));

	hlen = ip_encap_hlen(ipencap);
	if (hlen < 0)
		return hlen;

	t->encap.type = ipencap->type;
	t->encap.sport = ipencap->sport;
	t->encap.dport = ipencap->dport;
	t->encap.flags = ipencap->flags;

	t->encap_hlen = hlen;
	t->hlen = t->encap_hlen + t->tun_hlen;

	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

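/* Path-MTU handling on transmit: derive the usable inner MTU from the
 * outer route (honouring DF), propagate it with skb_dst_update_pmtu(),
 * and bounce an ICMP "fragmentation needed" (or ICMPv6 Packet Too Big)
 * back at the sender when a non-GSO packet would not fit.
 */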
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df,
			    const struct iphdr *inner_iph)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	skb_dst_update_pmtu(skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (inner_iph->frag_off & htons(IP_DF)) &&
		    mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}

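/* Transmit for collect_md (externally controlled) mode: instead of reading
 * the outer addresses, TOS and TTL from tunnel->parms, everything comes
 * from the per-packet metadata dst attached by the caller (e.g. by an OVS
 * or eBPF program) and fetched via skb_tunnel_info(). tos == 1 and
 * ttl == 0 act as "inherit from the inner header" sentinels.
 */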
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	u32 headroom = sizeof(struct iphdr);
	struct ip_tunnel_info *tun_info;
	const struct ip_tunnel_key *key;
	const struct iphdr *inner_iph;
	struct rtable *rt;
	struct flowi4 fl4;
	__be16 df = 0;
	u8 tos, ttl;

	tun_info = skb_tunnel_info(skb);
	if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
		     ip_tunnel_info_af(tun_info) != AF_INET))
		goto tx_error;
	key = &tun_info->key;
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	tos = key->tos;
	if (tos == 1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = inner_iph->tos;
		else if (skb->protocol == htons(ETH_P_IPV6))
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
	}
	init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
			 RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
	if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
		goto tx_error;
	rt = ip_route_output_key(tunnel->net, &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}
	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = key->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}
	if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
		df = htons(IP_DF);
	else if (skb->protocol == htons(ETH_P_IP))
		df = inner_iph->frag_off & htons(IP_DF);
	headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
	if (headroom > dev->needed_headroom)
		dev->needed_headroom = headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		goto tx_dropped;
	}
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;
tx_error:
	dev->stats.tx_errors++;
	goto kfree;
tx_dropped:
	dev->stats.tx_dropped++;
kfree:
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

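/* Classic (configured-endpoint) transmit path. In outline: resolve the
 * destination (for NBMA tunnels with daddr == 0 it is taken from the inner
 * route or neighbour entry), apply optional UDP encapsulation, route the
 * outer packet (using the dst cache when the flow is "connected"), enforce
 * path MTU, pick TOS/TTL/DF, then hand off to iptunnel_xmit().
 */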
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	bool connected;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
	connected = (tunnel->parms.iph.daddr != 0);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel */

		if (!skb_dst(skb)) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (!neigh)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
			 tunnel->fwmark);

	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
		goto tx_error;

	rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
			 NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
					  fl4.saddr);
	}

	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		ip_rt_put(rt);
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu,
			     __u32 fwmark)
{
	ip_tunnel_del(itn, t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		int mtu;

		t->parms.link = p->link;
		t->fwmark = fwmark;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

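/* ip_tunnel_ioctl() implements SIOC{GET,ADD,CHG,DEL}TUNNEL on behalf of
 * the individual tunnel drivers. A driver's ndo_do_ioctl typically just
 * marshals the user's struct ip_tunnel_parm around it; a minimal sketch
 * of a hypothetical driver (names illustrative, not from this file):
 *
 *	static int foo_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 *	{
 *		struct ip_tunnel_parm p;
 *		int err;
 *
 *		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
 *			return -EFAULT;
 *		err = ip_tunnel_ioctl(dev, &p, cmd);
 *		if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
 *			return -EFAULT;
 *		return err;
 *	}
 */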
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
	struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		if (dev == itn->fb_tunnel_dev) {
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				t = netdev_priv(dev);
		}
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		if (!(p->i_flags & VTI_ISVTI)) {
			if (!(p->i_flags & TUNNEL_KEY))
				p->i_key = 0;
			if (!(p->o_flags & TUNNEL_KEY))
				p->o_key = 0;
		}

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (cmd == SIOCADDTUNNEL) {
			if (!t) {
				t = ip_tunnel_create(net, itn, p);
				err = PTR_ERR_OR_ZERO(t);
				break;
			}

			err = -EEXIST;
			break;
		}
		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true, 0);
		} else {
			err = -ENOENT;
		}
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (!t)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

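/* 0xFFF8 (65528) is the largest IPv4 datagram size that is still a
 * multiple of 8, as fragmentation requires, so the effective upper bound
 * is that minus the link-layer and tunnel headers. E.g. with
 * hard_header_len == 0 and t_hlen == 24, max_mtu = 65528 - 24 = 65504.
 */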
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
	int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;

	if (new_mtu < ETH_MIN_MTU)
		return -EINVAL;

	if (new_mtu > max_mtu) {
		if (strict)
			return -EINVAL;

		new_mtu = max_mtu;
	}

	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	dst_cache_destroy(&tunnel->dst_cache);
	free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn;

	itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

	if (itn->fb_tunnel_dev != dev) {
		ip_tunnel_del(itn, netdev_priv(dev));
		unregister_netdevice_queue(dev, head);
	}
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

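/* Per-netns initialization, called from each tunnel driver's
 * pernet_operations. Sets up the hash table and, when ops is non-NULL,
 * creates the netns-local fallback device (the ipip driver, for example,
 * passes its ipip_link_ops together with the fallback name "tunl0").
 */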
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing it to move to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
			      struct rtnl_link_ops *ops)
{
	struct net *net = dev_net(itn->fb_tunnel_dev);
	struct net_device *dev, *aux;
	int h;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == ops)
			unregister_netdevice_queue(dev, head);

	for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
		struct ip_tunnel *t;
		struct hlist_node *n;
		struct hlist_head *thead = &itn->tunnels[h];

		hlist_for_each_entry_safe(t, n, thead, hash_node)
			/* If dev is in the same netns, it has already
			 * been added to the list by the previous loop.
			 */
			if (!net_eq(dev_net(t->dev), net))
				unregister_netdevice_queue(t->dev, head);
	}
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
			   struct rtnl_link_ops *ops)
{
	struct ip_tunnel_net *itn;
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		itn = net_generic(net, id);
		ip_tunnel_destroy(itn, &list, ops);
	}
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

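/* rtnl_link_ops->newlink back half for tunnel drivers: reject duplicates
 * (at most one collect_md device per netns, otherwise no two devices with
 * identical parms), register the netdevice, derive and clamp the MTU, and
 * only then publish the tunnel into the hash table.
 */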
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
		      struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ip_tunnel_net *itn;
	int mtu;
	int err;

	nt = netdev_priv(dev);
	itn = net_generic(net, nt->ip_tnl_net_id);

	if (nt->collect_md) {
		if (rtnl_dereference(itn->collect_md_tun))
			return -EEXIST;
	} else {
		if (ip_tunnel_find(itn, p, dev->type))
			return -EEXIST;
	}

	nt->net = net;
	nt->parms = *p;
	nt->fwmark = fwmark;
	err = register_netdevice(dev);
	if (err)
		goto err_register_netdevice;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		eth_hw_addr_random(dev);

	mtu = ip_tunnel_bind_dev(dev);
	if (tb[IFLA_MTU]) {
		unsigned int max = 0xfff8 - dev->hard_header_len - nt->hlen;

		mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
			    (unsigned int)(max - sizeof(struct iphdr)));
	}

	err = dev_set_mtu(dev, mtu);
	if (err)
		goto err_dev_set_mtu;

	ip_tunnel_add(itn, nt);
	return 0;

err_dev_set_mtu:
	unregister_netdevice(dev);
err_register_netdevice:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
			 struct ip_tunnel_parm *p, __u32 fwmark)
{
	struct ip_tunnel *t;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	if (dev == itn->fb_tunnel_dev)
		return -EINVAL;

	t = ip_tunnel_find(itn, p, dev->type);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = tunnel;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p->iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p->iph.daddr)
				nflags = IFF_POINTOPOINT;

			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}
	}

	ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

int ip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	int err;

	dev->needs_free_netdev = true;
	dev->priv_destructor = ip_tunnel_dev_free;
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = gro_cells_init(&tunnel->gro_cells, dev);
	if (err) {
		dst_cache_destroy(&tunnel->dst_cache);
		free_percpu(dev->tstats);
		return err;
	}

	tunnel->dev = dev;
	tunnel->net = dev_net(dev);
	strcpy(tunnel->parms.name, dev->name);
	iph->version		= 4;
	iph->ihl		= 5;

	if (tunnel->collect_md) {
		dev->features |= NETIF_F_NETNS_LOCAL;
		netif_keep_dst(dev);
	}
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net *net = tunnel->net;
	struct ip_tunnel_net *itn;

	itn = net_generic(net, tunnel->ip_tnl_net_id);
	/* fb_tunnel_dev will be unregistered in the net-exit call. */
	if (itn->fb_tunnel_dev != dev)
		ip_tunnel_del(itn, netdev_priv(dev));

	dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization here; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");