xref: /openbmc/linux/net/ipv4/ipip.c (revision 0d456bad)
1 /*
2  *	Linux NET3:	IP/IP protocol decoder.
3  *
4  *	Authors:
5  *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
6  *
7  *	Fixes:
8  *		Alan Cox	:	Merged and made usable non modular (its so tiny its silly as
9  *					a module taking up 2 pages).
10  *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
11  *					to keep ip_forward happy.
12  *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
13  *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
14  *              David Woodhouse :       Perform some basic ICMP handling.
15  *                                      IPIP Routing without decapsulation.
16  *              Carlos Picoto   :       GRE over IP support
17  *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
18  *					I do not want to merge them together.
19  *
20  *	This program is free software; you can redistribute it and/or
21  *	modify it under the terms of the GNU General Public License
22  *	as published by the Free Software Foundation; either version
23  *	2 of the License, or (at your option) any later version.
24  *
25  */
26 
27 /* tunnel.c: an IP tunnel driver
28 
29 	The purpose of this driver is to provide an IP tunnel through
30 	which you can tunnel network traffic transparently across subnets.
31 
32 	This was written by looking at Nick Holloway's dummy driver
33 	Thanks for the great code!
34 
35 		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95
36 
37 	Minor tweaks:
38 		Cleaned up the code a little and added some pre-1.3.0 tweaks.
39 		dev->hard_header/hard_header_len changed to use no headers.
40 		Comments/bracketing tweaked.
41 		Made the tunnels use dev->name not tunnel: when error reporting.
42 		Added tx_dropped stat
43 
44 		-Alan Cox	(alan@lxorguk.ukuu.org.uk) 21 March 95
45 
46 	Reworked:
47 		Changed to tunnel to destination gateway in addition to the
48 			tunnel's pointopoint address
49 		Almost completely rewritten
50 		Note:  There is currently no firewall or ICMP handling done.
51 
52 		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96
53 
54 */
55 
56 /* Things I wish I had known when writing the tunnel driver:
57 
58 	When the tunnel_xmit() function is called, the skb contains the
59 	packet to be sent (plus a great deal of extra info), and dev
60 	contains the tunnel device that _we_ are.
61 
62 	When we are passed a packet, we are expected to fill in the
63 	source address with our source IP address.
64 
65 	What is the proper way to allocate, copy and free a buffer?
66 	After you allocate it, it is a "0 length" chunk of memory
67 	starting at zero.  If you want to add headers to the buffer
68 	later, you'll have to call "skb_reserve(skb, amount)" with
69 	the amount of memory you want reserved.  Then, you call
70 	"skb_put(skb, amount)" with the amount of space you want in
71 	the buffer.  skb_put() returns a pointer to the top (#0) of
72 	that buffer.  skb->len is set to the amount of space you have
73 	"allocated" with skb_put().  You can then write up to skb->len
74 	bytes to that buffer.  If you need more, you can call skb_put()
75 	again with the additional amount of space you need.  You can
76 	find out how much more space you can allocate by calling
77 	"skb_tailroom(skb)".
78 	Now, to add header space, call "skb_push(skb, header_len)".
79 	This creates space at the beginning of the buffer and returns
80 	a pointer to this new space.  If later you need to strip a
81 	header from a buffer, call "skb_pull(skb, header_len)".
82 	skb_headroom() will return how much space is left at the top
83 	of the buffer (before the main data).  Remember, this headroom
84 	space must be reserved before the skb_put() function is called.
85 	*/
86 
/*
   This version of net/ipv4/ipip.c is a clone of net/ipv4/ip_gre.c.

   For comments, look at net/ipv4/ip_gre.c --ANK
 */
92 
93 
94 #include <linux/capability.h>
95 #include <linux/module.h>
96 #include <linux/types.h>
97 #include <linux/kernel.h>
98 #include <linux/slab.h>
99 #include <asm/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <linux/in.h>
103 #include <linux/tcp.h>
104 #include <linux/udp.h>
105 #include <linux/if_arp.h>
106 #include <linux/mroute.h>
107 #include <linux/init.h>
108 #include <linux/netfilter_ipv4.h>
109 #include <linux/if_ether.h>
110 
111 #include <net/sock.h>
112 #include <net/ip.h>
113 #include <net/icmp.h>
114 #include <net/ipip.h>
115 #include <net/inet_ecn.h>
116 #include <net/xfrm.h>
117 #include <net/net_namespace.h>
118 #include <net/netns/generic.h>
119 
/* Number of buckets in each hashed tunnel table. */
#define HASH_SIZE  16
/*
 * Fold an IPv4 address (a __be32) into a bucket index in [0, HASH_SIZE).
 * The argument is parenthesized so that expression arguments hash the value
 * of the whole expression; HASH_SIZE must stay a power of two for the mask.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
122 
123 static bool log_ecn_error = true;
124 module_param(log_ecn_error, bool, 0644);
125 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
126 
127 static int ipip_net_id __read_mostly;
128 struct ipip_net {
129 	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
130 	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
131 	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
132 	struct ip_tunnel __rcu *tunnels_wc[1];
133 	struct ip_tunnel __rcu **tunnels[4];
134 
135 	struct net_device *fb_tunnel_dev;
136 };
137 
138 static int ipip_tunnel_init(struct net_device *dev);
139 static void ipip_tunnel_setup(struct net_device *dev);
140 static void ipip_dev_free(struct net_device *dev);
141 static struct rtnl_link_ops ipip_link_ops __read_mostly;
142 
143 static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
144 						  struct rtnl_link_stats64 *tot)
145 {
146 	int i;
147 
148 	for_each_possible_cpu(i) {
149 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
150 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
151 		unsigned int start;
152 
153 		do {
154 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
155 			rx_packets = tstats->rx_packets;
156 			tx_packets = tstats->tx_packets;
157 			rx_bytes = tstats->rx_bytes;
158 			tx_bytes = tstats->tx_bytes;
159 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
160 
161 		tot->rx_packets += rx_packets;
162 		tot->tx_packets += tx_packets;
163 		tot->rx_bytes   += rx_bytes;
164 		tot->tx_bytes   += tx_bytes;
165 	}
166 
167 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
168 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
169 	tot->tx_dropped = dev->stats.tx_dropped;
170 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
171 	tot->tx_errors = dev->stats.tx_errors;
172 	tot->collisions = dev->stats.collisions;
173 
174 	return tot;
175 }
176 
177 static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
178 		__be32 remote, __be32 local)
179 {
180 	unsigned int h0 = HASH(remote);
181 	unsigned int h1 = HASH(local);
182 	struct ip_tunnel *t;
183 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
184 
185 	for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
186 		if (local == t->parms.iph.saddr &&
187 		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
188 			return t;
189 
190 	for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
191 		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
192 			return t;
193 
194 	for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
195 		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
196 			return t;
197 
198 	t = rcu_dereference(ipn->tunnels_wc[0]);
199 	if (t && (t->dev->flags&IFF_UP))
200 		return t;
201 	return NULL;
202 }
203 
204 static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
205 		struct ip_tunnel_parm *parms)
206 {
207 	__be32 remote = parms->iph.daddr;
208 	__be32 local = parms->iph.saddr;
209 	unsigned int h = 0;
210 	int prio = 0;
211 
212 	if (remote) {
213 		prio |= 2;
214 		h ^= HASH(remote);
215 	}
216 	if (local) {
217 		prio |= 1;
218 		h ^= HASH(local);
219 	}
220 	return &ipn->tunnels[prio][h];
221 }
222 
223 static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
224 		struct ip_tunnel *t)
225 {
226 	return __ipip_bucket(ipn, &t->parms);
227 }
228 
229 static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
230 {
231 	struct ip_tunnel __rcu **tp;
232 	struct ip_tunnel *iter;
233 
234 	for (tp = ipip_bucket(ipn, t);
235 	     (iter = rtnl_dereference(*tp)) != NULL;
236 	     tp = &iter->next) {
237 		if (t == iter) {
238 			rcu_assign_pointer(*tp, t->next);
239 			break;
240 		}
241 	}
242 }
243 
244 static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
245 {
246 	struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
247 
248 	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
249 	rcu_assign_pointer(*tp, t);
250 }
251 
252 static int ipip_tunnel_create(struct net_device *dev)
253 {
254 	struct ip_tunnel *t = netdev_priv(dev);
255 	struct net *net = dev_net(dev);
256 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
257 	int err;
258 
259 	err = ipip_tunnel_init(dev);
260 	if (err < 0)
261 		goto out;
262 
263 	err = register_netdevice(dev);
264 	if (err < 0)
265 		goto out;
266 
267 	strcpy(t->parms.name, dev->name);
268 	dev->rtnl_link_ops = &ipip_link_ops;
269 
270 	dev_hold(dev);
271 	ipip_tunnel_link(ipn, t);
272 	return 0;
273 
274 out:
275 	return err;
276 }
277 
278 static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
279 		struct ip_tunnel_parm *parms, int create)
280 {
281 	__be32 remote = parms->iph.daddr;
282 	__be32 local = parms->iph.saddr;
283 	struct ip_tunnel *t, *nt;
284 	struct ip_tunnel __rcu **tp;
285 	struct net_device *dev;
286 	char name[IFNAMSIZ];
287 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
288 
289 	for (tp = __ipip_bucket(ipn, parms);
290 		 (t = rtnl_dereference(*tp)) != NULL;
291 		 tp = &t->next) {
292 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
293 			return t;
294 	}
295 	if (!create)
296 		return NULL;
297 
298 	if (parms->name[0])
299 		strlcpy(name, parms->name, IFNAMSIZ);
300 	else
301 		strcpy(name, "tunl%d");
302 
303 	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
304 	if (dev == NULL)
305 		return NULL;
306 
307 	dev_net_set(dev, net);
308 
309 	nt = netdev_priv(dev);
310 	nt->parms = *parms;
311 
312 	if (ipip_tunnel_create(dev) < 0)
313 		goto failed_free;
314 
315 	return nt;
316 
317 failed_free:
318 	ipip_dev_free(dev);
319 	return NULL;
320 }
321 
322 /* called with RTNL */
323 static void ipip_tunnel_uninit(struct net_device *dev)
324 {
325 	struct net *net = dev_net(dev);
326 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
327 
328 	if (dev == ipn->fb_tunnel_dev)
329 		RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL);
330 	else
331 		ipip_tunnel_unlink(ipn, netdev_priv(dev));
332 	dev_put(dev);
333 }
334 
335 static int ipip_err(struct sk_buff *skb, u32 info)
336 {
337 
338 /* All the routers (except for Linux) return only
339    8 bytes of packet payload. It means, that precise relaying of
340    ICMP in the real Internet is absolutely infeasible.
341  */
342 	const struct iphdr *iph = (const struct iphdr *)skb->data;
343 	const int type = icmp_hdr(skb)->type;
344 	const int code = icmp_hdr(skb)->code;
345 	struct ip_tunnel *t;
346 	int err;
347 
348 	switch (type) {
349 	default:
350 	case ICMP_PARAMETERPROB:
351 		return 0;
352 
353 	case ICMP_DEST_UNREACH:
354 		switch (code) {
355 		case ICMP_SR_FAILED:
356 		case ICMP_PORT_UNREACH:
357 			/* Impossible event. */
358 			return 0;
359 		default:
360 			/* All others are translated to HOST_UNREACH.
361 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
362 			   I believe they are just ether pollution. --ANK
363 			 */
364 			break;
365 		}
366 		break;
367 	case ICMP_TIME_EXCEEDED:
368 		if (code != ICMP_EXC_TTL)
369 			return 0;
370 		break;
371 	case ICMP_REDIRECT:
372 		break;
373 	}
374 
375 	err = -ENOENT;
376 	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
377 	if (t == NULL)
378 		goto out;
379 
380 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
381 		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
382 				 t->dev->ifindex, 0, IPPROTO_IPIP, 0);
383 		err = 0;
384 		goto out;
385 	}
386 
387 	if (type == ICMP_REDIRECT) {
388 		ipv4_redirect(skb, dev_net(skb->dev), t->dev->ifindex, 0,
389 			      IPPROTO_IPIP, 0);
390 		err = 0;
391 		goto out;
392 	}
393 
394 	if (t->parms.iph.daddr == 0)
395 		goto out;
396 
397 	err = 0;
398 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
399 		goto out;
400 
401 	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
402 		t->err_count++;
403 	else
404 		t->err_count = 1;
405 	t->err_time = jiffies;
406 out:
407 
408 	return err;
409 }
410 
411 static int ipip_rcv(struct sk_buff *skb)
412 {
413 	struct ip_tunnel *tunnel;
414 	const struct iphdr *iph = ip_hdr(skb);
415 	int err;
416 
417 	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
418 	if (tunnel != NULL) {
419 		struct pcpu_tstats *tstats;
420 
421 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
422 			goto drop;
423 
424 		secpath_reset(skb);
425 
426 		skb->mac_header = skb->network_header;
427 		skb_reset_network_header(skb);
428 		skb->protocol = htons(ETH_P_IP);
429 		skb->pkt_type = PACKET_HOST;
430 
431 		__skb_tunnel_rx(skb, tunnel->dev);
432 
433 		err = IP_ECN_decapsulate(iph, skb);
434 		if (unlikely(err)) {
435 			if (log_ecn_error)
436 				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
437 						     &iph->saddr, iph->tos);
438 			if (err > 1) {
439 				++tunnel->dev->stats.rx_frame_errors;
440 				++tunnel->dev->stats.rx_errors;
441 				goto drop;
442 			}
443 		}
444 
445 		tstats = this_cpu_ptr(tunnel->dev->tstats);
446 		u64_stats_update_begin(&tstats->syncp);
447 		tstats->rx_packets++;
448 		tstats->rx_bytes += skb->len;
449 		u64_stats_update_end(&tstats->syncp);
450 
451 		netif_rx(skb);
452 		return 0;
453 	}
454 
455 	return -1;
456 
457 drop:
458 	kfree_skb(skb);
459 	return 0;
460 }
461 
462 /*
463  *	This function assumes it is being called from dev_queue_xmit()
464  *	and that skb is filled properly by that function.
465  */
466 
467 static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
468 {
469 	struct ip_tunnel *tunnel = netdev_priv(dev);
470 	const struct iphdr  *tiph = &tunnel->parms.iph;
471 	u8     tos = tunnel->parms.iph.tos;
472 	__be16 df = tiph->frag_off;
473 	struct rtable *rt;     			/* Route to the other host */
474 	struct net_device *tdev;		/* Device to other host */
475 	const struct iphdr  *old_iph = ip_hdr(skb);
476 	struct iphdr  *iph;			/* Our new IP header */
477 	unsigned int max_headroom;		/* The extra header space needed */
478 	__be32 dst = tiph->daddr;
479 	struct flowi4 fl4;
480 	int    mtu;
481 
482 	if (skb->protocol != htons(ETH_P_IP))
483 		goto tx_error;
484 
485 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
486 	    skb_checksum_help(skb))
487 		goto tx_error;
488 
489 	if (tos & 1)
490 		tos = old_iph->tos;
491 
492 	if (!dst) {
493 		/* NBMA tunnel */
494 		if ((rt = skb_rtable(skb)) == NULL) {
495 			dev->stats.tx_fifo_errors++;
496 			goto tx_error;
497 		}
498 		dst = rt_nexthop(rt, old_iph->daddr);
499 	}
500 
501 	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
502 				   dst, tiph->saddr,
503 				   0, 0,
504 				   IPPROTO_IPIP, RT_TOS(tos),
505 				   tunnel->parms.link);
506 	if (IS_ERR(rt)) {
507 		dev->stats.tx_carrier_errors++;
508 		goto tx_error_icmp;
509 	}
510 	tdev = rt->dst.dev;
511 
512 	if (tdev == dev) {
513 		ip_rt_put(rt);
514 		dev->stats.collisions++;
515 		goto tx_error;
516 	}
517 
518 	df |= old_iph->frag_off & htons(IP_DF);
519 
520 	if (df) {
521 		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
522 
523 		if (mtu < 68) {
524 			dev->stats.collisions++;
525 			ip_rt_put(rt);
526 			goto tx_error;
527 		}
528 
529 		if (skb_dst(skb))
530 			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
531 
532 		if ((old_iph->frag_off & htons(IP_DF)) &&
533 		    mtu < ntohs(old_iph->tot_len)) {
534 			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
535 				  htonl(mtu));
536 			ip_rt_put(rt);
537 			goto tx_error;
538 		}
539 	}
540 
541 	if (tunnel->err_count > 0) {
542 		if (time_before(jiffies,
543 				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
544 			tunnel->err_count--;
545 			dst_link_failure(skb);
546 		} else
547 			tunnel->err_count = 0;
548 	}
549 
550 	/*
551 	 * Okay, now see if we can stuff it in the buffer as-is.
552 	 */
553 	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
554 
555 	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
556 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
557 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
558 		if (!new_skb) {
559 			ip_rt_put(rt);
560 			dev->stats.tx_dropped++;
561 			dev_kfree_skb(skb);
562 			return NETDEV_TX_OK;
563 		}
564 		if (skb->sk)
565 			skb_set_owner_w(new_skb, skb->sk);
566 		dev_kfree_skb(skb);
567 		skb = new_skb;
568 		old_iph = ip_hdr(skb);
569 	}
570 
571 	skb->transport_header = skb->network_header;
572 	skb_push(skb, sizeof(struct iphdr));
573 	skb_reset_network_header(skb);
574 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
575 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
576 			      IPSKB_REROUTED);
577 	skb_dst_drop(skb);
578 	skb_dst_set(skb, &rt->dst);
579 
580 	/*
581 	 *	Push down and install the IPIP header.
582 	 */
583 
584 	iph 			=	ip_hdr(skb);
585 	iph->version		=	4;
586 	iph->ihl		=	sizeof(struct iphdr)>>2;
587 	iph->frag_off		=	df;
588 	iph->protocol		=	IPPROTO_IPIP;
589 	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos);
590 	iph->daddr		=	fl4.daddr;
591 	iph->saddr		=	fl4.saddr;
592 
593 	if ((iph->ttl = tiph->ttl) == 0)
594 		iph->ttl	=	old_iph->ttl;
595 
596 	iptunnel_xmit(skb, dev);
597 	return NETDEV_TX_OK;
598 
599 tx_error_icmp:
600 	dst_link_failure(skb);
601 tx_error:
602 	dev->stats.tx_errors++;
603 	dev_kfree_skb(skb);
604 	return NETDEV_TX_OK;
605 }
606 
607 static void ipip_tunnel_bind_dev(struct net_device *dev)
608 {
609 	struct net_device *tdev = NULL;
610 	struct ip_tunnel *tunnel;
611 	const struct iphdr *iph;
612 
613 	tunnel = netdev_priv(dev);
614 	iph = &tunnel->parms.iph;
615 
616 	if (iph->daddr) {
617 		struct rtable *rt;
618 		struct flowi4 fl4;
619 
620 		rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
621 					   iph->daddr, iph->saddr,
622 					   0, 0,
623 					   IPPROTO_IPIP,
624 					   RT_TOS(iph->tos),
625 					   tunnel->parms.link);
626 		if (!IS_ERR(rt)) {
627 			tdev = rt->dst.dev;
628 			ip_rt_put(rt);
629 		}
630 		dev->flags |= IFF_POINTOPOINT;
631 	}
632 
633 	if (!tdev && tunnel->parms.link)
634 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
635 
636 	if (tdev) {
637 		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
638 		dev->mtu = tdev->mtu - sizeof(struct iphdr);
639 	}
640 	dev->iflink = tunnel->parms.link;
641 }
642 
643 static void ipip_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
644 {
645 	struct net *net = dev_net(t->dev);
646 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
647 
648 	ipip_tunnel_unlink(ipn, t);
649 	synchronize_net();
650 	t->parms.iph.saddr = p->iph.saddr;
651 	t->parms.iph.daddr = p->iph.daddr;
652 	memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
653 	memcpy(t->dev->broadcast, &p->iph.daddr, 4);
654 	ipip_tunnel_link(ipn, t);
655 	t->parms.iph.ttl = p->iph.ttl;
656 	t->parms.iph.tos = p->iph.tos;
657 	t->parms.iph.frag_off = p->iph.frag_off;
658 	if (t->parms.link != p->link) {
659 		t->parms.link = p->link;
660 		ipip_tunnel_bind_dev(t->dev);
661 	}
662 	netdev_state_change(t->dev);
663 }
664 
665 static int
666 ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
667 {
668 	int err = 0;
669 	struct ip_tunnel_parm p;
670 	struct ip_tunnel *t;
671 	struct net *net = dev_net(dev);
672 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
673 
674 	switch (cmd) {
675 	case SIOCGETTUNNEL:
676 		t = NULL;
677 		if (dev == ipn->fb_tunnel_dev) {
678 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
679 				err = -EFAULT;
680 				break;
681 			}
682 			t = ipip_tunnel_locate(net, &p, 0);
683 		}
684 		if (t == NULL)
685 			t = netdev_priv(dev);
686 		memcpy(&p, &t->parms, sizeof(p));
687 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
688 			err = -EFAULT;
689 		break;
690 
691 	case SIOCADDTUNNEL:
692 	case SIOCCHGTUNNEL:
693 		err = -EPERM;
694 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
695 			goto done;
696 
697 		err = -EFAULT;
698 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
699 			goto done;
700 
701 		err = -EINVAL;
702 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
703 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
704 			goto done;
705 		if (p.iph.ttl)
706 			p.iph.frag_off |= htons(IP_DF);
707 
708 		t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
709 
710 		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
711 			if (t != NULL) {
712 				if (t->dev != dev) {
713 					err = -EEXIST;
714 					break;
715 				}
716 			} else {
717 				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
718 				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
719 					err = -EINVAL;
720 					break;
721 				}
722 				t = netdev_priv(dev);
723 			}
724 
725 			ipip_tunnel_update(t, &p);
726 		}
727 
728 		if (t) {
729 			err = 0;
730 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
731 				err = -EFAULT;
732 		} else
733 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
734 		break;
735 
736 	case SIOCDELTUNNEL:
737 		err = -EPERM;
738 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
739 			goto done;
740 
741 		if (dev == ipn->fb_tunnel_dev) {
742 			err = -EFAULT;
743 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
744 				goto done;
745 			err = -ENOENT;
746 			if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
747 				goto done;
748 			err = -EPERM;
749 			if (t->dev == ipn->fb_tunnel_dev)
750 				goto done;
751 			dev = t->dev;
752 		}
753 		unregister_netdevice(dev);
754 		err = 0;
755 		break;
756 
757 	default:
758 		err = -EINVAL;
759 	}
760 
761 done:
762 	return err;
763 }
764 
765 static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
766 {
767 	if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
768 		return -EINVAL;
769 	dev->mtu = new_mtu;
770 	return 0;
771 }
772 
773 static const struct net_device_ops ipip_netdev_ops = {
774 	.ndo_uninit	= ipip_tunnel_uninit,
775 	.ndo_start_xmit	= ipip_tunnel_xmit,
776 	.ndo_do_ioctl	= ipip_tunnel_ioctl,
777 	.ndo_change_mtu	= ipip_tunnel_change_mtu,
778 	.ndo_get_stats64 = ipip_get_stats64,
779 };
780 
781 static void ipip_dev_free(struct net_device *dev)
782 {
783 	free_percpu(dev->tstats);
784 	free_netdev(dev);
785 }
786 
787 #define IPIP_FEATURES (NETIF_F_SG |		\
788 		       NETIF_F_FRAGLIST |	\
789 		       NETIF_F_HIGHDMA |	\
790 		       NETIF_F_HW_CSUM)
791 
792 static void ipip_tunnel_setup(struct net_device *dev)
793 {
794 	dev->netdev_ops		= &ipip_netdev_ops;
795 	dev->destructor		= ipip_dev_free;
796 
797 	dev->type		= ARPHRD_TUNNEL;
798 	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
799 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr);
800 	dev->flags		= IFF_NOARP;
801 	dev->iflink		= 0;
802 	dev->addr_len		= 4;
803 	dev->features		|= NETIF_F_NETNS_LOCAL;
804 	dev->features		|= NETIF_F_LLTX;
805 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
806 
807 	dev->features		|= IPIP_FEATURES;
808 	dev->hw_features	|= IPIP_FEATURES;
809 }
810 
811 static int ipip_tunnel_init(struct net_device *dev)
812 {
813 	struct ip_tunnel *tunnel = netdev_priv(dev);
814 
815 	tunnel->dev = dev;
816 
817 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
818 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
819 
820 	ipip_tunnel_bind_dev(dev);
821 
822 	dev->tstats = alloc_percpu(struct pcpu_tstats);
823 	if (!dev->tstats)
824 		return -ENOMEM;
825 
826 	return 0;
827 }
828 
829 static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
830 {
831 	struct ip_tunnel *tunnel = netdev_priv(dev);
832 	struct iphdr *iph = &tunnel->parms.iph;
833 	struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
834 
835 	tunnel->dev = dev;
836 	strcpy(tunnel->parms.name, dev->name);
837 
838 	iph->version		= 4;
839 	iph->protocol		= IPPROTO_IPIP;
840 	iph->ihl		= 5;
841 
842 	dev->tstats = alloc_percpu(struct pcpu_tstats);
843 	if (!dev->tstats)
844 		return -ENOMEM;
845 
846 	dev_hold(dev);
847 	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
848 	return 0;
849 }
850 
851 static void ipip_netlink_parms(struct nlattr *data[],
852 			       struct ip_tunnel_parm *parms)
853 {
854 	memset(parms, 0, sizeof(*parms));
855 
856 	parms->iph.version = 4;
857 	parms->iph.protocol = IPPROTO_IPIP;
858 	parms->iph.ihl = 5;
859 
860 	if (!data)
861 		return;
862 
863 	if (data[IFLA_IPTUN_LINK])
864 		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
865 
866 	if (data[IFLA_IPTUN_LOCAL])
867 		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
868 
869 	if (data[IFLA_IPTUN_REMOTE])
870 		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
871 
872 	if (data[IFLA_IPTUN_TTL]) {
873 		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
874 		if (parms->iph.ttl)
875 			parms->iph.frag_off = htons(IP_DF);
876 	}
877 
878 	if (data[IFLA_IPTUN_TOS])
879 		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
880 
881 	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
882 		parms->iph.frag_off = htons(IP_DF);
883 }
884 
885 static int ipip_newlink(struct net *src_net, struct net_device *dev,
886 			struct nlattr *tb[], struct nlattr *data[])
887 {
888 	struct net *net = dev_net(dev);
889 	struct ip_tunnel *nt;
890 
891 	nt = netdev_priv(dev);
892 	ipip_netlink_parms(data, &nt->parms);
893 
894 	if (ipip_tunnel_locate(net, &nt->parms, 0))
895 		return -EEXIST;
896 
897 	return ipip_tunnel_create(dev);
898 }
899 
900 static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
901 			   struct nlattr *data[])
902 {
903 	struct ip_tunnel *t;
904 	struct ip_tunnel_parm p;
905 	struct net *net = dev_net(dev);
906 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
907 
908 	if (dev == ipn->fb_tunnel_dev)
909 		return -EINVAL;
910 
911 	ipip_netlink_parms(data, &p);
912 
913 	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
914 	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
915 		return -EINVAL;
916 
917 	t = ipip_tunnel_locate(net, &p, 0);
918 
919 	if (t) {
920 		if (t->dev != dev)
921 			return -EEXIST;
922 	} else
923 		t = netdev_priv(dev);
924 
925 	ipip_tunnel_update(t, &p);
926 	return 0;
927 }
928 
929 static size_t ipip_get_size(const struct net_device *dev)
930 {
931 	return
932 		/* IFLA_IPTUN_LINK */
933 		nla_total_size(4) +
934 		/* IFLA_IPTUN_LOCAL */
935 		nla_total_size(4) +
936 		/* IFLA_IPTUN_REMOTE */
937 		nla_total_size(4) +
938 		/* IFLA_IPTUN_TTL */
939 		nla_total_size(1) +
940 		/* IFLA_IPTUN_TOS */
941 		nla_total_size(1) +
942 		/* IFLA_IPTUN_PMTUDISC */
943 		nla_total_size(1) +
944 		0;
945 }
946 
947 static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
948 {
949 	struct ip_tunnel *tunnel = netdev_priv(dev);
950 	struct ip_tunnel_parm *parm = &tunnel->parms;
951 
952 	if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
953 	    nla_put_be32(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
954 	    nla_put_be32(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
955 	    nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
956 	    nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
957 	    nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
958 		       !!(parm->iph.frag_off & htons(IP_DF))))
959 		goto nla_put_failure;
960 	return 0;
961 
962 nla_put_failure:
963 	return -EMSGSIZE;
964 }
965 
966 static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = {
967 	[IFLA_IPTUN_LINK]		= { .type = NLA_U32 },
968 	[IFLA_IPTUN_LOCAL]		= { .type = NLA_U32 },
969 	[IFLA_IPTUN_REMOTE]		= { .type = NLA_U32 },
970 	[IFLA_IPTUN_TTL]		= { .type = NLA_U8 },
971 	[IFLA_IPTUN_TOS]		= { .type = NLA_U8 },
972 	[IFLA_IPTUN_PMTUDISC]		= { .type = NLA_U8 },
973 };
974 
975 static struct rtnl_link_ops ipip_link_ops __read_mostly = {
976 	.kind		= "ipip",
977 	.maxtype	= IFLA_IPTUN_MAX,
978 	.policy		= ipip_policy,
979 	.priv_size	= sizeof(struct ip_tunnel),
980 	.setup		= ipip_tunnel_setup,
981 	.newlink	= ipip_newlink,
982 	.changelink	= ipip_changelink,
983 	.get_size	= ipip_get_size,
984 	.fill_info	= ipip_fill_info,
985 };
986 
987 static struct xfrm_tunnel ipip_handler __read_mostly = {
988 	.handler	=	ipip_rcv,
989 	.err_handler	=	ipip_err,
990 	.priority	=	1,
991 };
992 
993 static const char banner[] __initconst =
994 	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
995 
996 static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
997 {
998 	int prio;
999 
1000 	for (prio = 1; prio < 4; prio++) {
1001 		int h;
1002 		for (h = 0; h < HASH_SIZE; h++) {
1003 			struct ip_tunnel *t;
1004 
1005 			t = rtnl_dereference(ipn->tunnels[prio][h]);
1006 			while (t != NULL) {
1007 				unregister_netdevice_queue(t->dev, head);
1008 				t = rtnl_dereference(t->next);
1009 			}
1010 		}
1011 	}
1012 }
1013 
1014 static int __net_init ipip_init_net(struct net *net)
1015 {
1016 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
1017 	struct ip_tunnel *t;
1018 	int err;
1019 
1020 	ipn->tunnels[0] = ipn->tunnels_wc;
1021 	ipn->tunnels[1] = ipn->tunnels_l;
1022 	ipn->tunnels[2] = ipn->tunnels_r;
1023 	ipn->tunnels[3] = ipn->tunnels_r_l;
1024 
1025 	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
1026 					   "tunl0",
1027 					   ipip_tunnel_setup);
1028 	if (!ipn->fb_tunnel_dev) {
1029 		err = -ENOMEM;
1030 		goto err_alloc_dev;
1031 	}
1032 	dev_net_set(ipn->fb_tunnel_dev, net);
1033 
1034 	err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
1035 	if (err)
1036 		goto err_reg_dev;
1037 
1038 	if ((err = register_netdev(ipn->fb_tunnel_dev)))
1039 		goto err_reg_dev;
1040 
1041 	t = netdev_priv(ipn->fb_tunnel_dev);
1042 
1043 	strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
1044 	return 0;
1045 
1046 err_reg_dev:
1047 	ipip_dev_free(ipn->fb_tunnel_dev);
1048 err_alloc_dev:
1049 	/* nothing */
1050 	return err;
1051 }
1052 
1053 static void __net_exit ipip_exit_net(struct net *net)
1054 {
1055 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
1056 	LIST_HEAD(list);
1057 
1058 	rtnl_lock();
1059 	ipip_destroy_tunnels(ipn, &list);
1060 	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
1061 	unregister_netdevice_many(&list);
1062 	rtnl_unlock();
1063 }
1064 
1065 static struct pernet_operations ipip_net_ops = {
1066 	.init = ipip_init_net,
1067 	.exit = ipip_exit_net,
1068 	.id   = &ipip_net_id,
1069 	.size = sizeof(struct ipip_net),
1070 };
1071 
1072 static int __init ipip_init(void)
1073 {
1074 	int err;
1075 
1076 	printk(banner);
1077 
1078 	err = register_pernet_device(&ipip_net_ops);
1079 	if (err < 0)
1080 		return err;
1081 	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
1082 	if (err < 0) {
1083 		pr_info("%s: can't register tunnel\n", __func__);
1084 		goto xfrm_tunnel_failed;
1085 	}
1086 	err = rtnl_link_register(&ipip_link_ops);
1087 	if (err < 0)
1088 		goto rtnl_link_failed;
1089 
1090 out:
1091 	return err;
1092 
1093 rtnl_link_failed:
1094 	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
1095 xfrm_tunnel_failed:
1096 	unregister_pernet_device(&ipip_net_ops);
1097 	goto out;
1098 }
1099 
1100 static void __exit ipip_fini(void)
1101 {
1102 	rtnl_link_unregister(&ipip_link_ops);
1103 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
1104 		pr_info("%s: can't deregister tunnel\n", __func__);
1105 
1106 	unregister_pernet_device(&ipip_net_ops);
1107 }
1108 
1109 module_init(ipip_init);
1110 module_exit(ipip_fini);
1111 MODULE_LICENSE("GPL");
1112 MODULE_ALIAS_NETDEV("tunl0");
1113