/*
 *	Linux NET3:	IP/IP protocol decoder.
 *
 *	Authors:
 *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
 *
 *	Fixes:
 *		Alan Cox	:	Merged and made usable non-modular (it's so tiny it's silly as
 *					a module taking up 2 pages).
 *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
 *					to keep ip_forward happy.
 *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
 *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
 *		David Woodhouse :	Perform some basic ICMP handling.
 *					IPIP Routing without decapsulation.
 *		Carlos Picoto	:	GRE over IP support
 *		Alexey Kuznetsov:	Reworked. Really, now it is a truncated version of ipv4/ip_gre.c.
 *					I do not want to merge them together.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

/* tunnel.c: an IP tunnel driver

	The purpose of this driver is to provide an IP tunnel through
	which you can tunnel network traffic transparently across subnets.

	This was written by looking at Nick Holloway's dummy driver.
	Thanks for the great code!

		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95

	Minor tweaks:
		Cleaned up the code a little and added some pre-1.3.0 tweaks.
		dev->hard_header/hard_header_len changed to use no headers.
		Comments/bracketing tweaked.
		Made the tunnels use dev->name rather than "tunnel:" when reporting errors.
		Added tx_dropped stat.

		-Alan Cox	(alan@lxorguk.ukuu.org.uk) 21 March 95

	Reworked:
		Changed to tunnel to the destination gateway in addition to the
			tunnel's pointopoint address.
		Almost completely rewritten.
		Note:  There is currently no firewall or ICMP handling done.

		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96

*/

/* Things I wish I had known when writing the tunnel driver:

	When the tunnel_xmit() function is called, the skb contains the
	packet to be sent (plus a great deal of extra info), and dev
	contains the tunnel device that _we_ are.

	When we are passed a packet, we are expected to fill in the
	source address with our source IP address.

	What is the proper way to allocate, copy and free a buffer?
	After you allocate it, it is a "0 length" chunk of memory
	starting at zero.  If you want to add headers to the buffer
	later, you'll have to call "skb_reserve(skb, amount)" with
	the amount of memory you want reserved.  Then, you call
	"skb_put(skb, amount)" with the amount of space you want in
	the buffer.  skb_put() returns a pointer to the top (#0) of
	that buffer.  skb->len is set to the amount of space you have
	"allocated" with skb_put().  You can then write up to skb->len
	bytes to that buffer.  If you need more, you can call skb_put()
	again with the additional amount of space you need.  You can
	find out how much more space you can allocate by calling
	"skb_tailroom(skb)".
	Now, to add header space, call "skb_push(skb, header_len)".
	This creates space at the beginning of the buffer and returns
	a pointer to this new space.  If later you need to strip a
	header from a buffer, call "skb_pull(skb, header_len)".
	skb_headroom() will return how much space is left at the top
	of the buffer (before the main data).  Remember, this headroom
	space must be reserved before the skb_put() function is called.
	*/
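
/*
 * A minimal sketch of the allocate/reserve/put/push pattern described
 * above (purely illustrative, not used by this driver; 'header_len',
 * 'data_len' and 'payload' are hypothetical):
 *
 *	struct sk_buff *skb = alloc_skb(header_len + data_len, GFP_ATOMIC);
 *	if (skb == NULL)
 *		return;
 *	skb_reserve(skb, header_len);			    // headroom
 *	memcpy(skb_put(skb, data_len), payload, data_len);  // payload space
 *	skb_push(skb, header_len);			    // prepend a header
 */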

/*
   This version of net/ipv4/ipip.c is cloned from net/ipv4/ip_gre.c.

   For comments look at net/ipv4/ip_gre.c --ANK
 */


#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/ipip.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

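/*
 * Tunnel endpoints are hashed into HASH_SIZE (16) buckets: HASH() folds
 * bits 4-7 of the raw 32-bit address into bits 0-3 to pick a bucket.
 */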
#define HASH_SIZE  16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

static int ipip_net_id __read_mostly;
struct ipip_net {
	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_wc[1];
	struct ip_tunnel __rcu **tunnels[4];

	struct net_device *fb_tunnel_dev;
};

static int ipip_tunnel_init(struct net_device *dev);
static void ipip_tunnel_setup(struct net_device *dev);
static void ipip_dev_free(struct net_device *dev);

/*
 * Locking : hash tables are protected by RCU and RTNL
 */

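/*
 * Walk one hash chain under rcu_read_lock(); note that the macro relies
 * on a 'struct ip_tunnel *t' already declared in the caller's scope.
 */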
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

/* often modified stats are per cpu, others are shared (netdev->stats) */
struct pcpu_tstats {
	u64	rx_packets;
	u64	rx_bytes;
	u64	tx_packets;
	u64	tx_bytes;
	struct u64_stats_sync	syncp;
};

153 
154 static struct rtnl_link_stats64 *ipip_get_stats64(struct net_device *dev,
155 						  struct rtnl_link_stats64 *tot)
156 {
157 	int i;
158 
159 	for_each_possible_cpu(i) {
160 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
161 		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
162 		unsigned int start;
163 
164 		do {
165 			start = u64_stats_fetch_begin_bh(&tstats->syncp);
166 			rx_packets = tstats->rx_packets;
167 			tx_packets = tstats->tx_packets;
168 			rx_bytes = tstats->rx_bytes;
169 			tx_bytes = tstats->tx_bytes;
170 		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));
171 
172 		tot->rx_packets += rx_packets;
173 		tot->tx_packets += tx_packets;
174 		tot->rx_bytes   += rx_bytes;
175 		tot->tx_bytes   += tx_bytes;
176 	}
177 
178 	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
179 	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
180 	tot->tx_dropped = dev->stats.tx_dropped;
181 	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
182 	tot->tx_errors = dev->stats.tx_errors;
183 	tot->collisions = dev->stats.collisions;
184 
185 	return tot;
186 }
187 
static struct ip_tunnel *ipip_tunnel_lookup(struct net *net,
		__be32 remote, __be32 local)
{
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(local);
	struct ip_tunnel *t;
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
			return t;

	for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
			return t;

	for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
			return t;

	t = rcu_dereference(ipn->tunnels_wc[0]);
	if (t && (t->dev->flags&IFF_UP))
		return t;
	return NULL;
}

static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
		struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	unsigned int h = 0;
	int prio = 0;

	if (remote) {
		prio |= 2;
		h ^= HASH(remote);
	}
	if (local) {
		prio |= 1;
		h ^= HASH(local);
	}
	return &ipn->tunnels[prio][h];
}
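
/*
 * The prio computed above indexes ipn->tunnels[], which ipip_init_net()
 * wires up as:
 *	tunnels[0] -> tunnels_wc  (wildcard: neither address set)
 *	tunnels[1] -> tunnels_l   (local address only)
 *	tunnels[2] -> tunnels_r   (remote address only)
 *	tunnels[3] -> tunnels_r_l (remote and local set)
 */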

static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
		struct ip_tunnel *t)
{
	return __ipip_bucket(ipn, &t->parms);
}

static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipip_bucket(ipn, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

static struct ip_tunnel *ipip_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	struct ip_tunnel *t, *nt;
	struct ip_tunnel __rcu **tp;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	for (tp = __ipip_bucket(ipn, parms);
		 (t = rtnl_dereference(*tp)) != NULL;
		 tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
			return t;
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "tunl%d");

	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
	if (dev == NULL)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;

	if (ipip_tunnel_init(dev) < 0)
		goto failed_free;

	if (register_netdevice(dev) < 0)
		goto failed_free;

	strcpy(nt->parms.name, dev->name);

	dev_hold(dev);
	ipip_tunnel_link(ipn, nt);
	return nt;

failed_free:
	ipip_dev_free(dev);
	return NULL;
}

/* called with RTNL */
static void ipip_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	if (dev == ipn->fb_tunnel_dev)
		RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL);
	else
		ipip_tunnel_unlink(ipn, netdev_priv(dev));
	dev_put(dev);
}

static int ipip_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means that precise relaying of
   ICMP in the real Internet is absolutely infeasible.
 */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	int err;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return 0;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return 0;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return 0;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return 0;
		break;
	}

	err = -ENOENT;

	rcu_read_lock();
	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
	if (t == NULL || t->parms.iph.daddr == 0)
		goto out;

	err = 0;
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
	return err;
}

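/*
 * Propagate a Congestion Experienced mark from the outer header to the
 * inner packet so the ECN signal (RFC 3168) survives decapsulation.
 */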
static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
					struct sk_buff *skb)
{
	struct iphdr *inner_iph = ip_hdr(skb);

	if (INET_ECN_is_ce(outer_iph->tos))
		IP_ECN_set_ce(inner_iph);
}

static int ipip_rcv(struct sk_buff *skb)
{
	struct ip_tunnel *tunnel;
	const struct iphdr *iph = ip_hdr(skb);

	rcu_read_lock();
	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
	if (tunnel != NULL) {
		struct pcpu_tstats *tstats;

		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
			rcu_read_unlock();
			kfree_skb(skb);
			return 0;
		}

		secpath_reset(skb);

		skb->mac_header = skb->network_header;
		skb_reset_network_header(skb);
		skb->protocol = htons(ETH_P_IP);
		skb->pkt_type = PACKET_HOST;

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		__skb_tunnel_rx(skb, tunnel->dev);

		ipip_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	rcu_read_unlock();

	return -1;
}

/*
 *	This function assumes it is being called from dev_queue_xmit()
 *	and that skb is filled properly by that function.
 */

static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *tiph = &tunnel->parms.iph;
	u8     tos = tunnel->parms.iph.tos;
	__be16 df = tiph->frag_off;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	const struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	__be32 dst = tiph->daddr;
	struct flowi4 fl4;
	int    mtu;

	if (skb->protocol != htons(ETH_P_IP))
		goto tx_error;

	if (tos & 1)
		tos = old_iph->tos;

	if (!dst) {
		/* NBMA tunnel */
		if ((rt = skb_rtable(skb)) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}
		dst = rt->rt_gateway;
	}

	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
				   dst, tiph->saddr,
				   0, 0,
				   IPPROTO_IPIP, RT_TOS(tos),
				   tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error_icmp;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df |= old_iph->frag_off & htons(IP_DF);

	if (df) {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);

		if (mtu < 68) {
			dev->stats.collisions++;
			ip_rt_put(rt);
			goto tx_error;
		}

		if (skb_dst(skb))
			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

		if ((old_iph->frag_off & htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));

	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb->transport_header = skb->network_header;
	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr)>>2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_IPIP;
	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	if ((iph->ttl = tiph->ttl) == 0)
		iph->ttl	=	old_iph->ttl;

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

static void ipip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	if (iph->daddr) {
		struct rtable *rt;
		struct flowi4 fl4;

		rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
					   iph->daddr, iph->saddr,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos),
					   tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
		dev->mtu = tdev->mtu - sizeof(struct iphdr);
	}
	dev->iflink = tunnel->parms.link;
}

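/*
 * These ioctls (SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/SIOCDELTUNNEL)
 * are the interface driven by userspace tools such as iproute2; an
 * illustrative invocation (addresses hypothetical):
 *
 *	ip tunnel add tunl1 mode ipip remote 192.0.2.1 local 192.0.2.2 ttl 64
 */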
static int
ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ipn->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipip_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
					err = -EINVAL;
					break;
				}
				t = netdev_priv(dev);
				ipip_tunnel_unlink(ipn, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipip_tunnel_link(ipn, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					ipip_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ipn->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t->dev == ipn->fb_tunnel_dev)
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

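/*
 * 68 is the minimum IPv4 MTU required by RFC 791; the upper bound leaves
 * room for the outer IPv4 header prepended on transmit.
 */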
static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

static const struct net_device_ops ipip_netdev_ops = {
	.ndo_uninit	= ipip_tunnel_uninit,
	.ndo_start_xmit	= ipip_tunnel_xmit,
	.ndo_do_ioctl	= ipip_tunnel_ioctl,
	.ndo_change_mtu	= ipip_tunnel_change_mtu,
	.ndo_get_stats64 = ipip_get_stats64,
};

static void ipip_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

static void ipip_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipip_netdev_ops;
	dev->destructor		= ipip_dev_free;

	dev->type		= ARPHRD_TUNNEL;
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr);
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->features		|= NETIF_F_LLTX;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int ipip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->dev = dev;

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	ipip_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_IPIP;
	iph->ihl		= 5;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	dev_hold(dev);
	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
	return 0;
}

static struct xfrm_tunnel ipip_handler __read_mostly = {
	.handler	=	ipip_rcv,
	.err_handler	=	ipip_err,
	.priority	=	1,
};

static const char banner[] __initconst =
	KERN_INFO "IPv4 over IPv4 tunneling driver\n";

static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
{
	int prio;

	for (prio = 1; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ipn->tunnels[prio][h]);
			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init ipip_init_net(struct net *net)
{
	struct ipip_net *ipn = net_generic(net, ipip_net_id);
	struct ip_tunnel *t;
	int err;

	ipn->tunnels[0] = ipn->tunnels_wc;
	ipn->tunnels[1] = ipn->tunnels_l;
	ipn->tunnels[2] = ipn->tunnels_r;
	ipn->tunnels[3] = ipn->tunnels_r_l;

	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
					   "tunl0",
					   ipip_tunnel_setup);
	if (!ipn->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ipn->fb_tunnel_dev, net);

	err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
	if (err)
		goto err_reg_dev;

	if ((err = register_netdev(ipn->fb_tunnel_dev)))
		goto err_reg_dev;

	t = netdev_priv(ipn->fb_tunnel_dev);

	strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
	return 0;

err_reg_dev:
	ipip_dev_free(ipn->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
	return err;
}

static void __net_exit ipip_exit_net(struct net *net)
{
	struct ipip_net *ipn = net_generic(net, ipip_net_id);
	LIST_HEAD(list);

	rtnl_lock();
	ipip_destroy_tunnels(ipn, &list);
	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations ipip_net_ops = {
	.init = ipip_init_net,
	.exit = ipip_exit_net,
	.id   = &ipip_net_id,
	.size = sizeof(struct ipip_net),
};

static int __init ipip_init(void)
{
	int err;

	printk(banner);

	err = register_pernet_device(&ipip_net_ops);
	if (err < 0)
		return err;
	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
	if (err < 0) {
		unregister_pernet_device(&ipip_net_ops);
		pr_info("%s: can't register tunnel\n", __func__);
	}
	return err;
}

static void __exit ipip_fini(void)
{
	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
		pr_info("%s: can't deregister tunnel\n", __func__);

	unregister_pernet_device(&ipip_net_ops);
}

module_init(ipip_init);
module_exit(ipip_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETDEV("tunl0");
929