xref: /openbmc/linux/net/ipv4/ipip.c (revision 643d1f7f)
1 /*
2  *	Linux NET3:	IP/IP protocol decoder.
3  *
4  *	Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
5  *
6  *	Authors:
7  *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
8  *
9  *	Fixes:
10  *		Alan Cox	:	Merged and made usable non modular (it's so tiny it's silly as
11  *					a module taking up 2 pages).
12  *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
13  *					to keep ip_forward happy.
14  *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
15  *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
16  *              David Woodhouse :       Perform some basic ICMP handling.
17  *                                      IPIP Routing without decapsulation.
18  *              Carlos Picoto   :       GRE over IP support
19  *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
20  *					I do not want to merge them together.
21  *
22  *	This program is free software; you can redistribute it and/or
23  *	modify it under the terms of the GNU General Public License
24  *	as published by the Free Software Foundation; either version
25  *	2 of the License, or (at your option) any later version.
26  *
27  */
28 
29 /* tunnel.c: an IP tunnel driver
30 
31 	The purpose of this driver is to provide an IP tunnel through
32 	which you can tunnel network traffic transparently across subnets.
33 
34 	This was written by looking at Nick Holloway's dummy driver
35 	Thanks for the great code!
36 
37 		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95
38 
39 	Minor tweaks:
40 		Cleaned up the code a little and added some pre-1.3.0 tweaks.
41 		dev->hard_header/hard_header_len changed to use no headers.
42 		Comments/bracketing tweaked.
43 		Made the tunnels use dev->name not tunnel: when error reporting.
44 		Added tx_dropped stat
45 
46 		-Alan Cox	(Alan.Cox@linux.org) 21 March 95
47 
48 	Reworked:
49 		Changed to tunnel to destination gateway in addition to the
50 			tunnel's pointopoint address
51 		Almost completely rewritten
52 		Note:  There is currently no firewall or ICMP handling done.
53 
54 		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96
55 
56 */
57 
58 /* Things I wish I had known when writing the tunnel driver:
59 
60 	When the tunnel_xmit() function is called, the skb contains the
61 	packet to be sent (plus a great deal of extra info), and dev
62 	contains the tunnel device that _we_ are.
63 
64 	When we are passed a packet, we are expected to fill in the
65 	source address with our source IP address.
66 
67 	What is the proper way to allocate, copy and free a buffer?
68 	After you allocate it, it is a "0 length" chunk of memory
69 	starting at zero.  If you want to add headers to the buffer
70 	later, you'll have to call "skb_reserve(skb, amount)" with
71 	the amount of memory you want reserved.  Then, you call
72 	"skb_put(skb, amount)" with the amount of space you want in
73 	the buffer.  skb_put() returns a pointer to the top (#0) of
74 	that buffer.  skb->len is set to the amount of space you have
75 	"allocated" with skb_put().  You can then write up to skb->len
76 	bytes to that buffer.  If you need more, you can call skb_put()
77 	again with the additional amount of space you need.  You can
78 	find out how much more space you can allocate by calling
79 	"skb_tailroom(skb)".
80 	Now, to add header space, call "skb_push(skb, header_len)".
81 	This creates space at the beginning of the buffer and returns
82 	a pointer to this new space.  If later you need to strip a
83 	header from a buffer, call "skb_pull(skb, header_len)".
84 	skb_headroom() will return how much space is left at the top
85 	of the buffer (before the main data).  Remember, this headroom
86 	space must be reserved before the skb_put() function is called.
87 	*/
88 
89 /*
90    This version of net/ipv4/ipip.c is a clone of net/ipv4/ip_gre.c
91 
92    For comments look at net/ipv4/ip_gre.c --ANK
93  */
94 
95 
96 #include <linux/capability.h>
97 #include <linux/module.h>
98 #include <linux/types.h>
99 #include <linux/kernel.h>
100 #include <asm/uaccess.h>
101 #include <linux/skbuff.h>
102 #include <linux/netdevice.h>
103 #include <linux/in.h>
104 #include <linux/tcp.h>
105 #include <linux/udp.h>
106 #include <linux/if_arp.h>
107 #include <linux/mroute.h>
108 #include <linux/init.h>
109 #include <linux/netfilter_ipv4.h>
110 #include <linux/if_ether.h>
111 
112 #include <net/sock.h>
113 #include <net/ip.h>
114 #include <net/icmp.h>
115 #include <net/ipip.h>
116 #include <net/inet_ecn.h>
117 #include <net/xfrm.h>
118 
/* Number of buckets in each hashed tunnel table. */
#define HASH_SIZE  16

/*
 * Fold an IPv4 address into a bucket index in [0, HASH_SIZE).
 *
 * The argument is fully parenthesized so that an arbitrary expression
 * (not only a plain identifier) is hashed as a whole.  Note that the
 * argument is evaluated twice, so it must not have side effects.
 */
#define HASH(addr) \
	((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
121 
122 static int ipip_fb_tunnel_init(struct net_device *dev);
123 static int ipip_tunnel_init(struct net_device *dev);
124 static void ipip_tunnel_setup(struct net_device *dev);
125 
126 static struct net_device *ipip_fb_tunnel_dev;
127 
128 static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
129 static struct ip_tunnel *tunnels_r[HASH_SIZE];
130 static struct ip_tunnel *tunnels_l[HASH_SIZE];
131 static struct ip_tunnel *tunnels_wc[1];
132 static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
133 
134 static DEFINE_RWLOCK(ipip_lock);
135 
136 static struct ip_tunnel * ipip_tunnel_lookup(__be32 remote, __be32 local)
137 {
138 	unsigned h0 = HASH(remote);
139 	unsigned h1 = HASH(local);
140 	struct ip_tunnel *t;
141 
142 	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
143 		if (local == t->parms.iph.saddr &&
144 		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
145 			return t;
146 	}
147 	for (t = tunnels_r[h0]; t; t = t->next) {
148 		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
149 			return t;
150 	}
151 	for (t = tunnels_l[h1]; t; t = t->next) {
152 		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
153 			return t;
154 	}
155 	if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
156 		return t;
157 	return NULL;
158 }
159 
160 static struct ip_tunnel **__ipip_bucket(struct ip_tunnel_parm *parms)
161 {
162 	__be32 remote = parms->iph.daddr;
163 	__be32 local = parms->iph.saddr;
164 	unsigned h = 0;
165 	int prio = 0;
166 
167 	if (remote) {
168 		prio |= 2;
169 		h ^= HASH(remote);
170 	}
171 	if (local) {
172 		prio |= 1;
173 		h ^= HASH(local);
174 	}
175 	return &tunnels[prio][h];
176 }
177 
178 static inline struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
179 {
180 	return __ipip_bucket(&t->parms);
181 }
182 
183 static void ipip_tunnel_unlink(struct ip_tunnel *t)
184 {
185 	struct ip_tunnel **tp;
186 
187 	for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
188 		if (t == *tp) {
189 			write_lock_bh(&ipip_lock);
190 			*tp = t->next;
191 			write_unlock_bh(&ipip_lock);
192 			break;
193 		}
194 	}
195 }
196 
197 static void ipip_tunnel_link(struct ip_tunnel *t)
198 {
199 	struct ip_tunnel **tp = ipip_bucket(t);
200 
201 	t->next = *tp;
202 	write_lock_bh(&ipip_lock);
203 	*tp = t;
204 	write_unlock_bh(&ipip_lock);
205 }
206 
207 static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
208 {
209 	__be32 remote = parms->iph.daddr;
210 	__be32 local = parms->iph.saddr;
211 	struct ip_tunnel *t, **tp, *nt;
212 	struct net_device *dev;
213 	char name[IFNAMSIZ];
214 
215 	for (tp = __ipip_bucket(parms); (t = *tp) != NULL; tp = &t->next) {
216 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
217 			return t;
218 	}
219 	if (!create)
220 		return NULL;
221 
222 	if (parms->name[0])
223 		strlcpy(name, parms->name, IFNAMSIZ);
224 	else {
225 		int i;
226 		for (i=1; i<100; i++) {
227 			sprintf(name, "tunl%d", i);
228 			if (__dev_get_by_name(&init_net, name) == NULL)
229 				break;
230 		}
231 		if (i==100)
232 			goto failed;
233 	}
234 
235 	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
236 	if (dev == NULL)
237 		return NULL;
238 
239 	nt = netdev_priv(dev);
240 	dev->init = ipip_tunnel_init;
241 	nt->parms = *parms;
242 
243 	if (register_netdevice(dev) < 0) {
244 		free_netdev(dev);
245 		goto failed;
246 	}
247 
248 	dev_hold(dev);
249 	ipip_tunnel_link(nt);
250 	return nt;
251 
252 failed:
253 	return NULL;
254 }
255 
256 static void ipip_tunnel_uninit(struct net_device *dev)
257 {
258 	if (dev == ipip_fb_tunnel_dev) {
259 		write_lock_bh(&ipip_lock);
260 		tunnels_wc[0] = NULL;
261 		write_unlock_bh(&ipip_lock);
262 	} else
263 		ipip_tunnel_unlink(netdev_priv(dev));
264 	dev_put(dev);
265 }
266 
267 static int ipip_err(struct sk_buff *skb, u32 info)
268 {
269 #ifndef I_WISH_WORLD_WERE_PERFECT
270 
271 /* It is not :-( All the routers (except for Linux) return only
272    8 bytes of packet payload. It means, that precise relaying of
273    ICMP in the real Internet is absolutely infeasible.
274  */
275 	struct iphdr *iph = (struct iphdr*)skb->data;
276 	const int type = icmp_hdr(skb)->type;
277 	const int code = icmp_hdr(skb)->code;
278 	struct ip_tunnel *t;
279 	int err;
280 
281 	switch (type) {
282 	default:
283 	case ICMP_PARAMETERPROB:
284 		return 0;
285 
286 	case ICMP_DEST_UNREACH:
287 		switch (code) {
288 		case ICMP_SR_FAILED:
289 		case ICMP_PORT_UNREACH:
290 			/* Impossible event. */
291 			return 0;
292 		case ICMP_FRAG_NEEDED:
293 			/* Soft state for pmtu is maintained by IP core. */
294 			return 0;
295 		default:
296 			/* All others are translated to HOST_UNREACH.
297 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
298 			   I believe they are just ether pollution. --ANK
299 			 */
300 			break;
301 		}
302 		break;
303 	case ICMP_TIME_EXCEEDED:
304 		if (code != ICMP_EXC_TTL)
305 			return 0;
306 		break;
307 	}
308 
309 	err = -ENOENT;
310 
311 	read_lock(&ipip_lock);
312 	t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
313 	if (t == NULL || t->parms.iph.daddr == 0)
314 		goto out;
315 
316 	err = 0;
317 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
318 		goto out;
319 
320 	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
321 		t->err_count++;
322 	else
323 		t->err_count = 1;
324 	t->err_time = jiffies;
325 out:
326 	read_unlock(&ipip_lock);
327 	return err;
328 #else
329 	struct iphdr *iph = (struct iphdr*)dp;
330 	int hlen = iph->ihl<<2;
331 	struct iphdr *eiph;
332 	const int type = icmp_hdr(skb)->type;
333 	const int code = icmp_hdr(skb)->code;
334 	int rel_type = 0;
335 	int rel_code = 0;
336 	__be32 rel_info = 0;
337 	__u32 n = 0;
338 	struct sk_buff *skb2;
339 	struct flowi fl;
340 	struct rtable *rt;
341 
342 	if (len < hlen + sizeof(struct iphdr))
343 		return 0;
344 	eiph = (struct iphdr*)(dp + hlen);
345 
346 	switch (type) {
347 	default:
348 		return 0;
349 	case ICMP_PARAMETERPROB:
350 		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
351 		if (n < hlen)
352 			return 0;
353 
354 		/* So... This guy found something strange INSIDE encapsulated
355 		   packet. Well, he is fool, but what can we do ?
356 		 */
357 		rel_type = ICMP_PARAMETERPROB;
358 		rel_info = htonl((n - hlen) << 24);
359 		break;
360 
361 	case ICMP_DEST_UNREACH:
362 		switch (code) {
363 		case ICMP_SR_FAILED:
364 		case ICMP_PORT_UNREACH:
365 			/* Impossible event. */
366 			return 0;
367 		case ICMP_FRAG_NEEDED:
368 			/* And it is the only really necessary thing :-) */
369 			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
370 			if (n < hlen+68)
371 				return 0;
372 			n -= hlen;
373 			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
374 			if (n > ntohs(eiph->tot_len))
375 				return 0;
376 			rel_info = htonl(n);
377 			break;
378 		default:
379 			/* All others are translated to HOST_UNREACH.
380 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
381 			   I believe, it is just ether pollution. --ANK
382 			 */
383 			rel_type = ICMP_DEST_UNREACH;
384 			rel_code = ICMP_HOST_UNREACH;
385 			break;
386 		}
387 		break;
388 	case ICMP_TIME_EXCEEDED:
389 		if (code != ICMP_EXC_TTL)
390 			return 0;
391 		break;
392 	}
393 
394 	/* Prepare fake skb to feed it to icmp_send */
395 	skb2 = skb_clone(skb, GFP_ATOMIC);
396 	if (skb2 == NULL)
397 		return 0;
398 	dst_release(skb2->dst);
399 	skb2->dst = NULL;
400 	skb_pull(skb2, skb->data - (u8*)eiph);
401 	skb_reset_network_header(skb2);
402 
403 	/* Try to guess incoming interface */
404 	memset(&fl, 0, sizeof(fl));
405 	fl.fl4_daddr = eiph->saddr;
406 	fl.fl4_tos = RT_TOS(eiph->tos);
407 	fl.proto = IPPROTO_IPIP;
408 	if (ip_route_output_key(&init_net, &rt, &key)) {
409 		kfree_skb(skb2);
410 		return 0;
411 	}
412 	skb2->dev = rt->u.dst.dev;
413 
414 	/* route "incoming" packet */
415 	if (rt->rt_flags&RTCF_LOCAL) {
416 		ip_rt_put(rt);
417 		rt = NULL;
418 		fl.fl4_daddr = eiph->daddr;
419 		fl.fl4_src = eiph->saddr;
420 		fl.fl4_tos = eiph->tos;
421 		if (ip_route_output_key(&init_net, &rt, &fl) ||
422 		    rt->u.dst.dev->type != ARPHRD_TUNNEL) {
423 			ip_rt_put(rt);
424 			kfree_skb(skb2);
425 			return 0;
426 		}
427 	} else {
428 		ip_rt_put(rt);
429 		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
430 		    skb2->dst->dev->type != ARPHRD_TUNNEL) {
431 			kfree_skb(skb2);
432 			return 0;
433 		}
434 	}
435 
436 	/* change mtu on this route */
437 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
438 		if (n > dst_mtu(skb2->dst)) {
439 			kfree_skb(skb2);
440 			return 0;
441 		}
442 		skb2->dst->ops->update_pmtu(skb2->dst, n);
443 	} else if (type == ICMP_TIME_EXCEEDED) {
444 		struct ip_tunnel *t = netdev_priv(skb2->dev);
445 		if (t->parms.iph.ttl) {
446 			rel_type = ICMP_DEST_UNREACH;
447 			rel_code = ICMP_HOST_UNREACH;
448 		}
449 	}
450 
451 	icmp_send(skb2, rel_type, rel_code, rel_info);
452 	kfree_skb(skb2);
453 	return 0;
454 #endif
455 }
456 
457 static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
458 					struct sk_buff *skb)
459 {
460 	struct iphdr *inner_iph = ip_hdr(skb);
461 
462 	if (INET_ECN_is_ce(outer_iph->tos))
463 		IP_ECN_set_ce(inner_iph);
464 }
465 
466 static int ipip_rcv(struct sk_buff *skb)
467 {
468 	struct ip_tunnel *tunnel;
469 	const struct iphdr *iph = ip_hdr(skb);
470 
471 	read_lock(&ipip_lock);
472 	if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
473 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
474 			read_unlock(&ipip_lock);
475 			kfree_skb(skb);
476 			return 0;
477 		}
478 
479 		secpath_reset(skb);
480 
481 		skb->mac_header = skb->network_header;
482 		skb_reset_network_header(skb);
483 		skb->protocol = htons(ETH_P_IP);
484 		skb->pkt_type = PACKET_HOST;
485 
486 		tunnel->stat.rx_packets++;
487 		tunnel->stat.rx_bytes += skb->len;
488 		skb->dev = tunnel->dev;
489 		dst_release(skb->dst);
490 		skb->dst = NULL;
491 		nf_reset(skb);
492 		ipip_ecn_decapsulate(iph, skb);
493 		netif_rx(skb);
494 		read_unlock(&ipip_lock);
495 		return 0;
496 	}
497 	read_unlock(&ipip_lock);
498 
499 	return -1;
500 }
501 
502 /*
503  *	This function assumes it is being called from dev_queue_xmit()
504  *	and that skb is filled properly by that function.
505  */
506 
507 static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
508 {
509 	struct ip_tunnel *tunnel = netdev_priv(dev);
510 	struct net_device_stats *stats = &tunnel->stat;
511 	struct iphdr  *tiph = &tunnel->parms.iph;
512 	u8     tos = tunnel->parms.iph.tos;
513 	__be16 df = tiph->frag_off;
514 	struct rtable *rt;     			/* Route to the other host */
515 	struct net_device *tdev;			/* Device to other host */
516 	struct iphdr  *old_iph = ip_hdr(skb);
517 	struct iphdr  *iph;			/* Our new IP header */
518 	unsigned int max_headroom;		/* The extra header space needed */
519 	__be32 dst = tiph->daddr;
520 	int    mtu;
521 
522 	if (tunnel->recursion++) {
523 		tunnel->stat.collisions++;
524 		goto tx_error;
525 	}
526 
527 	if (skb->protocol != htons(ETH_P_IP))
528 		goto tx_error;
529 
530 	if (tos&1)
531 		tos = old_iph->tos;
532 
533 	if (!dst) {
534 		/* NBMA tunnel */
535 		if ((rt = (struct rtable*)skb->dst) == NULL) {
536 			tunnel->stat.tx_fifo_errors++;
537 			goto tx_error;
538 		}
539 		if ((dst = rt->rt_gateway) == 0)
540 			goto tx_error_icmp;
541 	}
542 
543 	{
544 		struct flowi fl = { .oif = tunnel->parms.link,
545 				    .nl_u = { .ip4_u =
546 					      { .daddr = dst,
547 						.saddr = tiph->saddr,
548 						.tos = RT_TOS(tos) } },
549 				    .proto = IPPROTO_IPIP };
550 		if (ip_route_output_key(&init_net, &rt, &fl)) {
551 			tunnel->stat.tx_carrier_errors++;
552 			goto tx_error_icmp;
553 		}
554 	}
555 	tdev = rt->u.dst.dev;
556 
557 	if (tdev == dev) {
558 		ip_rt_put(rt);
559 		tunnel->stat.collisions++;
560 		goto tx_error;
561 	}
562 
563 	if (tiph->frag_off)
564 		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
565 	else
566 		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
567 
568 	if (mtu < 68) {
569 		tunnel->stat.collisions++;
570 		ip_rt_put(rt);
571 		goto tx_error;
572 	}
573 	if (skb->dst)
574 		skb->dst->ops->update_pmtu(skb->dst, mtu);
575 
576 	df |= (old_iph->frag_off&htons(IP_DF));
577 
578 	if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
579 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
580 		ip_rt_put(rt);
581 		goto tx_error;
582 	}
583 
584 	if (tunnel->err_count > 0) {
585 		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
586 			tunnel->err_count--;
587 			dst_link_failure(skb);
588 		} else
589 			tunnel->err_count = 0;
590 	}
591 
592 	/*
593 	 * Okay, now see if we can stuff it in the buffer as-is.
594 	 */
595 	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
596 
597 	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
598 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
599 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
600 		if (!new_skb) {
601 			ip_rt_put(rt);
602 			stats->tx_dropped++;
603 			dev_kfree_skb(skb);
604 			tunnel->recursion--;
605 			return 0;
606 		}
607 		if (skb->sk)
608 			skb_set_owner_w(new_skb, skb->sk);
609 		dev_kfree_skb(skb);
610 		skb = new_skb;
611 		old_iph = ip_hdr(skb);
612 	}
613 
614 	skb->transport_header = skb->network_header;
615 	skb_push(skb, sizeof(struct iphdr));
616 	skb_reset_network_header(skb);
617 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
618 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
619 			      IPSKB_REROUTED);
620 	dst_release(skb->dst);
621 	skb->dst = &rt->u.dst;
622 
623 	/*
624 	 *	Push down and install the IPIP header.
625 	 */
626 
627 	iph 			=	ip_hdr(skb);
628 	iph->version		=	4;
629 	iph->ihl		=	sizeof(struct iphdr)>>2;
630 	iph->frag_off		=	df;
631 	iph->protocol		=	IPPROTO_IPIP;
632 	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos);
633 	iph->daddr		=	rt->rt_dst;
634 	iph->saddr		=	rt->rt_src;
635 
636 	if ((iph->ttl = tiph->ttl) == 0)
637 		iph->ttl	=	old_iph->ttl;
638 
639 	nf_reset(skb);
640 
641 	IPTUNNEL_XMIT();
642 	tunnel->recursion--;
643 	return 0;
644 
645 tx_error_icmp:
646 	dst_link_failure(skb);
647 tx_error:
648 	stats->tx_errors++;
649 	dev_kfree_skb(skb);
650 	tunnel->recursion--;
651 	return 0;
652 }
653 
654 static void ipip_tunnel_bind_dev(struct net_device *dev)
655 {
656 	struct net_device *tdev = NULL;
657 	struct ip_tunnel *tunnel;
658 	struct iphdr *iph;
659 
660 	tunnel = netdev_priv(dev);
661 	iph = &tunnel->parms.iph;
662 
663 	if (iph->daddr) {
664 		struct flowi fl = { .oif = tunnel->parms.link,
665 				    .nl_u = { .ip4_u =
666 					      { .daddr = iph->daddr,
667 						.saddr = iph->saddr,
668 						.tos = RT_TOS(iph->tos) } },
669 				    .proto = IPPROTO_IPIP };
670 		struct rtable *rt;
671 		if (!ip_route_output_key(&init_net, &rt, &fl)) {
672 			tdev = rt->u.dst.dev;
673 			ip_rt_put(rt);
674 		}
675 		dev->flags |= IFF_POINTOPOINT;
676 	}
677 
678 	if (!tdev && tunnel->parms.link)
679 		tdev = __dev_get_by_index(&init_net, tunnel->parms.link);
680 
681 	if (tdev) {
682 		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
683 		dev->mtu = tdev->mtu - sizeof(struct iphdr);
684 	}
685 	dev->iflink = tunnel->parms.link;
686 }
687 
688 static int
689 ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
690 {
691 	int err = 0;
692 	struct ip_tunnel_parm p;
693 	struct ip_tunnel *t;
694 
695 	switch (cmd) {
696 	case SIOCGETTUNNEL:
697 		t = NULL;
698 		if (dev == ipip_fb_tunnel_dev) {
699 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
700 				err = -EFAULT;
701 				break;
702 			}
703 			t = ipip_tunnel_locate(&p, 0);
704 		}
705 		if (t == NULL)
706 			t = netdev_priv(dev);
707 		memcpy(&p, &t->parms, sizeof(p));
708 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
709 			err = -EFAULT;
710 		break;
711 
712 	case SIOCADDTUNNEL:
713 	case SIOCCHGTUNNEL:
714 		err = -EPERM;
715 		if (!capable(CAP_NET_ADMIN))
716 			goto done;
717 
718 		err = -EFAULT;
719 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
720 			goto done;
721 
722 		err = -EINVAL;
723 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
724 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
725 			goto done;
726 		if (p.iph.ttl)
727 			p.iph.frag_off |= htons(IP_DF);
728 
729 		t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
730 
731 		if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
732 			if (t != NULL) {
733 				if (t->dev != dev) {
734 					err = -EEXIST;
735 					break;
736 				}
737 			} else {
738 				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
739 				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
740 					err = -EINVAL;
741 					break;
742 				}
743 				t = netdev_priv(dev);
744 				ipip_tunnel_unlink(t);
745 				t->parms.iph.saddr = p.iph.saddr;
746 				t->parms.iph.daddr = p.iph.daddr;
747 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
748 				memcpy(dev->broadcast, &p.iph.daddr, 4);
749 				ipip_tunnel_link(t);
750 				netdev_state_change(dev);
751 			}
752 		}
753 
754 		if (t) {
755 			err = 0;
756 			if (cmd == SIOCCHGTUNNEL) {
757 				t->parms.iph.ttl = p.iph.ttl;
758 				t->parms.iph.tos = p.iph.tos;
759 				t->parms.iph.frag_off = p.iph.frag_off;
760 				if (t->parms.link != p.link) {
761 					t->parms.link = p.link;
762 					ipip_tunnel_bind_dev(dev);
763 					netdev_state_change(dev);
764 				}
765 			}
766 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
767 				err = -EFAULT;
768 		} else
769 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
770 		break;
771 
772 	case SIOCDELTUNNEL:
773 		err = -EPERM;
774 		if (!capable(CAP_NET_ADMIN))
775 			goto done;
776 
777 		if (dev == ipip_fb_tunnel_dev) {
778 			err = -EFAULT;
779 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
780 				goto done;
781 			err = -ENOENT;
782 			if ((t = ipip_tunnel_locate(&p, 0)) == NULL)
783 				goto done;
784 			err = -EPERM;
785 			if (t->dev == ipip_fb_tunnel_dev)
786 				goto done;
787 			dev = t->dev;
788 		}
789 		unregister_netdevice(dev);
790 		err = 0;
791 		break;
792 
793 	default:
794 		err = -EINVAL;
795 	}
796 
797 done:
798 	return err;
799 }
800 
801 static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
802 {
803 	return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
804 }
805 
806 static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
807 {
808 	if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
809 		return -EINVAL;
810 	dev->mtu = new_mtu;
811 	return 0;
812 }
813 
814 static void ipip_tunnel_setup(struct net_device *dev)
815 {
816 	dev->uninit		= ipip_tunnel_uninit;
817 	dev->hard_start_xmit	= ipip_tunnel_xmit;
818 	dev->get_stats		= ipip_tunnel_get_stats;
819 	dev->do_ioctl		= ipip_tunnel_ioctl;
820 	dev->change_mtu		= ipip_tunnel_change_mtu;
821 	dev->destructor		= free_netdev;
822 
823 	dev->type		= ARPHRD_TUNNEL;
824 	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
825 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr);
826 	dev->flags		= IFF_NOARP;
827 	dev->iflink		= 0;
828 	dev->addr_len		= 4;
829 }
830 
831 static int ipip_tunnel_init(struct net_device *dev)
832 {
833 	struct ip_tunnel *tunnel;
834 
835 	tunnel = netdev_priv(dev);
836 
837 	tunnel->dev = dev;
838 	strcpy(tunnel->parms.name, dev->name);
839 
840 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
841 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
842 
843 	ipip_tunnel_bind_dev(dev);
844 
845 	return 0;
846 }
847 
848 static int __init ipip_fb_tunnel_init(struct net_device *dev)
849 {
850 	struct ip_tunnel *tunnel = netdev_priv(dev);
851 	struct iphdr *iph = &tunnel->parms.iph;
852 
853 	tunnel->dev = dev;
854 	strcpy(tunnel->parms.name, dev->name);
855 
856 	iph->version		= 4;
857 	iph->protocol		= IPPROTO_IPIP;
858 	iph->ihl		= 5;
859 
860 	dev_hold(dev);
861 	tunnels_wc[0]		= tunnel;
862 	return 0;
863 }
864 
865 static struct xfrm_tunnel ipip_handler = {
866 	.handler	=	ipip_rcv,
867 	.err_handler	=	ipip_err,
868 	.priority	=	1,
869 };
870 
871 static char banner[] __initdata =
872 	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
873 
874 static int __init ipip_init(void)
875 {
876 	int err;
877 
878 	printk(banner);
879 
880 	if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) {
881 		printk(KERN_INFO "ipip init: can't register tunnel\n");
882 		return -EAGAIN;
883 	}
884 
885 	ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
886 					   "tunl0",
887 					   ipip_tunnel_setup);
888 	if (!ipip_fb_tunnel_dev) {
889 		err = -ENOMEM;
890 		goto err1;
891 	}
892 
893 	ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init;
894 
895 	if ((err = register_netdev(ipip_fb_tunnel_dev)))
896 		goto err2;
897  out:
898 	return err;
899  err2:
900 	free_netdev(ipip_fb_tunnel_dev);
901  err1:
902 	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
903 	goto out;
904 }
905 
906 static void __exit ipip_destroy_tunnels(void)
907 {
908 	int prio;
909 
910 	for (prio = 1; prio < 4; prio++) {
911 		int h;
912 		for (h = 0; h < HASH_SIZE; h++) {
913 			struct ip_tunnel *t;
914 			while ((t = tunnels[prio][h]) != NULL)
915 				unregister_netdevice(t->dev);
916 		}
917 	}
918 }
919 
920 static void __exit ipip_fini(void)
921 {
922 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
923 		printk(KERN_INFO "ipip close: can't deregister tunnel\n");
924 
925 	rtnl_lock();
926 	ipip_destroy_tunnels();
927 	unregister_netdevice(ipip_fb_tunnel_dev);
928 	rtnl_unlock();
929 }
930 
931 module_init(ipip_init);
932 module_exit(ipip_fini);
933 MODULE_LICENSE("GPL");
934