xref: /openbmc/linux/net/ipv4/ipip.c (revision f42b3800)
1 /*
2  *	Linux NET3:	IP/IP protocol decoder.
3  *
4  *	Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
5  *
6  *	Authors:
7  *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
8  *
9  *	Fixes:
10  *		Alan Cox	:	Merged and made usable non modular (its so tiny its silly as
11  *					a module taking up 2 pages).
12  *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
13  *					to keep ip_forward happy.
14  *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
15  *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
16  *              David Woodhouse :       Perform some basic ICMP handling.
17  *                                      IPIP Routing without decapsulation.
18  *              Carlos Picoto   :       GRE over IP support
19  *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
20  *					I do not want to merge them together.
21  *
22  *	This program is free software; you can redistribute it and/or
23  *	modify it under the terms of the GNU General Public License
24  *	as published by the Free Software Foundation; either version
25  *	2 of the License, or (at your option) any later version.
26  *
27  */
28 
29 /* tunnel.c: an IP tunnel driver
30 
31 	The purpose of this driver is to provide an IP tunnel through
32 	which you can tunnel network traffic transparently across subnets.
33 
34 	This was written by looking at Nick Holloway's dummy driver
35 	Thanks for the great code!
36 
37 		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95
38 
39 	Minor tweaks:
40 		Cleaned up the code a little and added some pre-1.3.0 tweaks.
41 		dev->hard_header/hard_header_len changed to use no headers.
42 		Comments/bracketing tweaked.
43 		Made the tunnels use dev->name not tunnel: when error reporting.
44 		Added tx_dropped stat
45 
46 		-Alan Cox	(Alan.Cox@linux.org) 21 March 95
47 
48 	Reworked:
49 		Changed to tunnel to destination gateway in addition to the
50 			tunnel's pointopoint address
51 		Almost completely rewritten
52 		Note:  There is currently no firewall or ICMP handling done.
53 
54 		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96
55 
56 */
57 
58 /* Things I wish I had known when writing the tunnel driver:
59 
60 	When the tunnel_xmit() function is called, the skb contains the
61 	packet to be sent (plus a great deal of extra info), and dev
62 	contains the tunnel device that _we_ are.
63 
64 	When we are passed a packet, we are expected to fill in the
65 	source address with our source IP address.
66 
67 	What is the proper way to allocate, copy and free a buffer?
68 	After you allocate it, it is a "0 length" chunk of memory
69 	starting at zero.  If you want to add headers to the buffer
70 	later, you'll have to call "skb_reserve(skb, amount)" with
71 	the amount of memory you want reserved.  Then, you call
72 	"skb_put(skb, amount)" with the amount of space you want in
73 	the buffer.  skb_put() returns a pointer to the top (#0) of
74 	that buffer.  skb->len is set to the amount of space you have
75 	"allocated" with skb_put().  You can then write up to skb->len
76 	bytes to that buffer.  If you need more, you can call skb_put()
77 	again with the additional amount of space you need.  You can
78 	find out how much more space you can allocate by calling
79 	"skb_tailroom(skb)".
80 	Now, to add header space, call "skb_push(skb, header_len)".
81 	This creates space at the beginning of the buffer and returns
82 	a pointer to this new space.  If later you need to strip a
83 	header from a buffer, call "skb_pull(skb, header_len)".
84 	skb_headroom() will return how much space is left at the top
85 	of the buffer (before the main data).  Remember, this headroom
86 	space must be reserved before the skb_put() function is called.
87 	*/
88 
89 /*
90    This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
91 
92    For comments look at net/ipv4/ip_gre.c --ANK
93  */
94 
95 
96 #include <linux/capability.h>
97 #include <linux/module.h>
98 #include <linux/types.h>
99 #include <linux/kernel.h>
100 #include <asm/uaccess.h>
101 #include <linux/skbuff.h>
102 #include <linux/netdevice.h>
103 #include <linux/in.h>
104 #include <linux/tcp.h>
105 #include <linux/udp.h>
106 #include <linux/if_arp.h>
107 #include <linux/mroute.h>
108 #include <linux/init.h>
109 #include <linux/netfilter_ipv4.h>
110 #include <linux/if_ether.h>
111 
112 #include <net/sock.h>
113 #include <net/ip.h>
114 #include <net/icmp.h>
115 #include <net/ipip.h>
116 #include <net/inet_ecn.h>
117 #include <net/xfrm.h>
118 #include <net/net_namespace.h>
119 #include <net/netns/generic.h>
120 
/* Number of buckets in each hashed tunnel table. */
#define HASH_SIZE  16
/*
 * Fold an IPv4 address into a bucket index in the range 0..15.
 * The argument is fully parenthesised so that an expression passed as
 * "addr" is hashed as a whole instead of being split up by operator
 * precedence inside the expansion.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
123 
124 static int ipip_net_id;
125 struct ipip_net {
126 	struct ip_tunnel *tunnels_r_l[HASH_SIZE];
127 	struct ip_tunnel *tunnels_r[HASH_SIZE];
128 	struct ip_tunnel *tunnels_l[HASH_SIZE];
129 	struct ip_tunnel *tunnels_wc[1];
130 	struct ip_tunnel **tunnels[4];
131 
132 	struct net_device *fb_tunnel_dev;
133 };
134 
/*
 * Guards the tunnel hash chains: chain updates take it for writing,
 * the receive and ICMP error paths take it for reading.
 */
static DEFINE_RWLOCK(ipip_lock);

/* Forward declarations for callbacks defined further down in this file. */
static void ipip_tunnel_setup(struct net_device *dev);
static int ipip_tunnel_init(struct net_device *dev);
static int ipip_fb_tunnel_init(struct net_device *dev);
140 
141 static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
142 		__be32 remote, __be32 local)
143 {
144 	unsigned h0 = HASH(remote);
145 	unsigned h1 = HASH(local);
146 	struct ip_tunnel *t;
147 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
148 
149 	for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) {
150 		if (local == t->parms.iph.saddr &&
151 		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
152 			return t;
153 	}
154 	for (t = ipn->tunnels_r[h0]; t; t = t->next) {
155 		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
156 			return t;
157 	}
158 	for (t = ipn->tunnels_l[h1]; t; t = t->next) {
159 		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
160 			return t;
161 	}
162 	if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
163 		return t;
164 	return NULL;
165 }
166 
167 static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
168 		struct ip_tunnel_parm *parms)
169 {
170 	__be32 remote = parms->iph.daddr;
171 	__be32 local = parms->iph.saddr;
172 	unsigned h = 0;
173 	int prio = 0;
174 
175 	if (remote) {
176 		prio |= 2;
177 		h ^= HASH(remote);
178 	}
179 	if (local) {
180 		prio |= 1;
181 		h ^= HASH(local);
182 	}
183 	return &ipn->tunnels[prio][h];
184 }
185 
186 static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
187 		struct ip_tunnel *t)
188 {
189 	return __ipip_bucket(ipn, &t->parms);
190 }
191 
192 static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
193 {
194 	struct ip_tunnel **tp;
195 
196 	for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
197 		if (t == *tp) {
198 			write_lock_bh(&ipip_lock);
199 			*tp = t->next;
200 			write_unlock_bh(&ipip_lock);
201 			break;
202 		}
203 	}
204 }
205 
206 static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
207 {
208 	struct ip_tunnel **tp = ipip_bucket(ipn, t);
209 
210 	t->next = *tp;
211 	write_lock_bh(&ipip_lock);
212 	*tp = t;
213 	write_unlock_bh(&ipip_lock);
214 }
215 
216 static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
217 		struct ip_tunnel_parm *parms, int create)
218 {
219 	__be32 remote = parms->iph.daddr;
220 	__be32 local = parms->iph.saddr;
221 	struct ip_tunnel *t, **tp, *nt;
222 	struct net_device *dev;
223 	char name[IFNAMSIZ];
224 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
225 
226 	for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) {
227 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
228 			return t;
229 	}
230 	if (!create)
231 		return NULL;
232 
233 	if (parms->name[0])
234 		strlcpy(name, parms->name, IFNAMSIZ);
235 	else
236 		sprintf(name, "tunl%%d");
237 
238 	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
239 	if (dev == NULL)
240 		return NULL;
241 
242 	dev_net_set(dev, net);
243 
244 	if (strchr(name, '%')) {
245 		if (dev_alloc_name(dev, name) < 0)
246 			goto failed_free;
247 	}
248 
249 	nt = netdev_priv(dev);
250 	dev->init = ipip_tunnel_init;
251 	nt->parms = *parms;
252 
253 	if (register_netdevice(dev) < 0)
254 		goto failed_free;
255 
256 	dev_hold(dev);
257 	ipip_tunnel_link(ipn, nt);
258 	return nt;
259 
260 failed_free:
261 	free_netdev(dev);
262 	return NULL;
263 }
264 
265 static void ipip_tunnel_uninit(struct net_device *dev)
266 {
267 	struct net *net = dev_net(dev);
268 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
269 
270 	if (dev == ipn->fb_tunnel_dev) {
271 		write_lock_bh(&ipip_lock);
272 		ipn->tunnels_wc[0] = NULL;
273 		write_unlock_bh(&ipip_lock);
274 	} else
275 		ipip_tunnel_unlink(ipn, netdev_priv(dev));
276 	dev_put(dev);
277 }
278 
279 static int ipip_err(struct sk_buff *skb, u32 info)
280 {
281 #ifndef I_WISH_WORLD_WERE_PERFECT
282 
283 /* It is not :-( All the routers (except for Linux) return only
284    8 bytes of packet payload. It means, that precise relaying of
285    ICMP in the real Internet is absolutely infeasible.
286  */
287 	struct iphdr *iph = (struct iphdr*)skb->data;
288 	const int type = icmp_hdr(skb)->type;
289 	const int code = icmp_hdr(skb)->code;
290 	struct ip_tunnel *t;
291 	int err;
292 
293 	switch (type) {
294 	default:
295 	case ICMP_PARAMETERPROB:
296 		return 0;
297 
298 	case ICMP_DEST_UNREACH:
299 		switch (code) {
300 		case ICMP_SR_FAILED:
301 		case ICMP_PORT_UNREACH:
302 			/* Impossible event. */
303 			return 0;
304 		case ICMP_FRAG_NEEDED:
305 			/* Soft state for pmtu is maintained by IP core. */
306 			return 0;
307 		default:
308 			/* All others are translated to HOST_UNREACH.
309 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
310 			   I believe they are just ether pollution. --ANK
311 			 */
312 			break;
313 		}
314 		break;
315 	case ICMP_TIME_EXCEEDED:
316 		if (code != ICMP_EXC_TTL)
317 			return 0;
318 		break;
319 	}
320 
321 	err = -ENOENT;
322 
323 	read_lock(&ipip_lock);
324 	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
325 	if (t == NULL || t->parms.iph.daddr == 0)
326 		goto out;
327 
328 	err = 0;
329 	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
330 		goto out;
331 
332 	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
333 		t->err_count++;
334 	else
335 		t->err_count = 1;
336 	t->err_time = jiffies;
337 out:
338 	read_unlock(&ipip_lock);
339 	return err;
340 #else
341 	struct iphdr *iph = (struct iphdr*)dp;
342 	int hlen = iph->ihl<<2;
343 	struct iphdr *eiph;
344 	const int type = icmp_hdr(skb)->type;
345 	const int code = icmp_hdr(skb)->code;
346 	int rel_type = 0;
347 	int rel_code = 0;
348 	__be32 rel_info = 0;
349 	__u32 n = 0;
350 	struct sk_buff *skb2;
351 	struct flowi fl;
352 	struct rtable *rt;
353 
354 	if (len < hlen + sizeof(struct iphdr))
355 		return 0;
356 	eiph = (struct iphdr*)(dp + hlen);
357 
358 	switch (type) {
359 	default:
360 		return 0;
361 	case ICMP_PARAMETERPROB:
362 		n = ntohl(icmp_hdr(skb)->un.gateway) >> 24;
363 		if (n < hlen)
364 			return 0;
365 
366 		/* So... This guy found something strange INSIDE encapsulated
367 		   packet. Well, he is fool, but what can we do ?
368 		 */
369 		rel_type = ICMP_PARAMETERPROB;
370 		rel_info = htonl((n - hlen) << 24);
371 		break;
372 
373 	case ICMP_DEST_UNREACH:
374 		switch (code) {
375 		case ICMP_SR_FAILED:
376 		case ICMP_PORT_UNREACH:
377 			/* Impossible event. */
378 			return 0;
379 		case ICMP_FRAG_NEEDED:
380 			/* And it is the only really necessary thing :-) */
381 			n = ntohs(icmp_hdr(skb)->un.frag.mtu);
382 			if (n < hlen+68)
383 				return 0;
384 			n -= hlen;
385 			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
386 			if (n > ntohs(eiph->tot_len))
387 				return 0;
388 			rel_info = htonl(n);
389 			break;
390 		default:
391 			/* All others are translated to HOST_UNREACH.
392 			   rfc2003 contains "deep thoughts" about NET_UNREACH,
393 			   I believe, it is just ether pollution. --ANK
394 			 */
395 			rel_type = ICMP_DEST_UNREACH;
396 			rel_code = ICMP_HOST_UNREACH;
397 			break;
398 		}
399 		break;
400 	case ICMP_TIME_EXCEEDED:
401 		if (code != ICMP_EXC_TTL)
402 			return 0;
403 		break;
404 	}
405 
406 	/* Prepare fake skb to feed it to icmp_send */
407 	skb2 = skb_clone(skb, GFP_ATOMIC);
408 	if (skb2 == NULL)
409 		return 0;
410 	dst_release(skb2->dst);
411 	skb2->dst = NULL;
412 	skb_pull(skb2, skb->data - (u8*)eiph);
413 	skb_reset_network_header(skb2);
414 
415 	/* Try to guess incoming interface */
416 	memset(&fl, 0, sizeof(fl));
417 	fl.fl4_daddr = eiph->saddr;
418 	fl.fl4_tos = RT_TOS(eiph->tos);
419 	fl.proto = IPPROTO_IPIP;
420 	if (ip_route_output_key(dev_net(skb->dev), &rt, &key)) {
421 		kfree_skb(skb2);
422 		return 0;
423 	}
424 	skb2->dev = rt->u.dst.dev;
425 
426 	/* route "incoming" packet */
427 	if (rt->rt_flags&RTCF_LOCAL) {
428 		ip_rt_put(rt);
429 		rt = NULL;
430 		fl.fl4_daddr = eiph->daddr;
431 		fl.fl4_src = eiph->saddr;
432 		fl.fl4_tos = eiph->tos;
433 		if (ip_route_output_key(dev_net(skb->dev), &rt, &fl) ||
434 		    rt->u.dst.dev->type != ARPHRD_TUNNEL) {
435 			ip_rt_put(rt);
436 			kfree_skb(skb2);
437 			return 0;
438 		}
439 	} else {
440 		ip_rt_put(rt);
441 		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
442 		    skb2->dst->dev->type != ARPHRD_TUNNEL) {
443 			kfree_skb(skb2);
444 			return 0;
445 		}
446 	}
447 
448 	/* change mtu on this route */
449 	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
450 		if (n > dst_mtu(skb2->dst)) {
451 			kfree_skb(skb2);
452 			return 0;
453 		}
454 		skb2->dst->ops->update_pmtu(skb2->dst, n);
455 	} else if (type == ICMP_TIME_EXCEEDED) {
456 		struct ip_tunnel *t = netdev_priv(skb2->dev);
457 		if (t->parms.iph.ttl) {
458 			rel_type = ICMP_DEST_UNREACH;
459 			rel_code = ICMP_HOST_UNREACH;
460 		}
461 	}
462 
463 	icmp_send(skb2, rel_type, rel_code, rel_info);
464 	kfree_skb(skb2);
465 	return 0;
466 #endif
467 }
468 
469 static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
470 					struct sk_buff *skb)
471 {
472 	struct iphdr *inner_iph = ip_hdr(skb);
473 
474 	if (INET_ECN_is_ce(outer_iph->tos))
475 		IP_ECN_set_ce(inner_iph);
476 }
477 
478 static int ipip_rcv(struct sk_buff *skb)
479 {
480 	struct ip_tunnel *tunnel;
481 	const struct iphdr *iph = ip_hdr(skb);
482 
483 	read_lock(&ipip_lock);
484 	if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
485 					iph->saddr, iph->daddr)) != NULL) {
486 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
487 			read_unlock(&ipip_lock);
488 			kfree_skb(skb);
489 			return 0;
490 		}
491 
492 		secpath_reset(skb);
493 
494 		skb->mac_header = skb->network_header;
495 		skb_reset_network_header(skb);
496 		skb->protocol = htons(ETH_P_IP);
497 		skb->pkt_type = PACKET_HOST;
498 
499 		tunnel->stat.rx_packets++;
500 		tunnel->stat.rx_bytes += skb->len;
501 		skb->dev = tunnel->dev;
502 		dst_release(skb->dst);
503 		skb->dst = NULL;
504 		nf_reset(skb);
505 		ipip_ecn_decapsulate(iph, skb);
506 		netif_rx(skb);
507 		read_unlock(&ipip_lock);
508 		return 0;
509 	}
510 	read_unlock(&ipip_lock);
511 
512 	return -1;
513 }
514 
515 /*
516  *	This function assumes it is being called from dev_queue_xmit()
517  *	and that skb is filled properly by that function.
518  */
519 
520 static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
521 {
522 	struct ip_tunnel *tunnel = netdev_priv(dev);
523 	struct net_device_stats *stats = &tunnel->stat;
524 	struct iphdr  *tiph = &tunnel->parms.iph;
525 	u8     tos = tunnel->parms.iph.tos;
526 	__be16 df = tiph->frag_off;
527 	struct rtable *rt;     			/* Route to the other host */
528 	struct net_device *tdev;			/* Device to other host */
529 	struct iphdr  *old_iph = ip_hdr(skb);
530 	struct iphdr  *iph;			/* Our new IP header */
531 	unsigned int max_headroom;		/* The extra header space needed */
532 	__be32 dst = tiph->daddr;
533 	int    mtu;
534 
535 	if (tunnel->recursion++) {
536 		tunnel->stat.collisions++;
537 		goto tx_error;
538 	}
539 
540 	if (skb->protocol != htons(ETH_P_IP))
541 		goto tx_error;
542 
543 	if (tos&1)
544 		tos = old_iph->tos;
545 
546 	if (!dst) {
547 		/* NBMA tunnel */
548 		if ((rt = skb->rtable) == NULL) {
549 			tunnel->stat.tx_fifo_errors++;
550 			goto tx_error;
551 		}
552 		if ((dst = rt->rt_gateway) == 0)
553 			goto tx_error_icmp;
554 	}
555 
556 	{
557 		struct flowi fl = { .oif = tunnel->parms.link,
558 				    .nl_u = { .ip4_u =
559 					      { .daddr = dst,
560 						.saddr = tiph->saddr,
561 						.tos = RT_TOS(tos) } },
562 				    .proto = IPPROTO_IPIP };
563 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
564 			tunnel->stat.tx_carrier_errors++;
565 			goto tx_error_icmp;
566 		}
567 	}
568 	tdev = rt->u.dst.dev;
569 
570 	if (tdev == dev) {
571 		ip_rt_put(rt);
572 		tunnel->stat.collisions++;
573 		goto tx_error;
574 	}
575 
576 	if (tiph->frag_off)
577 		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
578 	else
579 		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
580 
581 	if (mtu < 68) {
582 		tunnel->stat.collisions++;
583 		ip_rt_put(rt);
584 		goto tx_error;
585 	}
586 	if (skb->dst)
587 		skb->dst->ops->update_pmtu(skb->dst, mtu);
588 
589 	df |= (old_iph->frag_off&htons(IP_DF));
590 
591 	if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
592 		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
593 		ip_rt_put(rt);
594 		goto tx_error;
595 	}
596 
597 	if (tunnel->err_count > 0) {
598 		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
599 			tunnel->err_count--;
600 			dst_link_failure(skb);
601 		} else
602 			tunnel->err_count = 0;
603 	}
604 
605 	/*
606 	 * Okay, now see if we can stuff it in the buffer as-is.
607 	 */
608 	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
609 
610 	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
611 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
612 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
613 		if (!new_skb) {
614 			ip_rt_put(rt);
615 			stats->tx_dropped++;
616 			dev_kfree_skb(skb);
617 			tunnel->recursion--;
618 			return 0;
619 		}
620 		if (skb->sk)
621 			skb_set_owner_w(new_skb, skb->sk);
622 		dev_kfree_skb(skb);
623 		skb = new_skb;
624 		old_iph = ip_hdr(skb);
625 	}
626 
627 	skb->transport_header = skb->network_header;
628 	skb_push(skb, sizeof(struct iphdr));
629 	skb_reset_network_header(skb);
630 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
631 	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
632 			      IPSKB_REROUTED);
633 	dst_release(skb->dst);
634 	skb->dst = &rt->u.dst;
635 
636 	/*
637 	 *	Push down and install the IPIP header.
638 	 */
639 
640 	iph 			=	ip_hdr(skb);
641 	iph->version		=	4;
642 	iph->ihl		=	sizeof(struct iphdr)>>2;
643 	iph->frag_off		=	df;
644 	iph->protocol		=	IPPROTO_IPIP;
645 	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos);
646 	iph->daddr		=	rt->rt_dst;
647 	iph->saddr		=	rt->rt_src;
648 
649 	if ((iph->ttl = tiph->ttl) == 0)
650 		iph->ttl	=	old_iph->ttl;
651 
652 	nf_reset(skb);
653 
654 	IPTUNNEL_XMIT();
655 	tunnel->recursion--;
656 	return 0;
657 
658 tx_error_icmp:
659 	dst_link_failure(skb);
660 tx_error:
661 	stats->tx_errors++;
662 	dev_kfree_skb(skb);
663 	tunnel->recursion--;
664 	return 0;
665 }
666 
667 static void ipip_tunnel_bind_dev(struct net_device *dev)
668 {
669 	struct net_device *tdev = NULL;
670 	struct ip_tunnel *tunnel;
671 	struct iphdr *iph;
672 
673 	tunnel = netdev_priv(dev);
674 	iph = &tunnel->parms.iph;
675 
676 	if (iph->daddr) {
677 		struct flowi fl = { .oif = tunnel->parms.link,
678 				    .nl_u = { .ip4_u =
679 					      { .daddr = iph->daddr,
680 						.saddr = iph->saddr,
681 						.tos = RT_TOS(iph->tos) } },
682 				    .proto = IPPROTO_IPIP };
683 		struct rtable *rt;
684 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
685 			tdev = rt->u.dst.dev;
686 			ip_rt_put(rt);
687 		}
688 		dev->flags |= IFF_POINTOPOINT;
689 	}
690 
691 	if (!tdev && tunnel->parms.link)
692 		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
693 
694 	if (tdev) {
695 		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
696 		dev->mtu = tdev->mtu - sizeof(struct iphdr);
697 	}
698 	dev->iflink = tunnel->parms.link;
699 }
700 
701 static int
702 ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
703 {
704 	int err = 0;
705 	struct ip_tunnel_parm p;
706 	struct ip_tunnel *t;
707 	struct net *net = dev_net(dev);
708 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
709 
710 	switch (cmd) {
711 	case SIOCGETTUNNEL:
712 		t = NULL;
713 		if (dev == ipn->fb_tunnel_dev) {
714 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
715 				err = -EFAULT;
716 				break;
717 			}
718 			t = ipip_tunnel_locate(net, &p, 0);
719 		}
720 		if (t == NULL)
721 			t = netdev_priv(dev);
722 		memcpy(&p, &t->parms, sizeof(p));
723 		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
724 			err = -EFAULT;
725 		break;
726 
727 	case SIOCADDTUNNEL:
728 	case SIOCCHGTUNNEL:
729 		err = -EPERM;
730 		if (!capable(CAP_NET_ADMIN))
731 			goto done;
732 
733 		err = -EFAULT;
734 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
735 			goto done;
736 
737 		err = -EINVAL;
738 		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
739 		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
740 			goto done;
741 		if (p.iph.ttl)
742 			p.iph.frag_off |= htons(IP_DF);
743 
744 		t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
745 
746 		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
747 			if (t != NULL) {
748 				if (t->dev != dev) {
749 					err = -EEXIST;
750 					break;
751 				}
752 			} else {
753 				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
754 				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
755 					err = -EINVAL;
756 					break;
757 				}
758 				t = netdev_priv(dev);
759 				ipip_tunnel_unlink(ipn, t);
760 				t->parms.iph.saddr = p.iph.saddr;
761 				t->parms.iph.daddr = p.iph.daddr;
762 				memcpy(dev->dev_addr, &p.iph.saddr, 4);
763 				memcpy(dev->broadcast, &p.iph.daddr, 4);
764 				ipip_tunnel_link(ipn, t);
765 				netdev_state_change(dev);
766 			}
767 		}
768 
769 		if (t) {
770 			err = 0;
771 			if (cmd == SIOCCHGTUNNEL) {
772 				t->parms.iph.ttl = p.iph.ttl;
773 				t->parms.iph.tos = p.iph.tos;
774 				t->parms.iph.frag_off = p.iph.frag_off;
775 				if (t->parms.link != p.link) {
776 					t->parms.link = p.link;
777 					ipip_tunnel_bind_dev(dev);
778 					netdev_state_change(dev);
779 				}
780 			}
781 			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
782 				err = -EFAULT;
783 		} else
784 			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
785 		break;
786 
787 	case SIOCDELTUNNEL:
788 		err = -EPERM;
789 		if (!capable(CAP_NET_ADMIN))
790 			goto done;
791 
792 		if (dev == ipn->fb_tunnel_dev) {
793 			err = -EFAULT;
794 			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
795 				goto done;
796 			err = -ENOENT;
797 			if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
798 				goto done;
799 			err = -EPERM;
800 			if (t->dev == ipn->fb_tunnel_dev)
801 				goto done;
802 			dev = t->dev;
803 		}
804 		unregister_netdevice(dev);
805 		err = 0;
806 		break;
807 
808 	default:
809 		err = -EINVAL;
810 	}
811 
812 done:
813 	return err;
814 }
815 
816 static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
817 {
818 	return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
819 }
820 
821 static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
822 {
823 	if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
824 		return -EINVAL;
825 	dev->mtu = new_mtu;
826 	return 0;
827 }
828 
829 static void ipip_tunnel_setup(struct net_device *dev)
830 {
831 	dev->uninit		= ipip_tunnel_uninit;
832 	dev->hard_start_xmit	= ipip_tunnel_xmit;
833 	dev->get_stats		= ipip_tunnel_get_stats;
834 	dev->do_ioctl		= ipip_tunnel_ioctl;
835 	dev->change_mtu		= ipip_tunnel_change_mtu;
836 	dev->destructor		= free_netdev;
837 
838 	dev->type		= ARPHRD_TUNNEL;
839 	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
840 	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr);
841 	dev->flags		= IFF_NOARP;
842 	dev->iflink		= 0;
843 	dev->addr_len		= 4;
844 	dev->features		|= NETIF_F_NETNS_LOCAL;
845 }
846 
847 static int ipip_tunnel_init(struct net_device *dev)
848 {
849 	struct ip_tunnel *tunnel;
850 
851 	tunnel = netdev_priv(dev);
852 
853 	tunnel->dev = dev;
854 	strcpy(tunnel->parms.name, dev->name);
855 
856 	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
857 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
858 
859 	ipip_tunnel_bind_dev(dev);
860 
861 	return 0;
862 }
863 
864 static int ipip_fb_tunnel_init(struct net_device *dev)
865 {
866 	struct ip_tunnel *tunnel = netdev_priv(dev);
867 	struct iphdr *iph = &tunnel->parms.iph;
868 	struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);
869 
870 	tunnel->dev = dev;
871 	strcpy(tunnel->parms.name, dev->name);
872 
873 	iph->version		= 4;
874 	iph->protocol		= IPPROTO_IPIP;
875 	iph->ihl		= 5;
876 
877 	dev_hold(dev);
878 	ipn->tunnels_wc[0]	= tunnel;
879 	return 0;
880 }
881 
882 static struct xfrm_tunnel ipip_handler = {
883 	.handler	=	ipip_rcv,
884 	.err_handler	=	ipip_err,
885 	.priority	=	1,
886 };
887 
888 static char banner[] __initdata =
889 	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
890 
891 static void ipip_destroy_tunnels(struct ipip_net *ipn)
892 {
893 	int prio;
894 
895 	for (prio = 1; prio < 4; prio++) {
896 		int h;
897 		for (h = 0; h < HASH_SIZE; h++) {
898 			struct ip_tunnel *t;
899 			while ((t = ipn->tunnels[prio][h]) != NULL)
900 				unregister_netdevice(t->dev);
901 		}
902 	}
903 }
904 
905 static int ipip_init_net(struct net *net)
906 {
907 	int err;
908 	struct ipip_net *ipn;
909 
910 	err = -ENOMEM;
911 	ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
912 	if (ipn == NULL)
913 		goto err_alloc;
914 
915 	err = net_assign_generic(net, ipip_net_id, ipn);
916 	if (err < 0)
917 		goto err_assign;
918 
919 	ipn->tunnels[0] = ipn->tunnels_wc;
920 	ipn->tunnels[1] = ipn->tunnels_l;
921 	ipn->tunnels[2] = ipn->tunnels_r;
922 	ipn->tunnels[3] = ipn->tunnels_r_l;
923 
924 	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
925 					   "tunl0",
926 					   ipip_tunnel_setup);
927 	if (!ipn->fb_tunnel_dev) {
928 		err = -ENOMEM;
929 		goto err_alloc_dev;
930 	}
931 
932 	ipn->fb_tunnel_dev->init = ipip_fb_tunnel_init;
933 	dev_net_set(ipn->fb_tunnel_dev, net);
934 
935 	if ((err = register_netdev(ipn->fb_tunnel_dev)))
936 		goto err_reg_dev;
937 
938 	return 0;
939 
940 err_reg_dev:
941 	free_netdev(ipn->fb_tunnel_dev);
942 err_alloc_dev:
943 	/* nothing */
944 err_assign:
945 	kfree(ipn);
946 err_alloc:
947 	return err;
948 }
949 
950 static void ipip_exit_net(struct net *net)
951 {
952 	struct ipip_net *ipn;
953 
954 	ipn = net_generic(net, ipip_net_id);
955 	rtnl_lock();
956 	ipip_destroy_tunnels(ipn);
957 	unregister_netdevice(ipn->fb_tunnel_dev);
958 	rtnl_unlock();
959 	kfree(ipn);
960 }
961 
962 static struct pernet_operations ipip_net_ops = {
963 	.init = ipip_init_net,
964 	.exit = ipip_exit_net,
965 };
966 
967 static int __init ipip_init(void)
968 {
969 	int err;
970 
971 	printk(banner);
972 
973 	if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) {
974 		printk(KERN_INFO "ipip init: can't register tunnel\n");
975 		return -EAGAIN;
976 	}
977 
978 	err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
979 	if (err)
980 		xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
981 
982 	return err;
983 }
984 
985 static void __exit ipip_fini(void)
986 {
987 	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
988 		printk(KERN_INFO "ipip close: can't deregister tunnel\n");
989 
990 	unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops);
991 }
992 
993 module_init(ipip_init);
994 module_exit(ipip_fini);
995 MODULE_LICENSE("GPL");
996