xref: /openbmc/linux/net/ipv4/ipip.c (revision ce932d0c5589e9766e089c22c66890dfc48fbd94)
1 /*
2  *	Linux NET3:	IP/IP protocol decoder.
3  *
4  *	Authors:
5  *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
6  *
7  *	Fixes:
8  *		Alan Cox	:	Merged and made usable non modular (its so tiny its silly as
9  *					a module taking up 2 pages).
10  *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
11  *					to keep ip_forward happy.
12  *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
13  *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
14  *              David Woodhouse :       Perform some basic ICMP handling.
15  *                                      IPIP Routing without decapsulation.
16  *              Carlos Picoto   :       GRE over IP support
17  *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
18  *					I do not want to merge them together.
19  *
20  *	This program is free software; you can redistribute it and/or
21  *	modify it under the terms of the GNU General Public License
22  *	as published by the Free Software Foundation; either version
23  *	2 of the License, or (at your option) any later version.
24  *
25  */
26 
27 /* tunnel.c: an IP tunnel driver
28 
29 	The purpose of this driver is to provide an IP tunnel through
30 	which you can tunnel network traffic transparently across subnets.
31 
32 	This was written by looking at Nick Holloway's dummy driver
33 	Thanks for the great code!
34 
35 		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95
36 
37 	Minor tweaks:
38 		Cleaned up the code a little and added some pre-1.3.0 tweaks.
39 		dev->hard_header/hard_header_len changed to use no headers.
40 		Comments/bracketing tweaked.
41 		Made the tunnels use dev->name not tunnel: when error reporting.
42 		Added tx_dropped stat
43 
44 		-Alan Cox	(alan@lxorguk.ukuu.org.uk) 21 March 95
45 
46 	Reworked:
47 		Changed to tunnel to destination gateway in addition to the
48 			tunnel's pointopoint address
49 		Almost completely rewritten
50 		Note:  There is currently no firewall or ICMP handling done.
51 
52 		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96
53 
54 */
55 
56 /* Things I wish I had known when writing the tunnel driver:
57 
58 	When the tunnel_xmit() function is called, the skb contains the
59 	packet to be sent (plus a great deal of extra info), and dev
60 	contains the tunnel device that _we_ are.
61 
62 	When we are passed a packet, we are expected to fill in the
63 	source address with our source IP address.
64 
65 	What is the proper way to allocate, copy and free a buffer?
66 	After you allocate it, it is a "0 length" chunk of memory
67 	starting at zero.  If you want to add headers to the buffer
68 	later, you'll have to call "skb_reserve(skb, amount)" with
69 	the amount of memory you want reserved.  Then, you call
70 	"skb_put(skb, amount)" with the amount of space you want in
71 	the buffer.  skb_put() returns a pointer to the top (#0) of
72 	that buffer.  skb->len is set to the amount of space you have
73 	"allocated" with skb_put().  You can then write up to skb->len
74 	bytes to that buffer.  If you need more, you can call skb_put()
75 	again with the additional amount of space you need.  You can
76 	find out how much more space you can allocate by calling
77 	"skb_tailroom(skb)".
78 	Now, to add header space, call "skb_push(skb, header_len)".
79 	This creates space at the beginning of the buffer and returns
80 	a pointer to this new space.  If later you need to strip a
81 	header from a buffer, call "skb_pull(skb, header_len)".
82 	skb_headroom() will return how much space is left at the top
83 	of the buffer (before the main data).  Remember, this headroom
84 	space must be reserved before the skb_put() function is called.
85 	*/
86 
87 /*
   This version of net/ipv4/ipip.c is a clone of net/ipv4/ip_gre.c
89 
90    For comments look at net/ipv4/ip_gre.c --ANK
91  */
92 
93 
94 #include <linux/capability.h>
95 #include <linux/module.h>
96 #include <linux/types.h>
97 #include <linux/kernel.h>
98 #include <linux/slab.h>
99 #include <asm/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <linux/in.h>
103 #include <linux/tcp.h>
104 #include <linux/udp.h>
105 #include <linux/if_arp.h>
106 #include <linux/mroute.h>
107 #include <linux/init.h>
108 #include <linux/netfilter_ipv4.h>
109 #include <linux/if_ether.h>
110 
111 #include <net/sock.h>
112 #include <net/ip.h>
113 #include <net/icmp.h>
114 #include <net/ipip.h>
115 #include <net/inet_ecn.h>
116 #include <net/xfrm.h>
117 #include <net/net_namespace.h>
118 #include <net/netns/generic.h>
119 
/* 16-bucket hash tables; HASH() folds an IPv4 address down to 4 bits. */
#define HASH_SIZE  16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
122 
/* net_generic() key for per-namespace ipip state, assigned at pernet init. */
static int ipip_net_id __read_mostly;

/*
 * Per-network-namespace state: tunnel hash tables plus the fallback
 * device.  Tunnels are filed by which outer addresses they specify:
 * both remote and local, remote only, local only, or neither
 * (the single wildcard slot).  tunnels[] indexes those four tables by
 * a 2-bit "prio" (bit 1 = remote set, bit 0 = local set) so lookup
 * and bucket selection can be table-driven.
 */
struct ipip_net {
	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_wc[1];
	struct ip_tunnel __rcu **tunnels[4];

	struct net_device *fb_tunnel_dev;
};
133 
134 static int ipip_tunnel_init(struct net_device *dev);
135 static void ipip_tunnel_setup(struct net_device *dev);
136 static void ipip_dev_free(struct net_device *dev);
137 
138 /*
139  * Locking : hash tables are protected by RCU and RTNL
140  */
141 
/*
 * Walk one RCU-protected hash chain starting at @start.  NOTE: relies
 * on a variable named 't' (struct ip_tunnel *) declared by the caller;
 * must run under rcu_read_lock() or RTNL.
 */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
144 
/* often modified stats are per cpu, other are shared (netdev->stats) */
/*
 * Per-cpu tunnel byte/packet counters; updated locklessly on the
 * hot rx/tx paths and summed on demand by ipip_get_stats().
 * Alignment keeps each cpu's counters in the fewest cache lines.
 */
struct pcpu_tstats {
	unsigned long	rx_packets;
	unsigned long	rx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));
152 
153 static struct net_device_stats *ipip_get_stats(struct net_device *dev)
154 {
155 	struct pcpu_tstats sum = { 0 };
156 	int i;
157 
158 	for_each_possible_cpu(i) {
159 		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
160 
161 		sum.rx_packets += tstats->rx_packets;
162 		sum.rx_bytes   += tstats->rx_bytes;
163 		sum.tx_packets += tstats->tx_packets;
164 		sum.tx_bytes   += tstats->tx_bytes;
165 	}
166 	dev->stats.rx_packets = sum.rx_packets;
167 	dev->stats.rx_bytes   = sum.rx_bytes;
168 	dev->stats.tx_packets = sum.tx_packets;
169 	dev->stats.tx_bytes   = sum.tx_bytes;
170 	return &dev->stats;
171 }
172 
/*
 * Find the tunnel matching an outer (remote, local) address pair,
 * most-specific first: exact pair, then remote-only, then local-only,
 * and finally the wildcard/fallback slot.  Only devices that are
 * administratively up are returned.  Must be called under
 * rcu_read_lock(); may return NULL.
 */
static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
		__be32 remote, __be32 local)
{
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(local);
	struct ip_tunnel *t;
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	/* Fully-specified tunnels hash on both endpoints (h0 ^ h1). */
	for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
			return t;

	for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
			return t;

	for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
			return t;

	/* Last resort: the wildcard (usually fallback) tunnel. */
	t = rcu_dereference(ipn->tunnels_wc[0]);
	if (t && (t->dev->flags&IFF_UP))
		return t;
	return NULL;
}
199 
200 static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
201 		struct ip_tunnel_parm *parms)
202 {
203 	__be32 remote = parms->iph.daddr;
204 	__be32 local = parms->iph.saddr;
205 	unsigned int h = 0;
206 	int prio = 0;
207 
208 	if (remote) {
209 		prio |= 2;
210 		h ^= HASH(remote);
211 	}
212 	if (local) {
213 		prio |= 1;
214 		h ^= HASH(local);
215 	}
216 	return &ipn->tunnels[prio][h];
217 }
218 
219 static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
220 		struct ip_tunnel *t)
221 {
222 	return __ipip_bucket(ipn, &t->parms);
223 }
224 
/*
 * Remove @t from its hash chain.  Caller holds RTNL; concurrent RCU
 * readers may still be walking the chain, so the splice-out uses
 * rcu_assign_pointer() and @t must not be freed until a grace period
 * has elapsed.  Silently does nothing if @t is not on the chain.
 */
static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipip_bucket(ipn, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
239 
/*
 * Insert @t at the head of its hash chain.  Caller holds RTNL.
 * Ordering matters: t->next must be published before *tp so RCU
 * readers never see a half-linked entry.
 */
static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
247 
/*
 * Find a tunnel whose outer addresses exactly match @parms; if none
 * exists and @create is set, allocate, initialize, register and link
 * a new tunnel device.  Called under RTNL.  Returns NULL on a lookup
 * miss (!create) or on any allocation/registration failure.
 */
static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	struct ip_tunnel *t, *nt;
	struct ip_tunnel __rcu **tp;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	for (tp = __ipip_bucket(ipn, parms);
		 (t = rtnl_dereference(*tp)) != NULL;
		 tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
			return t;
	}
	if (!create)
		return NULL;

	/* Use the user-supplied name, or let the core pick tunl%d. */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "tunl%d");

	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
	if (dev == NULL)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;

	if (ipip_tunnel_init(dev) < 0)
		goto failed_free;

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Copy back the resolved name (register may expand "%d"). */
	strcpy(nt->parms.name, dev->name);

	/* Reference held while the tunnel is linked; dropped in uninit. */
	dev_hold(dev);
	ipip_tunnel_link(ipn, nt);
	return nt;

failed_free:
	ipip_dev_free(dev);
	return NULL;
}
298 
/* called with RTNL */
/*
 * ndo_uninit handler: unhash the tunnel (the fallback device lives in
 * the dedicated wildcard slot) and drop the reference taken when it
 * was linked/created.
 */
static void ipip_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	if (dev == ipn->fb_tunnel_dev)
		RCU_INIT_POINTER(ipn->tunnels_wc[0], NULL);
	else
		ipip_tunnel_unlink(ipn, netdev_priv(dev));
	dev_put(dev);
}
311 
/*
 * ICMP error handler for the IPIP protocol: an ICMP error arrived that
 * quotes one of our encapsulated packets.  Record soft error state on
 * the matching tunnel so the transmit path can throttle/report it.
 * Returns 0 if the error was consumed or ignored, -ENOENT if no
 * matching tunnel exists.
 */
static int ipip_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.
 */
	/* skb->data points at the quoted (inner/original outer) IP header. */
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	int err;

	/* Filter: only host-unreachable-class and TTL-exceeded errors
	 * are worth recording; everything else is ignored.
	 */
	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return 0;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return 0;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return 0;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return 0;
		break;
	}

	err = -ENOENT;

	/* The quoted header is what WE sent, so its daddr is the tunnel
	 * remote and its saddr the tunnel local address.
	 */
	rcu_read_lock();
	t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
	if (t == NULL || t->parms.iph.daddr == 0)
		goto out;

	err = 0;
	/* ttl==0 means "inherit"; TTL-exceeded is then expected noise. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Count bursts of errors within IPTUNNEL_ERR_TIMEO of each other. */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
	return err;
}
373 
374 static inline void ipip_ecn_decapsulate(const struct iphdr *outer_iph,
375 					struct sk_buff *skb)
376 {
377 	struct iphdr *inner_iph = ip_hdr(skb);
378 
379 	if (INET_ECN_is_ce(outer_iph->tos))
380 		IP_ECN_set_ce(inner_iph);
381 }
382 
/*
 * Receive handler for protocol IPPROTO_IPIP: strip the outer header,
 * retarget the skb at the matching tunnel device and feed the inner
 * packet back into the stack.  Returns 0 when consumed, -1 when no
 * tunnel matches (lets other handlers / ICMP generation take over).
 */
static int ipip_rcv(struct sk_buff *skb)
{
	struct ip_tunnel *tunnel;
	const struct iphdr *iph = ip_hdr(skb);

	rcu_read_lock();
	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
	if (tunnel != NULL) {
		struct pcpu_tstats *tstats;

		/* IPsec policy check on the outer packet; drop on failure. */
		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
			rcu_read_unlock();
			kfree_skb(skb);
			return 0;
		}

		secpath_reset(skb);

		/* The outer IP header becomes "link layer"; the inner
		 * header (current data pointer) is the new network header.
		 */
		skb->mac_header = skb->network_header;
		skb_reset_network_header(skb);
		skb->protocol = htons(ETH_P_IP);
		skb->pkt_type = PACKET_HOST;

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		/* Rescope the skb to the tunnel device (clears dst etc). */
		__skb_tunnel_rx(skb, tunnel->dev);

		ipip_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	rcu_read_unlock();

	return -1;
}
423 
/*
 *	This function assumes it is being called from dev_queue_xmit()
 *	and that skb is filled properly by that function.
 */

/*
 * ndo_start_xmit handler: wrap the IPv4 payload in an outer IPv4
 * header addressed to the tunnel endpoint and send it out via the
 * route to that endpoint.  Always returns NETDEV_TX_OK; failures are
 * accounted in dev->stats and the skb is freed.
 */
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	const struct iphdr  *tiph = &tunnel->parms.iph;
	u8     tos = tunnel->parms.iph.tos;
	__be16 df = tiph->frag_off;
	struct rtable *rt;     			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	const struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	__be32 dst = tiph->daddr;
	struct flowi4 fl4;
	int    mtu;

	/* IPIP can only carry IPv4. */
	if (skb->protocol != htons(ETH_P_IP))
		goto tx_error;

	/* Low bit of configured tos means "inherit from inner header". */
	if (tos & 1)
		tos = old_iph->tos;

	if (!dst) {
		/* NBMA tunnel */
		if ((rt = skb_rtable(skb)) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}
		dst = rt->rt_gateway;
	}

	rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
				   dst, tiph->saddr,
				   0, 0,
				   IPPROTO_IPIP, RT_TOS(tos),
				   tunnel->parms.link);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error_icmp;
	}
	tdev = rt->dst.dev;

	/* Routing back to ourselves would recurse forever. */
	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	df |= old_iph->frag_off & htons(IP_DF);

	/* With DF set, enforce path MTU: update the cached route's pmtu
	 * and bounce an ICMP FRAG_NEEDED if the inner packet won't fit.
	 */
	if (df) {
		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);

		if (mtu < 68) {
			dev->stats.collisions++;
			ip_rt_put(rt);
			goto tx_error;
		}

		if (skb_dst(skb))
			skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);

		if ((old_iph->frag_off & htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}

	/* Replay link failures recorded by ipip_err() within the
	 * throttling window, one per queued error.
	 */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));

	/* Reallocate if headroom is short or the skb is shared/cloned. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			dev->stats.tx_dropped++;
			dev_kfree_skb(skb);
			return NETDEV_TX_OK;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	/* Prepend the outer header and reset skb control state. */
	skb->transport_header = skb->network_header;
	skb_push(skb, sizeof(struct iphdr));
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr)>>2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_IPIP;
	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos);
	iph->daddr		=	fl4.daddr;
	iph->saddr		=	fl4.saddr;

	/* Configured ttl of 0 means inherit the inner packet's TTL. */
	if ((iph->ttl = tiph->ttl) == 0)
		iph->ttl	=	old_iph->ttl;

	nf_reset(skb);
	tstats = this_cpu_ptr(dev->tstats);
	/* Finish the header (tot_len, id, checksum) and transmit. */
	__IPTUNNEL_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
567 
/*
 * Derive link-layer parameters from the underlying device: route to
 * the tunnel destination (or fall back to the configured link index)
 * to find the lower device, then size hard_header_len and mtu to
 * leave room for the extra outer IPv4 header.
 */
static void ipip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	const struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	if (iph->daddr) {
		struct rtable *rt;
		struct flowi4 fl4;

		rt = ip_route_output_ports(dev_net(dev), &fl4, NULL,
					   iph->daddr, iph->saddr,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos),
					   tunnel->parms.link);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		/* A fixed destination makes this a point-to-point link. */
		dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
		dev->mtu = tdev->mtu - sizeof(struct iphdr);
	}
	dev->iflink = tunnel->parms.link;
}
603 
/*
 * ndo_do_ioctl handler implementing the classic tunnel configuration
 * interface: SIOCGETTUNNEL (query), SIOCADD/SIOCCHGTUNNEL (create or
 * reconfigure, CAP_NET_ADMIN required) and SIOCDELTUNNEL (destroy).
 * The ip_tunnel_parm structure is exchanged via ifr->ifr_ifru.ifru_data.
 * Runs under RTNL.
 */
static int
ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipip_net *ipn = net_generic(net, ipip_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		/* On the fallback device the user may name any tunnel;
		 * otherwise report this device's own parameters.
		 */
		if (dev == ipn->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipip_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Sanity-check the requested outer header. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
			goto done;
		/* A fixed TTL implies DF (pmtu discovery must work). */
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		t = ipip_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Address change would collide with an
				 * existing tunnel.
				 */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* May not toggle point-to-point-ness. */
				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
					err = -EINVAL;
					break;
				}
				/* Re-key: unhash, wait out readers,
				 * update addresses, rehash.
				 */
				t = netdev_priv(dev);
				ipip_tunnel_unlink(ipn, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipip_tunnel_link(ipn, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					ipip_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the effective parameters to userspace. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		/* Via the fallback device, delete by name — but never the
		 * fallback device itself.
		 */
		if (dev == ipn->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipip_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t->dev == ipn->fb_tunnel_dev)
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
719 
720 static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
721 {
722 	if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
723 		return -EINVAL;
724 	dev->mtu = new_mtu;
725 	return 0;
726 }
727 
/* Net-device callbacks for ipip tunnel interfaces. */
static const struct net_device_ops ipip_netdev_ops = {
	.ndo_uninit	= ipip_tunnel_uninit,
	.ndo_start_xmit	= ipip_tunnel_xmit,
	.ndo_do_ioctl	= ipip_tunnel_ioctl,
	.ndo_change_mtu	= ipip_tunnel_change_mtu,
	.ndo_get_stats  = ipip_get_stats,
};
735 
/*
 * Device destructor: release the per-cpu stats before the netdev
 * itself (dev->tstats lives until free_netdev()).
 */
static void ipip_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
741 
/*
 * alloc_netdev() setup callback: common static configuration for
 * every ipip device.  NOARP virtual interface, 4-byte (IPv4) hardware
 * addresses, namespace-local, lockless transmit.
 */
static void ipip_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &ipip_netdev_ops;
	dev->destructor		= ipip_dev_free;

	dev->type		= ARPHRD_TUNNEL;
	/* Worst-case defaults; refined by ipip_tunnel_bind_dev(). */
	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr);
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->features		|= NETIF_F_LLTX;
	/* Keep the dst on tx skbs; the NBMA path reads skb_rtable(). */
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}
757 
/*
 * Per-device init for a regular (non-fallback) tunnel: seed the
 * device addresses from the configured endpoints, bind to the lower
 * device and allocate per-cpu stats.  Returns 0 or -ENOMEM.
 */
static int ipip_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->dev = dev;

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	ipip_tunnel_bind_dev(dev);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
775 
/*
 * Init for the per-namespace fallback device ("tunl0"): wildcard
 * addresses, registered in the dedicated wildcard hash slot.  Takes
 * a self-reference so the device outlives its hash entry.
 */
static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipip_net *ipn = net_generic(dev_net(dev), ipip_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_IPIP;
	iph->ihl		= 5;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	dev_hold(dev);
	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
	return 0;
}
797 
/*
 * xfrm tunnel registration for IPPROTO_IPIP.  Priority 1 lets
 * IPsec-owned tunnel handlers claim the packet first.
 */
static struct xfrm_tunnel ipip_handler __read_mostly = {
	.handler	=	ipip_rcv,
	.err_handler	=	ipip_err,
	.priority	=	1,
};
803 
/* Printed once from ipip_init() at module load. */
static const char banner[] __initconst =
	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
806 
/*
 * Queue every non-wildcard tunnel in the namespace for unregistration
 * (prio 0 — the fallback slot — is handled by the caller).  Runs
 * under RTNL; devices are actually destroyed later by
 * unregister_netdevice_many(head).
 */
static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
{
	int prio;

	for (prio = 1; prio < 4; prio++) {
		int h;
		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ipn->tunnels[prio][h]);
			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}
824 
/*
 * pernet init: wire up the prio->table mapping and create/register
 * the namespace's fallback "tunl0" device.  Returns 0 or a negative
 * errno; on failure the partially-built device is freed.
 */
static int __net_init ipip_init_net(struct net *net)
{
	struct ipip_net *ipn = net_generic(net, ipip_net_id);
	struct ip_tunnel *t;
	int err;

	/* Index order must match the prio bits in __ipip_bucket(). */
	ipn->tunnels[0] = ipn->tunnels_wc;
	ipn->tunnels[1] = ipn->tunnels_l;
	ipn->tunnels[2] = ipn->tunnels_r;
	ipn->tunnels[3] = ipn->tunnels_r_l;

	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
					   "tunl0",
					   ipip_tunnel_setup);
	if (!ipn->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ipn->fb_tunnel_dev, net);

	err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
	if (err)
		goto err_reg_dev;

	if ((err = register_netdev(ipn->fb_tunnel_dev)))
		goto err_reg_dev;

	t = netdev_priv(ipn->fb_tunnel_dev);

	strcpy(t->parms.name, ipn->fb_tunnel_dev->name);
	return 0;

err_reg_dev:
	ipip_dev_free(ipn->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
	return err;
}
863 
/*
 * pernet exit: tear down every tunnel in the namespace, including the
 * fallback device, in a single batched unregister under RTNL.
 */
static void __net_exit ipip_exit_net(struct net *net)
{
	struct ipip_net *ipn = net_generic(net, ipip_net_id);
	LIST_HEAD(list);

	rtnl_lock();
	ipip_destroy_tunnels(ipn, &list);
	unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}
875 
/* Per-namespace lifecycle hooks; .size allocates struct ipip_net. */
static struct pernet_operations ipip_net_ops = {
	.init = ipip_init_net,
	.exit = ipip_exit_net,
	.id   = &ipip_net_id,
	.size = sizeof(struct ipip_net),
};
882 
/*
 * Module entry point: register per-namespace state first, then the
 * IPPROTO_IPIP xfrm tunnel handler; unwind the pernet registration if
 * the handler cannot be installed (e.g. another IPIP consumer owns it).
 */
static int __init ipip_init(void)
{
	int err;

	printk(banner);

	err = register_pernet_device(&ipip_net_ops);
	if (err < 0)
		return err;
	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
	if (err < 0) {
		unregister_pernet_device(&ipip_net_ops);
		pr_info("%s: can't register tunnel\n", __func__);
	}
	return err;
}
899 
/*
 * Module exit: drop the protocol handler first so no new packets
 * arrive, then unregister pernet state (which destroys all tunnels).
 */
static void __exit ipip_fini(void)
{
	if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
		pr_info("%s: can't deregister tunnel\n", __func__);

	unregister_pernet_device(&ipip_net_ops);
}
907 
908 module_init(ipip_init);
909 module_exit(ipip_fini);
910 MODULE_LICENSE("GPL");
911 MODULE_ALIAS_NETDEV("tunl0");
912