/*
 *	Linux NET3: IP/IP protocol decoder modified to support
 *		    virtual tunnel interface
 *
 *	Authors:
 *		Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 */

/*
   This version of net/ipv4/ip_vti.c was cloned from net/ipv4/ipip.c.

   For comments look at net/ipv4/ip_gre.c --ANK
 */

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/ipip.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

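/* Tunnels are hashed over HASH_SIZE (16) buckets: HASH() XORs the
 * address with itself shifted right by four bits and masks the result
 * to HASH_SIZE - 1.
 */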
#define HASH_SIZE  16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&(HASH_SIZE-1))

static struct rtnl_link_ops vti_link_ops __read_mostly;

static int vti_net_id __read_mostly;
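/* Per-namespace state.  Tunnels live on one of four hash chains,
 * chosen by which endpoints are configured: both remote and local,
 * remote only, local only, or neither (the single wildcard slot used
 * by the fallback device).  tunnels[] indexes those chains by the
 * same two-bit "prio" value that __vti_bucket() computes.
 */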
struct vti_net {
	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_wc[1];
	struct ip_tunnel __rcu **tunnels[4];

	struct net_device *fb_tunnel_dev;
};

static int vti_fb_tunnel_init(struct net_device *dev);
static int vti_tunnel_init(struct net_device *dev);
static void vti_tunnel_setup(struct net_device *dev);
static void vti_dev_free(struct net_device *dev);
static int vti_tunnel_bind_dev(struct net_device *dev);

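/* Transmit helper: hand the skb to dst_output() and account the
 * result.  On success the byte/packet counts go into the per-cpu
 * stats (stats1) under the u64_stats seqcount; on failure only the
 * error counters in the plain net_device stats (stats2) are bumped.
 */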
#define VTI_XMIT(stats1, stats2) do {				\
	int err;						\
	int pkt_len = skb->len;					\
	err = dst_output(skb);					\
	if (net_xmit_eval(err) == 0) {				\
		u64_stats_update_begin(&(stats1)->syncp);	\
		(stats1)->tx_bytes += pkt_len;			\
		(stats1)->tx_packets++;				\
		u64_stats_update_end(&(stats1)->syncp);		\
	} else {						\
		(stats2)->tx_errors++;				\
		(stats2)->tx_aborted_errors++;			\
	}							\
} while (0)

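/* Fold the per-cpu transmit/receive counters into one
 * rtnl_link_stats64.  Each CPU's counters are sampled under its
 * u64_stats seqcount and re-read if an update raced with us; the
 * error counters are taken straight from dev->stats.
 */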
static struct rtnl_link_stats64 *vti_get_stats64(struct net_device *dev,
						 struct rtnl_link_stats64 *tot)
{
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
		u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
		unsigned int start;

		do {
			start = u64_stats_fetch_begin_bh(&tstats->syncp);
			rx_packets = tstats->rx_packets;
			tx_packets = tstats->tx_packets;
			rx_bytes = tstats->rx_bytes;
			tx_bytes = tstats->tx_bytes;
		} while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

		tot->rx_packets += rx_packets;
		tot->tx_packets += tx_packets;
		tot->rx_bytes   += rx_bytes;
		tot->tx_bytes   += tx_bytes;
	}

	tot->multicast = dev->stats.multicast;
	tot->rx_crc_errors = dev->stats.rx_crc_errors;
	tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
	tot->rx_length_errors = dev->stats.rx_length_errors;
	tot->rx_errors = dev->stats.rx_errors;
	tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
	tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
	tot->tx_dropped = dev->stats.tx_dropped;
	tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
	tot->tx_errors = dev->stats.tx_errors;

	return tot;
}

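/* Look up a tunnel for a remote/local address pair.  Most-specific
 * match wins: fully keyed (remote and local), then remote only, then
 * local only, and finally the wildcard chain.  Called under RCU.
 */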
static struct ip_tunnel *vti_tunnel_lookup(struct net *net,
					   __be32 remote, __be32 local)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(local);
	struct ip_tunnel *t;
	struct vti_net *ipn = net_generic(net, vti_net_id);

	for_each_ip_tunnel_rcu(t, ipn->tunnels_r_l[h0 ^ h1])
		if (local == t->parms.iph.saddr &&
		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
			return t;

	for_each_ip_tunnel_rcu(t, ipn->tunnels_r[h0])
		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
			return t;

	for_each_ip_tunnel_rcu(t, ipn->tunnels_l[h1])
		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
			return t;

	for_each_ip_tunnel_rcu(t, ipn->tunnels_wc[0])
		if (t && (t->dev->flags&IFF_UP))
			return t;

	return NULL;
}

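/* Map tunnel parameters to their hash chain.  Bit 1 of prio is set
 * when a remote address is configured and bit 0 when a local address
 * is, which is exactly how vti_net.tunnels[] is laid out.
 */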
static struct ip_tunnel __rcu **__vti_bucket(struct vti_net *ipn,
					     struct ip_tunnel_parm *parms)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	unsigned h = 0;
	int prio = 0;

	if (remote) {
		prio |= 2;
		h ^= HASH(remote);
	}
	if (local) {
		prio |= 1;
		h ^= HASH(local);
	}
	return &ipn->tunnels[prio][h];
}

static inline struct ip_tunnel __rcu **vti_bucket(struct vti_net *ipn,
						  struct ip_tunnel *t)
{
	return __vti_bucket(ipn, &t->parms);
}

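/* Chain maintenance.  Writers are serialized by RTNL (hence the
 * rtnl_dereference()), while rcu_assign_pointer() publishes the
 * updated links safely to lockless readers.
 */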
static void vti_tunnel_unlink(struct vti_net *ipn, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = vti_bucket(ipn, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}

static void vti_tunnel_link(struct vti_net *ipn, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = vti_bucket(ipn, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}

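/* Find the tunnel matching @parms or, if @create is set, allocate,
 * register and link a new device for it (named "vti%d" unless the
 * caller supplied a name).
 */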
static struct ip_tunnel *vti_tunnel_locate(struct net *net,
					   struct ip_tunnel_parm *parms,
					   int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	struct ip_tunnel *t, *nt;
	struct ip_tunnel __rcu **tp;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct vti_net *ipn = net_generic(net, vti_net_id);

	for (tp = __vti_bucket(ipn, parms);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
			return t;
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		strcpy(name, "vti%d");

	dev = alloc_netdev(sizeof(*t), name, vti_tunnel_setup);
	if (dev == NULL)
		return NULL;

	dev_net_set(dev, net);

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &vti_link_ops;

	vti_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	dev_hold(dev);
	vti_tunnel_link(ipn, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}

static void vti_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct vti_net *ipn = net_generic(net, vti_net_id);

	vti_tunnel_unlink(ipn, netdev_priv(dev));
	dev_put(dev);
}

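/* ICMP error handler for the encapsulating header.  Interesting
 * unreachable/TTL errors update the tunnel's error bookkeeping, and a
 * FRAG_NEEDED error feeds the reported MTU back via ipv4_update_pmtu().
 */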
static int vti_err(struct sk_buff *skb, u32 info)
{
	/* All the routers (except for Linux) return only
	 * 8 bytes of packet payload.  This means that precise relaying of
	 * ICMP in the real Internet is absolutely infeasible.
	 */
	struct iphdr *iph = (struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	int err;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return 0;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return 0;
		default:
			/* All others are translated to HOST_UNREACH. */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return 0;
		break;
	}

	err = -ENOENT;

	t = vti_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
	if (t == NULL)
		goto out;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, IPPROTO_IPIP, 0);
		err = 0;
		goto out;
	}

	err = 0;
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	return err;
}

/* We don't digest the packet, so let the packet pass. */
static int vti_rcv(struct sk_buff *skb)
{
	struct ip_tunnel *tunnel;
	const struct iphdr *iph = ip_hdr(skb);

	tunnel = vti_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
	if (tunnel != NULL) {
		struct pcpu_tstats *tstats;

		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
			return -1;

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		u64_stats_update_begin(&tstats->syncp);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
		u64_stats_update_end(&tstats->syncp);

		skb->mark = 0;
		secpath_reset(skb);
		skb->dev = tunnel->dev;
		return 1;
	}

	return -1;
}

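/* Note that no encapsulation happens here: the route looked up below
 * is required to carry a tunnel-mode xfrm, which does the actual
 * IPsec processing in dst_output().  The tunnel's i_key is passed as
 * the flow mark, so a mark-based IPsec policy can select the right
 * state for this interface.
 */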
/* This function assumes it is being called from dev_queue_xmit()
 * and that skb is filled properly by that function.
 */
static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct pcpu_tstats *tstats;
	struct iphdr  *tiph = &tunnel->parms.iph;
	u8     tos;
	struct rtable *rt;		/* Route to the other host */
	struct net_device *tdev;	/* Device to other host */
	struct iphdr  *old_iph = ip_hdr(skb);
	__be32 dst = tiph->daddr;
	struct flowi4 fl4;

	if (skb->protocol != htons(ETH_P_IP))
		goto tx_error;

	tos = old_iph->tos;

	memset(&fl4, 0, sizeof(fl4));
	flowi4_init_output(&fl4, tunnel->parms.link,
			   be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos),
			   RT_SCOPE_UNIVERSE,
			   IPPROTO_IPIP, 0,
			   dst, tiph->saddr, 0, 0);
	rt = ip_route_output_key(dev_net(dev), &fl4);
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error_icmp;
	}
	/* If the route carries no transform, or the transform is not in
	 * tunnel mode, this tunnel is not functional.
	 */
	if (!rt->dst.xfrm ||
	    rt->dst.xfrm->props.mode != XFRM_MODE_TUNNEL) {
		dev->stats.tx_carrier_errors++;
		goto tx_error_icmp;
	}
	tdev = rt->dst.dev;

	if (tdev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);
	nf_reset(skb);
	skb->dev = skb_dst(skb)->dev;

	tstats = this_cpu_ptr(dev->tstats);
	VTI_XMIT(tstats, &dev->stats);
	return NETDEV_TX_OK;

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
	dev->stats.tx_errors++;
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}

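/* Resolve the underlying device for this tunnel (via a route lookup
 * to the remote endpoint, falling back to parms.link), inherit its
 * MTU and grow our hard_header_len by an outer IP header.  Returns
 * the MTU to use.
 */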
static int vti_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	if (iph->daddr) {
		struct rtable *rt;
		struct flowi4 fl4;

		memset(&fl4, 0, sizeof(fl4));
		flowi4_init_output(&fl4, tunnel->parms.link,
				   be32_to_cpu(tunnel->parms.i_key),
				   RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
				   IPPROTO_IPIP, 0,
				   iph->daddr, iph->saddr, 0, 0);
		rt = ip_route_output_key(dev_net(dev), &fl4);
		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}
		dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		dev->hard_header_len = tdev->hard_header_len +
				       sizeof(struct iphdr);
		dev->mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;
	return dev->mtu;
}

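/* Legacy ioctl interface (SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL), mirroring the other IPv4 tunnel drivers.  On the
 * fallback device the operations act on the tunnel described by the
 * user's ip_tunnel_parm rather than on the device itself.
 */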
static int
vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct vti_net *ipn = net_generic(net, vti_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ipn->fb_tunnel_dev) {
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
					   sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = vti_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		p.i_flags |= GRE_KEY | VTI_ISVTI;
		p.o_flags |= GRE_KEY;
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
		    p.iph.ihl != 5)
			goto done;

		t = vti_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ipn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				if (((dev->flags&IFF_POINTOPOINT) &&
				    !p.iph.daddr) ||
				    (!(dev->flags&IFF_POINTOPOINT) &&
				    p.iph.daddr)) {
					err = -EINVAL;
					break;
				}
				t = netdev_priv(dev);
				vti_tunnel_unlink(ipn, t);
				synchronize_net();
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				t->parms.iph.protocol = IPPROTO_IPIP;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				vti_tunnel_link(ipn, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					vti_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Report the resulting tunnel parameters, with the
			 * VTI flags set, the same way SIOCGETTUNNEL does.
			 */
			memcpy(&p, &t->parms, sizeof(p));
			p.i_flags |= GRE_KEY | VTI_ISVTI;
			p.o_flags |= GRE_KEY;
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p,
					 sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == ipn->fb_tunnel_dev) {
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data,
					   sizeof(p)))
				goto done;
			err = -ENOENT;

			t = vti_tunnel_locate(net, &p, 0);
			if (t == NULL)
				goto done;
			err = -EPERM;
			if (t->dev == ipn->fb_tunnel_dev)
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}

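/* 68 is the minimum MTU an IPv4 host must support (RFC 791); 0xFFF8
 * is the largest 8-byte-aligned value that still fits the 16-bit
 * IPv4 total-length field.
 */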
static int vti_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
	if (new_mtu < 68 || new_mtu > 0xFFF8)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}

static const struct net_device_ops vti_netdev_ops = {
	.ndo_init	= vti_tunnel_init,
	.ndo_uninit	= vti_tunnel_uninit,
	.ndo_start_xmit	= vti_tunnel_xmit,
	.ndo_do_ioctl	= vti_tunnel_ioctl,
	.ndo_change_mtu	= vti_tunnel_change_mtu,
	.ndo_get_stats64 = vti_get_stats64,
};

static void vti_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

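/* Tunnel devices neither ARP nor prepend a link-layer header of
 * their own.  NETIF_F_NETNS_LOCAL pins the device to its namespace,
 * NETIF_F_LLTX skips the device xmit lock, and clearing
 * IFF_XMIT_DST_RELEASE keeps the skb's dst available during xmit.
 */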
static void vti_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops		= &vti_netdev_ops;
	dev->destructor		= vti_dev_free;

	dev->type		= ARPHRD_TUNNEL;
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr);
	dev->mtu		= ETH_DATA_LEN;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;
	dev->features		|= NETIF_F_NETNS_LOCAL;
	dev->features		|= NETIF_F_LLTX;
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
}

static int vti_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}

static int __net_init vti_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct vti_net *ipn = net_generic(dev_net(dev), vti_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_IPIP;
	iph->ihl		= 5;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	dev_hold(dev);
	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
	return 0;
}

static struct xfrm_tunnel vti_handler __read_mostly = {
	.handler	=	vti_rcv,
	.err_handler	=	vti_err,
	.priority	=	1,
};

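/* Queue every tunnel device for unregistration on namespace teardown.
 * prio 0 (the wildcard chain) is skipped: it only ever holds the
 * fallback device, which is disposed of via its rtnl_link_ops instead.
 */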
static void vti_destroy_tunnels(struct vti_net *ipn, struct list_head *head)
{
	int prio;

	for (prio = 1; prio < 4; prio++) {
		int h;

		for (h = 0; h < HASH_SIZE; h++) {
			struct ip_tunnel *t;

			t = rtnl_dereference(ipn->tunnels[prio][h]);
			while (t != NULL) {
				unregister_netdevice_queue(t->dev, head);
				t = rtnl_dereference(t->next);
			}
		}
	}
}

static int __net_init vti_init_net(struct net *net)
{
	int err;
	struct vti_net *ipn = net_generic(net, vti_net_id);

	ipn->tunnels[0] = ipn->tunnels_wc;
	ipn->tunnels[1] = ipn->tunnels_l;
	ipn->tunnels[2] = ipn->tunnels_r;
	ipn->tunnels[3] = ipn->tunnels_r_l;

	ipn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
					  "ip_vti0",
					  vti_tunnel_setup);
	if (!ipn->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(ipn->fb_tunnel_dev, net);

	err = vti_fb_tunnel_init(ipn->fb_tunnel_dev);
	if (err)
		goto err_reg_dev;
	ipn->fb_tunnel_dev->rtnl_link_ops = &vti_link_ops;

	err = register_netdev(ipn->fb_tunnel_dev);
	if (err)
		goto err_reg_dev;
	return 0;

err_reg_dev:
	vti_dev_free(ipn->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
	return err;
}

static void __net_exit vti_exit_net(struct net *net)
{
	struct vti_net *ipn = net_generic(net, vti_net_id);
	LIST_HEAD(list);

	rtnl_lock();
	vti_destroy_tunnels(ipn, &list);
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

static struct pernet_operations vti_net_ops = {
	.init = vti_init_net,
	.exit = vti_exit_net,
	.id   = &vti_net_id,
	.size = sizeof(struct vti_net),
};

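/* Every IFLA_VTI_* attribute is optional and type-checked by the
 * netlink policy, so there is nothing further to validate here.
 */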
static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
{
	return 0;
}

static void vti_netlink_parms(struct nlattr *data[],
			      struct ip_tunnel_parm *parms)
{
	memset(parms, 0, sizeof(*parms));

	parms->iph.protocol = IPPROTO_IPIP;

	if (!data)
		return;

	if (data[IFLA_VTI_LINK])
		parms->link = nla_get_u32(data[IFLA_VTI_LINK]);

	if (data[IFLA_VTI_IKEY])
		parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);

	if (data[IFLA_VTI_OKEY])
		parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]);

	if (data[IFLA_VTI_LOCAL])
		parms->iph.saddr = nla_get_be32(data[IFLA_VTI_LOCAL]);

	if (data[IFLA_VTI_REMOTE])
		parms->iph.daddr = nla_get_be32(data[IFLA_VTI_REMOTE]);
}

static int vti_newlink(struct net *src_net, struct net_device *dev,
		       struct nlattr *tb[], struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct vti_net *ipn = net_generic(net, vti_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	vti_netlink_parms(data, &nt->parms);

	if (vti_tunnel_locate(net, &nt->parms, 0))
		return -EEXIST;

	mtu = vti_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;

	err = register_netdevice(dev);
	if (err)
		goto out;

	dev_hold(dev);
	vti_tunnel_link(ipn, nt);

out:
	return err;
}

static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
			  struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct vti_net *ipn = net_generic(net, vti_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ipn->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	vti_netlink_parms(data, &p);

	t = vti_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		vti_tunnel_unlink(ipn, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		t->parms.o_key = p.o_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		vti_tunnel_link(ipn, t);
		netdev_state_change(dev);
	}

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = vti_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}

static size_t vti_get_size(const struct net_device *dev)
{
	return
		/* IFLA_VTI_LINK */
		nla_total_size(4) +
		/* IFLA_VTI_IKEY */
		nla_total_size(4) +
		/* IFLA_VTI_OKEY */
		nla_total_size(4) +
		/* IFLA_VTI_LOCAL */
		nla_total_size(4) +
		/* IFLA_VTI_REMOTE */
		nla_total_size(4) +
		0;
}

static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	nla_put_u32(skb, IFLA_VTI_LINK, p->link);
	nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key);
	nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key);
	nla_put_be32(skb, IFLA_VTI_LOCAL, p->iph.saddr);
	nla_put_be32(skb, IFLA_VTI_REMOTE, p->iph.daddr);

	return 0;
}

static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
	[IFLA_VTI_LINK]		= { .type = NLA_U32 },
	[IFLA_VTI_IKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_OKEY]		= { .type = NLA_U32 },
	[IFLA_VTI_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_VTI_REMOTE]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
};

static struct rtnl_link_ops vti_link_ops __read_mostly = {
	.kind		= "vti",
	.maxtype	= IFLA_VTI_MAX,
	.policy		= vti_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= vti_tunnel_setup,
	.validate	= vti_tunnel_validate,
	.newlink	= vti_newlink,
	.changelink	= vti_changelink,
	.get_size	= vti_get_size,
	.fill_info	= vti_fill_info,
};

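/* Module init: the pernet state must exist before the xfrm input
 * handler can deliver packets, and the rtnl_link ops come last;
 * failures unwind in the reverse order.
 */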
static int __init vti_init(void)
{
	int err;

	pr_info("IPv4 over IPsec tunneling driver\n");

	err = register_pernet_device(&vti_net_ops);
	if (err < 0)
		return err;
	err = xfrm4_mode_tunnel_input_register(&vti_handler);
	if (err < 0) {
		unregister_pernet_device(&vti_net_ops);
		pr_info("vti init: can't register tunnel\n");
		return err;
	}

	err = rtnl_link_register(&vti_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	return err;

rtnl_link_failed:
	xfrm4_mode_tunnel_input_deregister(&vti_handler);
	unregister_pernet_device(&vti_net_ops);
	return err;
}

static void __exit vti_fini(void)
{
	rtnl_link_unregister(&vti_link_ops);
	if (xfrm4_mode_tunnel_input_deregister(&vti_handler))
		pr_info("vti close: can't deregister tunnel\n");

	unregister_pernet_device(&vti_net_ops);
}

module_init(vti_init);
module_exit(vti_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti");
MODULE_ALIAS_NETDEV("ip_vti0");