xref: /openbmc/linux/drivers/net/tun.c (revision 7c0c3b1a8a175437991ccc898ed66ec5e4a96208)
1 /*
2  *  TUN - Universal TUN/TAP device driver.
3  *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
4  *
5  *  This program is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  *  GNU General Public License for more details.
14  *
15  *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
16  */
17 
18 /*
19  *  Changes:
20  *
21  *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
22  *    Add TUNSETLINK ioctl to set the link encapsulation
23  *
24  *  Mark Smith <markzzzsmith@yahoo.com.au>
25  *    Use eth_random_addr() for tap MAC address.
26  *
27  *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
28  *    Fixes in packet dropping, queue length setting and queue wakeup.
29  *    Increased default tx queue length.
30  *    Added ethtool API.
31  *    Minor cleanups
32  *
33  *  Daniel Podlejski <underley@underley.eu.org>
34  *    Modifications for 2.3.99-pre5 kernel.
35  */
36 
37 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
38 
39 #define DRV_NAME	"tun"
40 #define DRV_VERSION	"1.6"
41 #define DRV_DESCRIPTION	"Universal TUN/TAP device driver"
42 #define DRV_COPYRIGHT	"(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
43 
44 #include <linux/module.h>
45 #include <linux/errno.h>
46 #include <linux/kernel.h>
47 #include <linux/major.h>
48 #include <linux/slab.h>
49 #include <linux/poll.h>
50 #include <linux/fcntl.h>
51 #include <linux/init.h>
52 #include <linux/skbuff.h>
53 #include <linux/netdevice.h>
54 #include <linux/etherdevice.h>
55 #include <linux/miscdevice.h>
56 #include <linux/ethtool.h>
57 #include <linux/rtnetlink.h>
58 #include <linux/compat.h>
59 #include <linux/if.h>
60 #include <linux/if_arp.h>
61 #include <linux/if_ether.h>
62 #include <linux/if_tun.h>
63 #include <linux/crc32.h>
64 #include <linux/nsproxy.h>
65 #include <linux/virtio_net.h>
66 #include <linux/rcupdate.h>
67 #include <net/net_namespace.h>
68 #include <net/netns/generic.h>
69 #include <net/rtnetlink.h>
70 #include <net/sock.h>
71 
72 #include <asm/uaccess.h>
73 
74 /* Uncomment to enable debugging */
75 /* #define TUN_DEBUG 1 */
76 
77 #ifdef TUN_DEBUG
78 static int debug;
79 
80 #define tun_debug(level, tun, fmt, args...)			\
81 do {								\
82 	if (tun->debug)						\
83 		netdev_printk(level, tun->dev, fmt, ##args);	\
84 } while (0)
85 #define DBG1(level, fmt, args...)				\
86 do {								\
87 	if (debug == 2)						\
88 		printk(level fmt, ##args);			\
89 } while (0)
90 #else
91 #define tun_debug(level, tun, fmt, args...)			\
92 do {								\
93 	if (0)							\
94 		netdev_printk(level, tun->dev, fmt, ##args);	\
95 } while (0)
96 #define DBG1(level, fmt, args...)				\
97 do {								\
98 	if (0)							\
99 		printk(level fmt, ##args);			\
100 } while (0)
101 #endif
102 
103 #define GOODCOPY_LEN 128
104 
105 #define FLT_EXACT_COUNT 8
106 struct tap_filter {
107 	unsigned int    count;    /* Number of addrs. Zero means disabled */
108 	u32             mask[2];  /* Mask of the hashed addrs */
109 	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
110 };
111 
112 /* 1024 is probably a high enough limit: modern hypervisors seem to support on
113  * the order of 100-200 CPUs so this leaves us some breathing space if we want
114  * to match a queue per guest CPU.
115  */
116 #define MAX_TAP_QUEUES 1024
117 
118 #define TUN_FLOW_EXPIRE (3 * HZ)
119 
120 /* A tun_file connects an open character device to a tuntap netdevice. It
121  * also contains all socket related structures (except sock_fprog and tap_filter)
122  * to serve as one transmit queue for the tuntap device. The sock_fprog and
123  * tap_filter are kept in tun_struct since they are used for filtering on the
124  * netdevice, not on a specific queue (at least I didn't see the requirement for
125  * this).
126  *
127  * RCU usage:
128  * The tun_file and tun_struct are loosely coupled, the pointer from one to the
129  * other can only be read while rcu_read_lock or rtnl_lock is held.
130  */
131 struct tun_file {
132 	struct sock sk;
133 	struct socket socket;
134 	struct socket_wq wq;
135 	struct tun_struct __rcu *tun;
136 	struct net *net;
137 	struct fasync_struct *fasync;
138 	/* only used for fasync */
139 	unsigned int flags;
140 	u16 queue_index;
141 	struct list_head next;
142 	struct tun_struct *detached;
143 };
144 
145 struct tun_flow_entry {
146 	struct hlist_node hash_link;
147 	struct rcu_head rcu;
148 	struct tun_struct *tun;
149 
150 	u32 rxhash;
151 	int queue_index;
152 	unsigned long updated;
153 };
154 
155 #define TUN_NUM_FLOW_ENTRIES 1024
156 
157 /* Since the socket was moved to tun_file, to preserve the behavior of a
158  * persistent device, the socket filter, sndbuf and vnet header size are
159  * restored when a file is attached to a persistent device.
160  */
161 struct tun_struct {
162 	struct tun_file __rcu	*tfiles[MAX_TAP_QUEUES];
163 	unsigned int            numqueues;
164 	unsigned int 		flags;
165 	kuid_t			owner;
166 	kgid_t			group;
167 
168 	struct net_device	*dev;
169 	netdev_features_t	set_features;
170 #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
171 			  NETIF_F_TSO6|NETIF_F_UFO)
172 
173 	int			vnet_hdr_sz;
174 	int			sndbuf;
175 	struct tap_filter	txflt;
176 	struct sock_fprog	fprog;
177 	/* protected by rtnl lock */
178 	bool			filter_attached;
179 #ifdef TUN_DEBUG
180 	int debug;
181 #endif
182 	spinlock_t lock;
183 	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
184 	struct timer_list flow_gc_timer;
185 	unsigned long ageing_time;
186 	unsigned int numdisabled;
187 	struct list_head disabled;
188 };
189 
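/* Map a flow hash onto one of the TUN_NUM_FLOW_ENTRIES hash buckets. */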
190 static inline u32 tun_hashfn(u32 rxhash)
191 {
192 	return rxhash & 0x3ff;
193 }
194 
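/* Look up the flow entry matching rxhash in the given hash chain. */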
195 static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
196 {
197 	struct tun_flow_entry *e;
198 	struct hlist_node *n;
199 
200 	hlist_for_each_entry_rcu(e, n, head, hash_link) {
201 		if (e->rxhash == rxhash)
202 			return e;
203 	}
204 	return NULL;
205 }
206 
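/* Allocate a new flow entry and insert it into the hash chain; the caller
 * holds tun->lock.
 */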
207 static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
208 					      struct hlist_head *head,
209 					      u32 rxhash, u16 queue_index)
210 {
211 	struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
212 
213 	if (e) {
214 		tun_debug(KERN_INFO, tun, "create flow: hash %u index %u\n",
215 			  rxhash, queue_index);
216 		e->updated = jiffies;
217 		e->rxhash = rxhash;
218 		e->queue_index = queue_index;
219 		e->tun = tun;
220 		hlist_add_head_rcu(&e->hash_link, head);
221 	}
222 	return e;
223 }
224 
225 static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
226 {
227 	tun_debug(KERN_INFO, tun, "delete flow: hash %u index %u\n",
228 		  e->rxhash, e->queue_index);
229 	hlist_del_rcu(&e->hash_link);
230 	kfree_rcu(e, rcu);
231 }
232 
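/* Remove every entry from the flow table. */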
233 static void tun_flow_flush(struct tun_struct *tun)
234 {
235 	int i;
236 
237 	spin_lock_bh(&tun->lock);
238 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
239 		struct tun_flow_entry *e;
240 		struct hlist_node *h, *n;
241 
242 		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link)
243 			tun_flow_delete(tun, e);
244 	}
245 	spin_unlock_bh(&tun->lock);
246 }
247 
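/* Remove all flow entries that point at the given queue index. */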
248 static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
249 {
250 	int i;
251 
252 	spin_lock_bh(&tun->lock);
253 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
254 		struct tun_flow_entry *e;
255 		struct hlist_node *h, *n;
256 
257 		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
258 			if (e->queue_index == queue_index)
259 				tun_flow_delete(tun, e);
260 		}
261 	}
262 	spin_unlock_bh(&tun->lock);
263 }
264 
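/* Garbage-collection timer callback: expire flow entries that have not been
 * updated within the ageing time and re-arm the timer while entries remain.
 */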
265 static void tun_flow_cleanup(unsigned long data)
266 {
267 	struct tun_struct *tun = (struct tun_struct *)data;
268 	unsigned long delay = tun->ageing_time;
269 	unsigned long next_timer = jiffies + delay;
270 	unsigned long count = 0;
271 	int i;
272 
273 	tun_debug(KERN_INFO, tun, "tun_flow_cleanup\n");
274 
275 	spin_lock_bh(&tun->lock);
276 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
277 		struct tun_flow_entry *e;
278 		struct hlist_node *h, *n;
279 
280 		hlist_for_each_entry_safe(e, h, n, &tun->flows[i], hash_link) {
281 			unsigned long this_timer;
282 			count++;
283 			this_timer = e->updated + delay;
284 			if (time_before_eq(this_timer, jiffies))
285 				tun_flow_delete(tun, e);
286 			else if (time_before(this_timer, next_timer))
287 				next_timer = this_timer;
288 		}
289 	}
290 
291 	if (count)
292 		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
293 	spin_unlock_bh(&tun->lock);
294 }
295 
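/* Record or refresh the rxhash -> queue mapping for a received packet so
 * that tun_select_queue() can keep the flow on the same queue.
 */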
296 static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
297 			    u16 queue_index)
298 {
299 	struct hlist_head *head;
300 	struct tun_flow_entry *e;
301 	unsigned long delay = tun->ageing_time;
302 
303 	if (!rxhash)
304 		return;
305 	else
306 		head = &tun->flows[tun_hashfn(rxhash)];
307 
308 	rcu_read_lock();
309 
310 	if (tun->numqueues == 1)
311 		goto unlock;
312 
313 	e = tun_flow_find(head, rxhash);
314 	if (likely(e)) {
315 		/* TODO: keep queueing to old queue until it's empty? */
316 		e->queue_index = queue_index;
317 		e->updated = jiffies;
318 	} else {
319 		spin_lock_bh(&tun->lock);
320 		if (!tun_flow_find(head, rxhash))
321 			tun_flow_create(tun, head, rxhash, queue_index);
322 
323 		if (!timer_pending(&tun->flow_gc_timer))
324 			mod_timer(&tun->flow_gc_timer,
325 				  round_jiffies_up(jiffies + delay));
326 		spin_unlock_bh(&tun->lock);
327 	}
328 
329 unlock:
330 	rcu_read_unlock();
331 }
332 
333 /* We try to identify a flow through its rxhash first. The reason that
334  * we do not check the rxq no. is that some cards (e.g. the 82599) choose
335  * the rxq based on the txq where the last packet of the flow came. As
336  * the userspace application moves between processors, we may get a
337  * different rxq no. here. If we could not get the rxhash, then we would
338  * hope the rxq no. may help here.
339  */
340 static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
341 {
342 	struct tun_struct *tun = netdev_priv(dev);
343 	struct tun_flow_entry *e;
344 	u32 txq = 0;
345 	u32 numqueues = 0;
346 
347 	rcu_read_lock();
348 	numqueues = tun->numqueues;
349 
350 	txq = skb_get_rxhash(skb);
351 	if (txq) {
352 		e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
353 		if (e)
354 			txq = e->queue_index;
355 		else
356 			/* use multiply and shift instead of expensive divide */
357 			txq = ((u64)txq * numqueues) >> 32;
358 	} else if (likely(skb_rx_queue_recorded(skb))) {
359 		txq = skb_get_rx_queue(skb);
360 		while (unlikely(txq >= numqueues))
361 			txq -= numqueues;
362 	}
363 
364 	rcu_read_unlock();
365 	return txq;
366 }
367 
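/* Returns true if the caller matches neither the configured owner/group nor
 * has CAP_NET_ADMIN in the device's network namespace.
 */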
368 static inline bool tun_not_capable(struct tun_struct *tun)
369 {
370 	const struct cred *cred = current_cred();
371 	struct net *net = dev_net(tun->dev);
372 
373 	return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
374 		  (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
375 		!ns_capable(net->user_ns, CAP_NET_ADMIN);
376 }
377 
378 static void tun_set_real_num_queues(struct tun_struct *tun)
379 {
380 	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
381 	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
382 }
383 
384 static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
385 {
386 	tfile->detached = tun;
387 	list_add_tail(&tfile->next, &tun->disabled);
388 	++tun->numdisabled;
389 }
390 
391 static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
392 {
393 	struct tun_struct *tun = tfile->detached;
394 
395 	tfile->detached = NULL;
396 	list_del_init(&tfile->next);
397 	--tun->numdisabled;
398 	return tun;
399 }
400 
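/* Detach one queue from the device: the last active queue is moved into the
 * vacated slot, stale flow entries are flushed, and the file is either put on
 * the disabled list or released. Called with the rtnl lock held.
 */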
401 static void __tun_detach(struct tun_file *tfile, bool clean)
402 {
403 	struct tun_file *ntfile;
404 	struct tun_struct *tun;
405 	struct net_device *dev;
406 
407 	tun = rtnl_dereference(tfile->tun);
408 
409 	if (tun) {
410 		u16 index = tfile->queue_index;
411 		BUG_ON(index >= tun->numqueues);
412 		dev = tun->dev;
413 
414 		rcu_assign_pointer(tun->tfiles[index],
415 				   tun->tfiles[tun->numqueues - 1]);
416 		rcu_assign_pointer(tfile->tun, NULL);
417 		ntfile = rtnl_dereference(tun->tfiles[index]);
418 		ntfile->queue_index = index;
419 
420 		--tun->numqueues;
421 		if (clean)
422 			sock_put(&tfile->sk);
423 		else
424 			tun_disable_queue(tun, tfile);
425 
426 		synchronize_net();
427 		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
428 		/* Drop read queue */
429 		skb_queue_purge(&tfile->sk.sk_receive_queue);
430 		tun_set_real_num_queues(tun);
431 	} else if (tfile->detached && clean)
432 		tun = tun_enable_queue(tfile);
433 
434 	if (clean) {
435 		if (tun && tun->numqueues == 0 && tun->numdisabled == 0 &&
436 		    !(tun->flags & TUN_PERSIST))
437 			if (tun->dev->reg_state == NETREG_REGISTERED)
438 				unregister_netdevice(tun->dev);
439 
440 		BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
441 				 &tfile->socket.flags));
442 		sk_release_kernel(&tfile->sk);
443 	}
444 }
445 
446 static void tun_detach(struct tun_file *tfile, bool clean)
447 {
448 	rtnl_lock();
449 	__tun_detach(tfile, clean);
450 	rtnl_unlock();
451 }
452 
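/* Detach every queue, enabled and disabled, from the device; called from
 * ndo_uninit when the netdevice is being destroyed.
 */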
453 static void tun_detach_all(struct net_device *dev)
454 {
455 	struct tun_struct *tun = netdev_priv(dev);
456 	struct tun_file *tfile, *tmp;
457 	int i, n = tun->numqueues;
458 
459 	for (i = 0; i < n; i++) {
460 		tfile = rtnl_dereference(tun->tfiles[i]);
461 		BUG_ON(!tfile);
462 		wake_up_all(&tfile->wq.wait);
463 		rcu_assign_pointer(tfile->tun, NULL);
464 		--tun->numqueues;
465 	}
466 	BUG_ON(tun->numqueues != 0);
467 
468 	synchronize_net();
469 	for (i = 0; i < n; i++) {
470 		tfile = rtnl_dereference(tun->tfiles[i]);
471 		/* Drop read queue */
472 		skb_queue_purge(&tfile->sk.sk_receive_queue);
473 		sock_put(&tfile->sk);
474 	}
475 	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
476 		tun_enable_queue(tfile);
477 		skb_queue_purge(&tfile->sk.sk_receive_queue);
478 		sock_put(&tfile->sk);
479 	}
480 	BUG_ON(tun->numdisabled != 0);
481 }
482 
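/* Attach an open tun file to the device as a new transmit queue, re-attaching
 * the socket filter for persistent devices. Called with the rtnl lock held.
 */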
483 static int tun_attach(struct tun_struct *tun, struct file *file)
484 {
485 	struct tun_file *tfile = file->private_data;
486 	int err;
487 
488 	err = -EINVAL;
489 	if (rtnl_dereference(tfile->tun))
490 		goto out;
491 
492 	err = -EBUSY;
493 	if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
494 		goto out;
495 
496 	err = -E2BIG;
497 	if (!tfile->detached &&
498 	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
499 		goto out;
500 
501 	err = 0;
502 
503 	/* Re-attach the filter to the persistent device */
504 	if (tun->filter_attached == true) {
505 		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
506 		if (!err)
507 			goto out;
508 	}
509 	tfile->queue_index = tun->numqueues;
510 	rcu_assign_pointer(tfile->tun, tun);
511 	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
512 	tun->numqueues++;
513 
514 	if (tfile->detached)
515 		tun_enable_queue(tfile);
516 	else
517 		sock_hold(&tfile->sk);
518 
519 	tun_set_real_num_queues(tun);
520 
521 	/* device is allowed to go away first, so no need to hold extra
522 	 * refcnt.
523 	 */
524 
525 out:
526 	return err;
527 }
528 
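/* Take a reference on the device behind this file, if any; paired with
 * tun_put().
 */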
529 static struct tun_struct *__tun_get(struct tun_file *tfile)
530 {
531 	struct tun_struct *tun;
532 
533 	rcu_read_lock();
534 	tun = rcu_dereference(tfile->tun);
535 	if (tun)
536 		dev_hold(tun->dev);
537 	rcu_read_unlock();
538 
539 	return tun;
540 }
541 
542 static struct tun_struct *tun_get(struct file *file)
543 {
544 	return __tun_get(file->private_data);
545 }
546 
547 static void tun_put(struct tun_struct *tun)
548 {
549 	dev_put(tun->dev);
550 }
551 
552 /* TAP filtering */
553 static void addr_hash_set(u32 *mask, const u8 *addr)
554 {
555 	int n = ether_crc(ETH_ALEN, addr) >> 26;
556 	mask[n >> 5] |= (1 << (n & 31));
557 }
558 
559 static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
560 {
561 	int n = ether_crc(ETH_ALEN, addr) >> 26;
562 	return mask[n >> 5] & (1 << (n & 31));
563 }
564 
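/* TUNSETTXFILTER helper: copy the address list from userspace, use the first
 * FLT_EXACT_COUNT addresses as exact matches and hash any remaining multicast
 * addresses into the mask.
 */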
565 static int update_filter(struct tap_filter *filter, void __user *arg)
566 {
567 	struct { u8 u[ETH_ALEN]; } *addr;
568 	struct tun_filter uf;
569 	int err, alen, n, nexact;
570 
571 	if (copy_from_user(&uf, arg, sizeof(uf)))
572 		return -EFAULT;
573 
574 	if (!uf.count) {
575 		/* Disabled */
576 		filter->count = 0;
577 		return 0;
578 	}
579 
580 	alen = ETH_ALEN * uf.count;
581 	addr = kmalloc(alen, GFP_KERNEL);
582 	if (!addr)
583 		return -ENOMEM;
584 
585 	if (copy_from_user(addr, arg + sizeof(uf), alen)) {
586 		err = -EFAULT;
587 		goto done;
588 	}
589 
590 	/* The filter is updated without holding any locks, which is
591 	 * perfectly safe. We disable it first, and in the worst
592 	 * case we'll accept a few undesired packets. */
593 	filter->count = 0;
594 	wmb();
595 
596 	/* Use first set of addresses as an exact filter */
597 	for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
598 		memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
599 
600 	nexact = n;
601 
602 	/* Remaining multicast addresses are hashed,
603 	 * unicast will leave the filter disabled. */
604 	memset(filter->mask, 0, sizeof(filter->mask));
605 	for (; n < uf.count; n++) {
606 		if (!is_multicast_ether_addr(addr[n].u)) {
607 			err = 0; /* no filter */
608 			goto done;
609 		}
610 		addr_hash_set(filter->mask, addr[n].u);
611 	}
612 
613 	/* For ALLMULTI just set the mask to all ones.
614 	 * This overrides the mask populated above. */
615 	if ((uf.flags & TUN_FLT_ALLMULTI))
616 		memset(filter->mask, ~0, sizeof(filter->mask));
617 
618 	/* Now enable the filter */
619 	wmb();
620 	filter->count = nexact;
621 
622 	/* Return the number of exact filters */
623 	err = nexact;
624 
625 done:
626 	kfree(addr);
627 	return err;
628 }
629 
630 /* Returns: 0 - drop, !=0 - accept */
631 static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
632 {
633 	/* Cannot use eth_hdr(skb) here because the skb's mac header is not
634 	 * set up correctly at this point. */
635 	struct ethhdr *eh = (struct ethhdr *) skb->data;
636 	int i;
637 
638 	/* Exact match */
639 	for (i = 0; i < filter->count; i++)
640 		if (ether_addr_equal(eh->h_dest, filter->addr[i]))
641 			return 1;
642 
643 	/* Inexact match (multicast only) */
644 	if (is_multicast_ether_addr(eh->h_dest))
645 		return addr_hash_test(filter->mask, eh->h_dest);
646 
647 	return 0;
648 }
649 
650 /*
651  * Checks whether the packet is accepted or not.
652  * Returns: 0 - drop, !=0 - accept
653  */
654 static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
655 {
656 	if (!filter->count)
657 		return 1;
658 
659 	return run_filter(filter, skb);
660 }
661 
662 /* Network device part of the driver */
663 
664 static const struct ethtool_ops tun_ethtool_ops;
665 
666 /* Net device detach from fd. */
667 static void tun_net_uninit(struct net_device *dev)
668 {
669 	tun_detach_all(dev);
670 }
671 
672 /* Net device open. */
673 static int tun_net_open(struct net_device *dev)
674 {
675 	netif_tx_start_all_queues(dev);
676 	return 0;
677 }
678 
679 /* Net device close. */
680 static int tun_net_close(struct net_device *dev)
681 {
682 	netif_tx_stop_all_queues(dev);
683 	return 0;
684 }
685 
686 /* Net device start xmit */
687 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
688 {
689 	struct tun_struct *tun = netdev_priv(dev);
690 	int txq = skb->queue_mapping;
691 	struct tun_file *tfile;
692 
693 	rcu_read_lock();
694 	tfile = rcu_dereference(tun->tfiles[txq]);
695 
696 	/* Drop packet if interface is not attached */
697 	if (txq >= tun->numqueues)
698 		goto drop;
699 
700 	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
701 
702 	BUG_ON(!tfile);
703 
704 	/* Drop if the filter does not like it.
705 	 * This is a noop if the filter is disabled.
706 	 * The filter can be enabled only for TAP devices. */
707 	if (!check_filter(&tun->txflt, skb))
708 		goto drop;
709 
710 	if (tfile->socket.sk->sk_filter &&
711 	    sk_filter(tfile->socket.sk, skb))
712 		goto drop;
713 
714 	/* Limit the number of packets queued by dividing the txq length by the
715 	 * number of queues.
716 	 */
717 	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
718 			  >= dev->tx_queue_len / tun->numqueues)
719 		goto drop;
720 
721 	/* Orphan the skb - required as we might hang on to it
722 	 * for an indefinite time. */
723 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
724 		goto drop;
725 	skb_orphan(skb);
726 
727 	/* Enqueue packet */
728 	skb_queue_tail(&tfile->socket.sk->sk_receive_queue, skb);
729 
730 	/* Notify and wake up reader process */
731 	if (tfile->flags & TUN_FASYNC)
732 		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
733 	wake_up_interruptible_poll(&tfile->wq.wait, POLLIN |
734 				   POLLRDNORM | POLLRDBAND);
735 
736 	rcu_read_unlock();
737 	return NETDEV_TX_OK;
738 
739 drop:
740 	dev->stats.tx_dropped++;
741 	skb_tx_error(skb);
742 	kfree_skb(skb);
743 	rcu_read_unlock();
744 	return NETDEV_TX_OK;
745 }
746 
747 static void tun_net_mclist(struct net_device *dev)
748 {
749 	/*
750 	 * This callback is supposed to deal with the mc filter in the
751 	 * _rx_ path and has nothing to do with the _tx_ path.
752 	 * In the rx path we always accept everything userspace gives us.
753 	 */
754 }
755 
756 #define MIN_MTU 68
757 #define MAX_MTU 65535
758 
759 static int
760 tun_net_change_mtu(struct net_device *dev, int new_mtu)
761 {
762 	if (new_mtu < MIN_MTU || new_mtu + dev->hard_header_len > MAX_MTU)
763 		return -EINVAL;
764 	dev->mtu = new_mtu;
765 	return 0;
766 }
767 
768 static netdev_features_t tun_net_fix_features(struct net_device *dev,
769 	netdev_features_t features)
770 {
771 	struct tun_struct *tun = netdev_priv(dev);
772 
773 	return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
774 }
775 #ifdef CONFIG_NET_POLL_CONTROLLER
776 static void tun_poll_controller(struct net_device *dev)
777 {
778 	/*
779 	 * Tun only receives frames when:
780 	 * 1) the char device endpoint gets data from user space
781 	 * 2) the tun socket gets a sendmsg call from user space
782 	 * Since both of those are synchronous operations, we are guaranteed
783 	 * never to have pending data when we poll for it,
784 	 * so there's nothing to do here but return.
785 	 * We need this though so netpoll recognizes us as an interface that
786 	 * supports polling, which enables bridge devices in virt setups to
787 	 * still use netconsole.
788 	 */
789 	return;
790 }
791 #endif
792 static const struct net_device_ops tun_netdev_ops = {
793 	.ndo_uninit		= tun_net_uninit,
794 	.ndo_open		= tun_net_open,
795 	.ndo_stop		= tun_net_close,
796 	.ndo_start_xmit		= tun_net_xmit,
797 	.ndo_change_mtu		= tun_net_change_mtu,
798 	.ndo_fix_features	= tun_net_fix_features,
799 	.ndo_select_queue	= tun_select_queue,
800 #ifdef CONFIG_NET_POLL_CONTROLLER
801 	.ndo_poll_controller	= tun_poll_controller,
802 #endif
803 };
804 
805 static const struct net_device_ops tap_netdev_ops = {
806 	.ndo_uninit		= tun_net_uninit,
807 	.ndo_open		= tun_net_open,
808 	.ndo_stop		= tun_net_close,
809 	.ndo_start_xmit		= tun_net_xmit,
810 	.ndo_change_mtu		= tun_net_change_mtu,
811 	.ndo_fix_features	= tun_net_fix_features,
812 	.ndo_set_rx_mode	= tun_net_mclist,
813 	.ndo_set_mac_address	= eth_mac_addr,
814 	.ndo_validate_addr	= eth_validate_addr,
815 	.ndo_select_queue	= tun_select_queue,
816 #ifdef CONFIG_NET_POLL_CONTROLLER
817 	.ndo_poll_controller	= tun_poll_controller,
818 #endif
819 };
820 
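/* Initialize the flow table hash chains and arm the garbage-collection timer. */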
821 static int tun_flow_init(struct tun_struct *tun)
822 {
823 	int i;
824 
825 	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
826 		INIT_HLIST_HEAD(&tun->flows[i]);
827 
828 	tun->ageing_time = TUN_FLOW_EXPIRE;
829 	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
830 	mod_timer(&tun->flow_gc_timer,
831 		  round_jiffies_up(jiffies + tun->ageing_time));
832 
833 	return 0;
834 }
835 
836 static void tun_flow_uninit(struct tun_struct *tun)
837 {
838 	del_timer_sync(&tun->flow_gc_timer);
839 	tun_flow_flush(tun);
840 }
841 
842 /* Initialize net device. */
843 static void tun_net_init(struct net_device *dev)
844 {
845 	struct tun_struct *tun = netdev_priv(dev);
846 
847 	switch (tun->flags & TUN_TYPE_MASK) {
848 	case TUN_TUN_DEV:
849 		dev->netdev_ops = &tun_netdev_ops;
850 
851 		/* Point-to-Point TUN Device */
852 		dev->hard_header_len = 0;
853 		dev->addr_len = 0;
854 		dev->mtu = 1500;
855 
856 		/* Zero header length */
857 		dev->type = ARPHRD_NONE;
858 		dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
859 		dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
860 		break;
861 
862 	case TUN_TAP_DEV:
863 		dev->netdev_ops = &tap_netdev_ops;
864 		/* Ethernet TAP Device */
865 		ether_setup(dev);
866 		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
867 		dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
868 
869 		eth_hw_addr_random(dev);
870 
871 		dev->tx_queue_len = TUN_READQ_SIZE;  /* We prefer our own queue length */
872 		break;
873 	}
874 }
875 
876 /* Character device part */
877 
878 /* Poll */
879 static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
880 {
881 	struct tun_file *tfile = file->private_data;
882 	struct tun_struct *tun = __tun_get(tfile);
883 	struct sock *sk;
884 	unsigned int mask = 0;
885 
886 	if (!tun)
887 		return POLLERR;
888 
889 	sk = tfile->socket.sk;
890 
891 	tun_debug(KERN_INFO, tun, "tun_chr_poll\n");
892 
893 	poll_wait(file, &tfile->wq.wait, wait);
894 
895 	if (!skb_queue_empty(&sk->sk_receive_queue))
896 		mask |= POLLIN | POLLRDNORM;
897 
898 	if (sock_writeable(sk) ||
899 	    (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
900 	     sock_writeable(sk)))
901 		mask |= POLLOUT | POLLWRNORM;
902 
903 	if (tun->dev->reg_state != NETREG_REGISTERED)
904 		mask = POLLERR;
905 
906 	tun_put(tun);
907 	return mask;
908 }
909 
910 /* prepad is the amount to reserve at front.  len is length after that.
911  * linear is a hint as to how much to copy (usually headers). */
912 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
913 				     size_t prepad, size_t len,
914 				     size_t linear, int noblock)
915 {
916 	struct sock *sk = tfile->socket.sk;
917 	struct sk_buff *skb;
918 	int err;
919 
920 	/* Under a page?  Don't bother with paged skb. */
921 	if (prepad + len < PAGE_SIZE || !linear)
922 		linear = len;
923 
924 	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
925 				   &err);
926 	if (!skb)
927 		return ERR_PTR(err);
928 
929 	skb_reserve(skb, prepad);
930 	skb_put(skb, linear);
931 	skb->data_len = len - linear;
932 	skb->len += len - linear;
933 
934 	return skb;
935 }
936 
937 /* set skb frags from iovec, this can move to core network code for reuse */
938 static int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *from,
939 				  int offset, size_t count)
940 {
941 	int len = iov_length(from, count) - offset;
942 	int copy = skb_headlen(skb);
943 	int size, offset1 = 0;
944 	int i = 0;
945 
946 	/* Skip over from offset */
947 	while (count && (offset >= from->iov_len)) {
948 		offset -= from->iov_len;
949 		++from;
950 		--count;
951 	}
952 
953 	/* copy up to skb headlen */
954 	while (count && (copy > 0)) {
955 		size = min_t(unsigned int, copy, from->iov_len - offset);
956 		if (copy_from_user(skb->data + offset1, from->iov_base + offset,
957 				   size))
958 			return -EFAULT;
959 		if (copy > size) {
960 			++from;
961 			--count;
962 			offset = 0;
963 		} else
964 			offset += size;
965 		copy -= size;
966 		offset1 += size;
967 	}
968 
969 	if (len == offset1)
970 		return 0;
971 
972 	while (count--) {
973 		struct page *page[MAX_SKB_FRAGS];
974 		int num_pages;
975 		unsigned long base;
976 		unsigned long truesize;
977 
978 		len = from->iov_len - offset;
979 		if (!len) {
980 			offset = 0;
981 			++from;
982 			continue;
983 		}
984 		base = (unsigned long)from->iov_base + offset;
985 		size = ((base & ~PAGE_MASK) + len + ~PAGE_MASK) >> PAGE_SHIFT;
986 		if (i + size > MAX_SKB_FRAGS)
987 			return -EMSGSIZE;
988 		num_pages = get_user_pages_fast(base, size, 0, &page[i]);
989 		if (num_pages != size) {
990 			for (i = 0; i < num_pages; i++)
991 				put_page(page[i]);
992 			return -EFAULT;
993 		}
994 		truesize = size * PAGE_SIZE;
995 		skb->data_len += len;
996 		skb->len += len;
997 		skb->truesize += truesize;
998 		atomic_add(truesize, &skb->sk->sk_wmem_alloc);
999 		while (len) {
1000 			int off = base & ~PAGE_MASK;
1001 			int size = min_t(int, len, PAGE_SIZE - off);
1002 			__skb_fill_page_desc(skb, i, page[i], off, size);
1003 			skb_shinfo(skb)->nr_frags++;
1004 			/* increase sk_wmem_alloc */
1005 			base += size;
1006 			len -= size;
1007 			i++;
1008 		}
1009 		offset = 0;
1010 		++from;
1011 	}
1012 	return 0;
1013 }
1014 
1015 /* Get packet from user space buffer */
1016 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
1017 			    void *msg_control, const struct iovec *iv,
1018 			    size_t total_len, size_t count, int noblock)
1019 {
1020 	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
1021 	struct sk_buff *skb;
1022 	size_t len = total_len, align = NET_SKB_PAD;
1023 	struct virtio_net_hdr gso = { 0 };
1024 	int offset = 0;
1025 	int copylen;
1026 	bool zerocopy = false;
1027 	int err;
1028 	u32 rxhash;
1029 
1030 	if (!(tun->flags & TUN_NO_PI)) {
1031 		if ((len -= sizeof(pi)) > total_len)
1032 			return -EINVAL;
1033 
1034 		if (memcpy_fromiovecend((void *)&pi, iv, 0, sizeof(pi)))
1035 			return -EFAULT;
1036 		offset += sizeof(pi);
1037 	}
1038 
1039 	if (tun->flags & TUN_VNET_HDR) {
1040 		if ((len -= tun->vnet_hdr_sz) > total_len)
1041 			return -EINVAL;
1042 
1043 		if (memcpy_fromiovecend((void *)&gso, iv, offset, sizeof(gso)))
1044 			return -EFAULT;
1045 
1046 		if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1047 		    gso.csum_start + gso.csum_offset + 2 > gso.hdr_len)
1048 			gso.hdr_len = gso.csum_start + gso.csum_offset + 2;
1049 
1050 		if (gso.hdr_len > len)
1051 			return -EINVAL;
1052 		offset += tun->vnet_hdr_sz;
1053 	}
1054 
1055 	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
1056 		align += NET_IP_ALIGN;
1057 		if (unlikely(len < ETH_HLEN ||
1058 			     (gso.hdr_len && gso.hdr_len < ETH_HLEN)))
1059 			return -EINVAL;
1060 	}
1061 
1062 	if (msg_control)
1063 		zerocopy = true;
1064 
1065 	if (zerocopy) {
1066 		/* Userspace may produce vectors with count greater than
1067 		 * MAX_SKB_FRAGS, so we need to linearize parts of the skb
1068 		 * to let the rest of the data fit in the frags.
1069 		 */
1070 		if (count > MAX_SKB_FRAGS) {
1071 			copylen = iov_length(iv, count - MAX_SKB_FRAGS);
1072 			if (copylen < offset)
1073 				copylen = 0;
1074 			else
1075 				copylen -= offset;
1076 		} else
1077 				copylen = 0;
1078 		/* There are 256 bytes to be copied into the skb, so there is
1079 		 * enough room to expand the skb head in case it is needed.
1080 		 * The rest of the buffer is mapped from userspace.
1081 		 */
1082 		if (copylen < gso.hdr_len)
1083 			copylen = gso.hdr_len;
1084 		if (!copylen)
1085 			copylen = GOODCOPY_LEN;
1086 	} else
1087 		copylen = len;
1088 
1089 	skb = tun_alloc_skb(tfile, align, copylen, gso.hdr_len, noblock);
1090 	if (IS_ERR(skb)) {
1091 		if (PTR_ERR(skb) != -EAGAIN)
1092 			tun->dev->stats.rx_dropped++;
1093 		return PTR_ERR(skb);
1094 	}
1095 
1096 	if (zerocopy)
1097 		err = zerocopy_sg_from_iovec(skb, iv, offset, count);
1098 	else
1099 		err = skb_copy_datagram_from_iovec(skb, 0, iv, offset, len);
1100 
1101 	if (err) {
1102 		tun->dev->stats.rx_dropped++;
1103 		kfree_skb(skb);
1104 		return -EFAULT;
1105 	}
1106 
1107 	if (gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1108 		if (!skb_partial_csum_set(skb, gso.csum_start,
1109 					  gso.csum_offset)) {
1110 			tun->dev->stats.rx_frame_errors++;
1111 			kfree_skb(skb);
1112 			return -EINVAL;
1113 		}
1114 	}
1115 
1116 	switch (tun->flags & TUN_TYPE_MASK) {
1117 	case TUN_TUN_DEV:
1118 		if (tun->flags & TUN_NO_PI) {
1119 			switch (skb->data[0] & 0xf0) {
1120 			case 0x40:
1121 				pi.proto = htons(ETH_P_IP);
1122 				break;
1123 			case 0x60:
1124 				pi.proto = htons(ETH_P_IPV6);
1125 				break;
1126 			default:
1127 				tun->dev->stats.rx_dropped++;
1128 				kfree_skb(skb);
1129 				return -EINVAL;
1130 			}
1131 		}
1132 
1133 		skb_reset_mac_header(skb);
1134 		skb->protocol = pi.proto;
1135 		skb->dev = tun->dev;
1136 		break;
1137 	case TUN_TAP_DEV:
1138 		skb->protocol = eth_type_trans(skb, tun->dev);
1139 		break;
1140 	}
1141 
1142 	if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1143 		pr_debug("GSO!\n");
1144 		switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1145 		case VIRTIO_NET_HDR_GSO_TCPV4:
1146 			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
1147 			break;
1148 		case VIRTIO_NET_HDR_GSO_TCPV6:
1149 			skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
1150 			break;
1151 		case VIRTIO_NET_HDR_GSO_UDP:
1152 			skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1153 			break;
1154 		default:
1155 			tun->dev->stats.rx_frame_errors++;
1156 			kfree_skb(skb);
1157 			return -EINVAL;
1158 		}
1159 
1160 		if (gso.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1161 			skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
1162 
1163 		skb_shinfo(skb)->gso_size = gso.gso_size;
1164 		if (skb_shinfo(skb)->gso_size == 0) {
1165 			tun->dev->stats.rx_frame_errors++;
1166 			kfree_skb(skb);
1167 			return -EINVAL;
1168 		}
1169 
1170 		/* Header must be checked, and gso_segs computed. */
1171 		skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1172 		skb_shinfo(skb)->gso_segs = 0;
1173 	}
1174 
1175 	/* copy skb_ubuf_info for callback when skb has no error */
1176 	if (zerocopy) {
1177 		skb_shinfo(skb)->destructor_arg = msg_control;
1178 		skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
1179 	}
1180 
1181 	skb_reset_network_header(skb);
1182 	rxhash = skb_get_rxhash(skb);
1183 	netif_rx_ni(skb);
1184 
1185 	tun->dev->stats.rx_packets++;
1186 	tun->dev->stats.rx_bytes += len;
1187 
1188 	tun_flow_update(tun, rxhash, tfile->queue_index);
1189 	return total_len;
1190 }
1191 
1192 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
1193 			      unsigned long count, loff_t pos)
1194 {
1195 	struct file *file = iocb->ki_filp;
1196 	struct tun_struct *tun = tun_get(file);
1197 	struct tun_file *tfile = file->private_data;
1198 	ssize_t result;
1199 
1200 	if (!tun)
1201 		return -EBADFD;
1202 
1203 	tun_debug(KERN_INFO, tun, "tun_chr_write %ld\n", count);
1204 
1205 	result = tun_get_user(tun, tfile, NULL, iv, iov_length(iv, count),
1206 			      count, file->f_flags & O_NONBLOCK);
1207 
1208 	tun_put(tun);
1209 	return result;
1210 }
1211 
1212 /* Put packet to the user space buffer */
1213 static ssize_t tun_put_user(struct tun_struct *tun,
1214 			    struct tun_file *tfile,
1215 			    struct sk_buff *skb,
1216 			    const struct iovec *iv, int len)
1217 {
1218 	struct tun_pi pi = { 0, skb->protocol };
1219 	ssize_t total = 0;
1220 
1221 	if (!(tun->flags & TUN_NO_PI)) {
1222 		if ((len -= sizeof(pi)) < 0)
1223 			return -EINVAL;
1224 
1225 		if (len < skb->len) {
1226 			/* Packet will be stripped */
1227 			pi.flags |= TUN_PKT_STRIP;
1228 		}
1229 
1230 		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
1231 			return -EFAULT;
1232 		total += sizeof(pi);
1233 	}
1234 
1235 	if (tun->flags & TUN_VNET_HDR) {
1236 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
1237 		if ((len -= tun->vnet_hdr_sz) < 0)
1238 			return -EINVAL;
1239 
1240 		if (skb_is_gso(skb)) {
1241 			struct skb_shared_info *sinfo = skb_shinfo(skb);
1242 
1243 			/* This is a hint as to how much should be linear. */
1244 			gso.hdr_len = skb_headlen(skb);
1245 			gso.gso_size = sinfo->gso_size;
1246 			if (sinfo->gso_type & SKB_GSO_TCPV4)
1247 				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1248 			else if (sinfo->gso_type & SKB_GSO_TCPV6)
1249 				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1250 			else if (sinfo->gso_type & SKB_GSO_UDP)
1251 				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1252 			else {
1253 				pr_err("unexpected GSO type: "
1254 				       "0x%x, gso_size %d, hdr_len %d\n",
1255 				       sinfo->gso_type, gso.gso_size,
1256 				       gso.hdr_len);
1257 				print_hex_dump(KERN_ERR, "tun: ",
1258 					       DUMP_PREFIX_NONE,
1259 					       16, 1, skb->head,
1260 					       min((int)gso.hdr_len, 64), true);
1261 				WARN_ON_ONCE(1);
1262 				return -EINVAL;
1263 			}
1264 			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1265 				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1266 		} else
1267 			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1268 
1269 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
1270 			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1271 			gso.csum_start = skb_checksum_start_offset(skb);
1272 			gso.csum_offset = skb->csum_offset;
1273 		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
1274 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
1275 		} /* else everything is zero */
1276 
1277 		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
1278 					       sizeof(gso))))
1279 			return -EFAULT;
1280 		total += tun->vnet_hdr_sz;
1281 	}
1282 
1283 	len = min_t(int, skb->len, len);
1284 
1285 	skb_copy_datagram_const_iovec(skb, 0, iv, total, len);
1286 	total += skb->len;
1287 
1288 	tun->dev->stats.tx_packets++;
1289 	tun->dev->stats.tx_bytes += len;
1290 
1291 	return total;
1292 }
1293 
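/* Dequeue one skb from the queue's receive list and copy it to the iovec,
 * sleeping on the wait queue when no data is available and blocking is
 * allowed.
 */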
1294 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
1295 			   struct kiocb *iocb, const struct iovec *iv,
1296 			   ssize_t len, int noblock)
1297 {
1298 	DECLARE_WAITQUEUE(wait, current);
1299 	struct sk_buff *skb;
1300 	ssize_t ret = 0;
1301 
1302 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
1303 
1304 	if (unlikely(!noblock))
1305 		add_wait_queue(&tfile->wq.wait, &wait);
1306 	while (len) {
1307 		current->state = TASK_INTERRUPTIBLE;
1308 
1309 		/* Read frames from the queue */
1310 		if (!(skb = skb_dequeue(&tfile->socket.sk->sk_receive_queue))) {
1311 			if (noblock) {
1312 				ret = -EAGAIN;
1313 				break;
1314 			}
1315 			if (signal_pending(current)) {
1316 				ret = -ERESTARTSYS;
1317 				break;
1318 			}
1319 			if (tun->dev->reg_state != NETREG_REGISTERED) {
1320 				ret = -EIO;
1321 				break;
1322 			}
1323 
1324 			/* Nothing to read, let's sleep */
1325 			schedule();
1326 			continue;
1327 		}
1328 
1329 		ret = tun_put_user(tun, tfile, skb, iv, len);
1330 		kfree_skb(skb);
1331 		break;
1332 	}
1333 
1334 	current->state = TASK_RUNNING;
1335 	if (unlikely(!noblock))
1336 		remove_wait_queue(&tfile->wq.wait, &wait);
1337 
1338 	return ret;
1339 }
1340 
1341 static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
1342 			    unsigned long count, loff_t pos)
1343 {
1344 	struct file *file = iocb->ki_filp;
1345 	struct tun_file *tfile = file->private_data;
1346 	struct tun_struct *tun = __tun_get(tfile);
1347 	ssize_t len, ret;
1348 
1349 	if (!tun)
1350 		return -EBADFD;
1351 	len = iov_length(iv, count);
1352 	if (len < 0) {
1353 		ret = -EINVAL;
1354 		goto out;
1355 	}
1356 
1357 	ret = tun_do_read(tun, tfile, iocb, iv, len,
1358 			  file->f_flags & O_NONBLOCK);
1359 	ret = min_t(ssize_t, ret, len);
1360 out:
1361 	tun_put(tun);
1362 	return ret;
1363 }
1364 
1365 static void tun_free_netdev(struct net_device *dev)
1366 {
1367 	struct tun_struct *tun = netdev_priv(dev);
1368 
1369 	BUG_ON(!(list_empty(&tun->disabled)));
1370 	tun_flow_uninit(tun);
1371 	free_netdev(dev);
1372 }
1373 
1374 static void tun_setup(struct net_device *dev)
1375 {
1376 	struct tun_struct *tun = netdev_priv(dev);
1377 
1378 	tun->owner = INVALID_UID;
1379 	tun->group = INVALID_GID;
1380 
1381 	dev->ethtool_ops = &tun_ethtool_ops;
1382 	dev->destructor = tun_free_netdev;
1383 }
1384 
1385 /* Trivial set of netlink ops to allow deleting tun or tap
1386  * device with netlink.
1387  */
1388 static int tun_validate(struct nlattr *tb[], struct nlattr *data[])
1389 {
1390 	return -EINVAL;
1391 }
1392 
1393 static struct rtnl_link_ops tun_link_ops __read_mostly = {
1394 	.kind		= DRV_NAME,
1395 	.priv_size	= sizeof(struct tun_struct),
1396 	.setup		= tun_setup,
1397 	.validate	= tun_validate,
1398 };
1399 
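/* sk_write_space callback: wake up poll() waiters and send SIGIO once the
 * socket becomes writeable again.
 */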
1400 static void tun_sock_write_space(struct sock *sk)
1401 {
1402 	struct tun_file *tfile;
1403 	wait_queue_head_t *wqueue;
1404 
1405 	if (!sock_writeable(sk))
1406 		return;
1407 
1408 	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
1409 		return;
1410 
1411 	wqueue = sk_sleep(sk);
1412 	if (wqueue && waitqueue_active(wqueue))
1413 		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
1414 						POLLWRNORM | POLLWRBAND);
1415 
1416 	tfile = container_of(sk, struct tun_file, sk);
1417 	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
1418 }
1419 
1420 static int tun_sendmsg(struct kiocb *iocb, struct socket *sock,
1421 		       struct msghdr *m, size_t total_len)
1422 {
1423 	int ret;
1424 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
1425 	struct tun_struct *tun = __tun_get(tfile);
1426 
1427 	if (!tun)
1428 		return -EBADFD;
1429 	ret = tun_get_user(tun, tfile, m->msg_control, m->msg_iov, total_len,
1430 			   m->msg_iovlen, m->msg_flags & MSG_DONTWAIT);
1431 	tun_put(tun);
1432 	return ret;
1433 }
1434 
1435 
1436 static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
1437 		       struct msghdr *m, size_t total_len,
1438 		       int flags)
1439 {
1440 	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
1441 	struct tun_struct *tun = __tun_get(tfile);
1442 	int ret;
1443 
1444 	if (!tun)
1445 		return -EBADFD;
1446 
1447 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
1448 		return -EINVAL;
1449 	ret = tun_do_read(tun, tfile, iocb, m->msg_iov, total_len,
1450 			  flags & MSG_DONTWAIT);
1451 	if (ret > total_len) {
1452 		m->msg_flags |= MSG_TRUNC;
1453 		ret = flags & MSG_TRUNC ? ret : total_len;
1454 	}
1455 	tun_put(tun);
1456 	return ret;
1457 }
1458 
1459 static int tun_release(struct socket *sock)
1460 {
1461 	if (sock->sk)
1462 		sock_put(sock->sk);
1463 	return 0;
1464 }
1465 
1466 /* Ops structure to mimic raw sockets with tun */
1467 static const struct proto_ops tun_socket_ops = {
1468 	.sendmsg = tun_sendmsg,
1469 	.recvmsg = tun_recvmsg,
1470 	.release = tun_release,
1471 };
1472 
1473 static struct proto tun_proto = {
1474 	.name		= "tun",
1475 	.owner		= THIS_MODULE,
1476 	.obj_size	= sizeof(struct tun_file),
1477 };
1478 
1479 static int tun_flags(struct tun_struct *tun)
1480 {
1481 	int flags = 0;
1482 
1483 	if (tun->flags & TUN_TUN_DEV)
1484 		flags |= IFF_TUN;
1485 	else
1486 		flags |= IFF_TAP;
1487 
1488 	if (tun->flags & TUN_NO_PI)
1489 		flags |= IFF_NO_PI;
1490 
1491 	/* This flag has no real effect.  We track the value for backwards
1492 	 * compatibility.
1493 	 */
1494 	if (tun->flags & TUN_ONE_QUEUE)
1495 		flags |= IFF_ONE_QUEUE;
1496 
1497 	if (tun->flags & TUN_VNET_HDR)
1498 		flags |= IFF_VNET_HDR;
1499 
1500 	if (tun->flags & TUN_TAP_MQ)
1501 		flags |= IFF_MULTI_QUEUE;
1502 
1503 	return flags;
1504 }
1505 
1506 static ssize_t tun_show_flags(struct device *dev, struct device_attribute *attr,
1507 			      char *buf)
1508 {
1509 	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1510 	return sprintf(buf, "0x%x\n", tun_flags(tun));
1511 }
1512 
1513 static ssize_t tun_show_owner(struct device *dev, struct device_attribute *attr,
1514 			      char *buf)
1515 {
1516 	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1517 	return uid_valid(tun->owner)?
1518 		sprintf(buf, "%u\n",
1519 			from_kuid_munged(current_user_ns(), tun->owner)):
1520 		sprintf(buf, "-1\n");
1521 }
1522 
1523 static ssize_t tun_show_group(struct device *dev, struct device_attribute *attr,
1524 			      char *buf)
1525 {
1526 	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
1527 	return gid_valid(tun->group) ?
1528 		sprintf(buf, "%u\n",
1529 			from_kgid_munged(current_user_ns(), tun->group)):
1530 		sprintf(buf, "-1\n");
1531 }
1532 
1533 static DEVICE_ATTR(tun_flags, 0444, tun_show_flags, NULL);
1534 static DEVICE_ATTR(owner, 0444, tun_show_owner, NULL);
1535 static DEVICE_ATTR(group, 0444, tun_show_group, NULL);
1536 
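/* TUNSETIFF: attach to an existing tun/tap device by name or create a new
 * one, applying the IFF_* flags requested by userspace.
 */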
1537 static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
1538 {
1539 	struct tun_struct *tun;
1540 	struct tun_file *tfile = file->private_data;
1541 	struct net_device *dev;
1542 	int err;
1543 
1544 	if (tfile->detached)
1545 		return -EINVAL;
1546 
1547 	dev = __dev_get_by_name(net, ifr->ifr_name);
1548 	if (dev) {
1549 		if (ifr->ifr_flags & IFF_TUN_EXCL)
1550 			return -EBUSY;
1551 		if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
1552 			tun = netdev_priv(dev);
1553 		else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
1554 			tun = netdev_priv(dev);
1555 		else
1556 			return -EINVAL;
1557 
1558 		if (tun_not_capable(tun))
1559 			return -EPERM;
1560 		err = security_tun_dev_attach(tfile->socket.sk);
1561 		if (err < 0)
1562 			return err;
1563 
1564 		err = tun_attach(tun, file);
1565 		if (err < 0)
1566 			return err;
1567 
1568 		if (tun->flags & TUN_TAP_MQ &&
1569 		    (tun->numqueues + tun->numdisabled > 1))
1570 			return err;
1571 	}
1572 	else {
1573 		char *name;
1574 		unsigned long flags = 0;
1575 
1576 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1577 			return -EPERM;
1578 		err = security_tun_dev_create();
1579 		if (err < 0)
1580 			return err;
1581 
1582 		/* Set dev type */
1583 		if (ifr->ifr_flags & IFF_TUN) {
1584 			/* TUN device */
1585 			flags |= TUN_TUN_DEV;
1586 			name = "tun%d";
1587 		} else if (ifr->ifr_flags & IFF_TAP) {
1588 			/* TAP device */
1589 			flags |= TUN_TAP_DEV;
1590 			name = "tap%d";
1591 		} else
1592 			return -EINVAL;
1593 
1594 		if (*ifr->ifr_name)
1595 			name = ifr->ifr_name;
1596 
1597 		dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
1598 				       tun_setup,
1599 				       MAX_TAP_QUEUES, MAX_TAP_QUEUES);
1600 		if (!dev)
1601 			return -ENOMEM;
1602 
1603 		dev_net_set(dev, net);
1604 		dev->rtnl_link_ops = &tun_link_ops;
1605 
1606 		tun = netdev_priv(dev);
1607 		tun->dev = dev;
1608 		tun->flags = flags;
1609 		tun->txflt.count = 0;
1610 		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
1611 
1612 		tun->filter_attached = false;
1613 		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
1614 
1615 		spin_lock_init(&tun->lock);
1616 
1617 		security_tun_dev_post_create(&tfile->sk);
1618 
1619 		tun_net_init(dev);
1620 
1621 		err = tun_flow_init(tun);
1622 		if (err < 0)
1623 			goto err_free_dev;
1624 
1625 		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
1626 			TUN_USER_FEATURES;
1627 		dev->features = dev->hw_features;
1628 
1629 		INIT_LIST_HEAD(&tun->disabled);
1630 		err = tun_attach(tun, file);
1631 		if (err < 0)
1632 			goto err_free_dev;
1633 
1634 		err = register_netdevice(tun->dev);
1635 		if (err < 0)
1636 			goto err_free_dev;
1637 
1638 		if (device_create_file(&tun->dev->dev, &dev_attr_tun_flags) ||
1639 		    device_create_file(&tun->dev->dev, &dev_attr_owner) ||
1640 		    device_create_file(&tun->dev->dev, &dev_attr_group))
1641 			pr_err("Failed to create tun sysfs files\n");
1642 
1643 		netif_carrier_on(tun->dev);
1644 	}
1645 
1646 	tun_debug(KERN_INFO, tun, "tun_set_iff\n");
1647 
1648 	if (ifr->ifr_flags & IFF_NO_PI)
1649 		tun->flags |= TUN_NO_PI;
1650 	else
1651 		tun->flags &= ~TUN_NO_PI;
1652 
1653 	/* This flag has no real effect.  We track the value for backwards
1654 	 * compatibility.
1655 	 */
1656 	if (ifr->ifr_flags & IFF_ONE_QUEUE)
1657 		tun->flags |= TUN_ONE_QUEUE;
1658 	else
1659 		tun->flags &= ~TUN_ONE_QUEUE;
1660 
1661 	if (ifr->ifr_flags & IFF_VNET_HDR)
1662 		tun->flags |= TUN_VNET_HDR;
1663 	else
1664 		tun->flags &= ~TUN_VNET_HDR;
1665 
1666 	if (ifr->ifr_flags & IFF_MULTI_QUEUE)
1667 		tun->flags |= TUN_TAP_MQ;
1668 	else
1669 		tun->flags &= ~TUN_TAP_MQ;
1670 
1671 	/* Make sure persistent devices do not get stuck in
1672 	 * xoff state.
1673 	 */
1674 	if (netif_running(tun->dev))
1675 		netif_tx_wake_all_queues(tun->dev);
1676 
1677 	strcpy(ifr->ifr_name, tun->dev->name);
1678 	return 0;
1679 
1680  err_free_dev:
1681 	free_netdev(dev);
1682 	return err;
1683 }
1684 
1685 static void tun_get_iff(struct net *net, struct tun_struct *tun,
1686 		       struct ifreq *ifr)
1687 {
1688 	tun_debug(KERN_INFO, tun, "tun_get_iff\n");
1689 
1690 	strcpy(ifr->ifr_name, tun->dev->name);
1691 
1692 	ifr->ifr_flags = tun_flags(tun);
1693 
1694 }
1695 
1696 /* This is like a cut-down ethtool ops, except done via tun fd so no
1697  * privs required. */
1698 static int set_offload(struct tun_struct *tun, unsigned long arg)
1699 {
1700 	netdev_features_t features = 0;
1701 
1702 	if (arg & TUN_F_CSUM) {
1703 		features |= NETIF_F_HW_CSUM;
1704 		arg &= ~TUN_F_CSUM;
1705 
1706 		if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
1707 			if (arg & TUN_F_TSO_ECN) {
1708 				features |= NETIF_F_TSO_ECN;
1709 				arg &= ~TUN_F_TSO_ECN;
1710 			}
1711 			if (arg & TUN_F_TSO4)
1712 				features |= NETIF_F_TSO;
1713 			if (arg & TUN_F_TSO6)
1714 				features |= NETIF_F_TSO6;
1715 			arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
1716 		}
1717 
1718 		if (arg & TUN_F_UFO) {
1719 			features |= NETIF_F_UFO;
1720 			arg &= ~TUN_F_UFO;
1721 		}
1722 	}
1723 
1724 	/* This gives the user a way to test for new features in future by
1725 	 * trying to set them. */
1726 	if (arg)
1727 		return -EINVAL;
1728 
1729 	tun->set_features = features;
1730 	netdev_update_features(tun->dev);
1731 
1732 	return 0;
1733 }
1734 
1735 static void tun_detach_filter(struct tun_struct *tun, int n)
1736 {
1737 	int i;
1738 	struct tun_file *tfile;
1739 
1740 	for (i = 0; i < n; i++) {
1741 		tfile = rtnl_dereference(tun->tfiles[i]);
1742 		sk_detach_filter(tfile->socket.sk);
1743 	}
1744 
1745 	tun->filter_attached = false;
1746 }
1747 
1748 static int tun_attach_filter(struct tun_struct *tun)
1749 {
1750 	int i, ret = 0;
1751 	struct tun_file *tfile;
1752 
1753 	for (i = 0; i < tun->numqueues; i++) {
1754 		tfile = rtnl_dereference(tun->tfiles[i]);
1755 		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
1756 		if (ret) {
1757 			tun_detach_filter(tun, i);
1758 			return ret;
1759 		}
1760 	}
1761 
1762 	tun->filter_attached = true;
1763 	return ret;
1764 }
1765 
1766 static void tun_set_sndbuf(struct tun_struct *tun)
1767 {
1768 	struct tun_file *tfile;
1769 	int i;
1770 
1771 	for (i = 0; i < tun->numqueues; i++) {
1772 		tfile = rtnl_dereference(tun->tfiles[i]);
1773 		tfile->socket.sk->sk_sndbuf = tun->sndbuf;
1774 	}
1775 }
1776 
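/* TUNSETQUEUE: re-enable a previously detached queue (IFF_ATTACH_QUEUE) or
 * detach an attached one (IFF_DETACH_QUEUE) on a multiqueue device.
 */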
1777 static int tun_set_queue(struct file *file, struct ifreq *ifr)
1778 {
1779 	struct tun_file *tfile = file->private_data;
1780 	struct tun_struct *tun;
1781 	int ret = 0;
1782 
1783 	rtnl_lock();
1784 
1785 	if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
1786 		tun = tfile->detached;
1787 		if (!tun)
1788 			ret = -EINVAL;
1789 		else
1790 			ret = tun_attach(tun, file);
1791 	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
1792 		tun = rtnl_dereference(tfile->tun);
1793 		if (!tun || !(tun->flags & TUN_TAP_MQ))
1794 			ret = -EINVAL;
1795 		else
1796 			__tun_detach(tfile, false);
1797 	} else
1798 		ret = -EINVAL;
1799 
1800 	rtnl_unlock();
1801 	return ret;
1802 }
1803 
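/* Common ioctl handler shared by the native and compat entry points;
 * ifreq_len gives the number of ifreq bytes to copy to/from userspace.
 */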
1804 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
1805 			    unsigned long arg, int ifreq_len)
1806 {
1807 	struct tun_file *tfile = file->private_data;
1808 	struct tun_struct *tun;
1809 	void __user* argp = (void __user*)arg;
1810 	struct ifreq ifr;
1811 	kuid_t owner;
1812 	kgid_t group;
1813 	int sndbuf;
1814 	int vnet_hdr_sz;
1815 	int ret;
1816 
1817 	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
1818 		if (copy_from_user(&ifr, argp, ifreq_len))
1819 			return -EFAULT;
1820 	} else {
1821 		memset(&ifr, 0, sizeof(ifr));
1822 	}
1823 	if (cmd == TUNGETFEATURES) {
1824 		/* Currently this just means: "what IFF flags are valid?".
1825 		 * This is needed because we never checked for invalid flags on
1826 		 * TUNSETIFF. */
1827 		return put_user(IFF_TUN | IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE |
1828 				IFF_VNET_HDR | IFF_MULTI_QUEUE,
1829 				(unsigned int __user*)argp);
1830 	} else if (cmd == TUNSETQUEUE)
1831 		return tun_set_queue(file, &ifr);
1832 
1833 	ret = 0;
1834 	rtnl_lock();
1835 
1836 	tun = __tun_get(tfile);
1837 	if (cmd == TUNSETIFF && !tun) {
1838 		ifr.ifr_name[IFNAMSIZ-1] = '\0';
1839 
1840 		ret = tun_set_iff(tfile->net, file, &ifr);
1841 
1842 		if (ret)
1843 			goto unlock;
1844 
1845 		if (copy_to_user(argp, &ifr, ifreq_len))
1846 			ret = -EFAULT;
1847 		goto unlock;
1848 	}
1849 
1850 	ret = -EBADFD;
1851 	if (!tun)
1852 		goto unlock;
1853 
1854 	tun_debug(KERN_INFO, tun, "tun_chr_ioctl cmd %u\n", cmd);
1855 
1856 	ret = 0;
1857 	switch (cmd) {
1858 	case TUNGETIFF:
1859 		tun_get_iff(current->nsproxy->net_ns, tun, &ifr);
1860 
1861 		if (copy_to_user(argp, &ifr, ifreq_len))
1862 			ret = -EFAULT;
1863 		break;
1864 
1865 	case TUNSETNOCSUM:
1866 		/* Disable/Enable checksum */
1867 
1868 		/* [unimplemented] */
1869 		tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
1870 			  arg ? "disabled" : "enabled");
1871 		break;
1872 
1873 	case TUNSETPERSIST:
1874 		/* Disable/Enable persist mode. Keep an extra reference to the
1875 		 * module to prevent it from being unloaded.
1876 		 */
1877 		if (arg) {
1878 			tun->flags |= TUN_PERSIST;
1879 			__module_get(THIS_MODULE);
1880 		} else {
1881 			tun->flags &= ~TUN_PERSIST;
1882 			module_put(THIS_MODULE);
1883 		}
1884 
1885 		tun_debug(KERN_INFO, tun, "persist %s\n",
1886 			  arg ? "enabled" : "disabled");
1887 		break;
1888 
1889 	case TUNSETOWNER:
1890 		/* Set owner of the device */
1891 		owner = make_kuid(current_user_ns(), arg);
1892 		if (!uid_valid(owner)) {
1893 			ret = -EINVAL;
1894 			break;
1895 		}
1896 		tun->owner = owner;
1897 		tun_debug(KERN_INFO, tun, "owner set to %u\n",
1898 			  from_kuid(&init_user_ns, tun->owner));
1899 		break;
1900 
1901 	case TUNSETGROUP:
1902 		/* Set group of the device */
1903 		group = make_kgid(current_user_ns(), arg);
1904 		if (!gid_valid(group)) {
1905 			ret = -EINVAL;
1906 			break;
1907 		}
1908 		tun->group = group;
1909 		tun_debug(KERN_INFO, tun, "group set to %u\n",
1910 			  from_kgid(&init_user_ns, tun->group));
1911 		break;
1912 
1913 	case TUNSETLINK:
1914 		/* Only allow setting the type when the interface is down */
1915 		if (tun->dev->flags & IFF_UP) {
1916 			tun_debug(KERN_INFO, tun,
1917 				  "Linktype set failed because interface is up\n");
1918 			ret = -EBUSY;
1919 		} else {
1920 			tun->dev->type = (int) arg;
1921 			tun_debug(KERN_INFO, tun, "linktype set to %d\n",
1922 				  tun->dev->type);
1923 			ret = 0;
1924 		}
1925 		break;
1926 
1927 #ifdef TUN_DEBUG
1928 	case TUNSETDEBUG:
1929 		tun->debug = arg;
1930 		break;
1931 #endif
1932 	case TUNSETOFFLOAD:
1933 		ret = set_offload(tun, arg);
1934 		break;
1935 
1936 	case TUNSETTXFILTER:
1937 		/* Can be set only for TAPs */
1938 		ret = -EINVAL;
1939 		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
1940 			break;
1941 		ret = update_filter(&tun->txflt, (void __user *)arg);
1942 		break;
1943 
1944 	case SIOCGIFHWADDR:
1945 		/* Get hw address */
1946 		memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
1947 		ifr.ifr_hwaddr.sa_family = tun->dev->type;
1948 		if (copy_to_user(argp, &ifr, ifreq_len))
1949 			ret = -EFAULT;
1950 		break;
1951 
1952 	case SIOCSIFHWADDR:
1953 		/* Set hw address */
1954 		tun_debug(KERN_DEBUG, tun, "set hw address: %pM\n",
1955 			  ifr.ifr_hwaddr.sa_data);
1956 
1957 		ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
1958 		break;
1959 
1960 	case TUNGETSNDBUF:
1961 		sndbuf = tfile->socket.sk->sk_sndbuf;
1962 		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
1963 			ret = -EFAULT;
1964 		break;
1965 
1966 	case TUNSETSNDBUF:
1967 		if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
1968 			ret = -EFAULT;
1969 			break;
1970 		}
1971 
1972 		tun->sndbuf = sndbuf;
1973 		tun_set_sndbuf(tun);
1974 		break;
1975 
1976 	case TUNGETVNETHDRSZ:
1977 		vnet_hdr_sz = tun->vnet_hdr_sz;
1978 		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
1979 			ret = -EFAULT;
1980 		break;
1981 
1982 	case TUNSETVNETHDRSZ:
1983 		if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
1984 			ret = -EFAULT;
1985 			break;
1986 		}
1987 		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
1988 			ret = -EINVAL;
1989 			break;
1990 		}
1991 
1992 		tun->vnet_hdr_sz = vnet_hdr_sz;
1993 		break;
1994 
1995 	case TUNATTACHFILTER:
1996 		/* Can be set only for TAPs */
1997 		ret = -EINVAL;
1998 		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
1999 			break;
2000 		ret = -EFAULT;
2001 		if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
2002 			break;
2003 
2004 		ret = tun_attach_filter(tun);
2005 		break;
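		/* Illustrative userspace sketch (not part of this driver):
		 * attaching a classic BPF filter, analogous to
		 * SO_ATTACH_FILTER on an ordinary socket:
		 *
		 *	struct sock_filter code[] = { ... };
		 *	struct sock_fprog fprog = {
		 *		.len = sizeof(code) / sizeof(code[0]),
		 *		.filter = code,
		 *	};
		 *	ioctl(fd, TUNATTACHFILTER, &fprog);
		 */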
2006 
2007 	case TUNDETACHFILTER:
2008 		/* Can be set only for TAPs */
2009 		ret = -EINVAL;
2010 		if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
2011 			break;
2012 		ret = 0;
2013 		tun_detach_filter(tun, tun->numqueues);
2014 		break;
2015 
2016 	default:
2017 		ret = -EINVAL;
2018 		break;
2019 	}
2020 
2021 unlock:
2022 	rtnl_unlock();
2023 	if (tun)
2024 		tun_put(tun);
2025 	return ret;
2026 }
2027 
2028 static long tun_chr_ioctl(struct file *file,
2029 			  unsigned int cmd, unsigned long arg)
2030 {
2031 	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct ifreq));
2032 }
2033 
2034 #ifdef CONFIG_COMPAT
2035 static long tun_chr_compat_ioctl(struct file *file,
2036 			 unsigned int cmd, unsigned long arg)
2037 {
2038 	switch (cmd) {
2039 	case TUNSETIFF:
2040 	case TUNGETIFF:
2041 	case TUNSETTXFILTER:
2042 	case TUNGETSNDBUF:
2043 	case TUNSETSNDBUF:
2044 	case SIOCGIFHWADDR:
2045 	case SIOCSIFHWADDR:
2046 		arg = (unsigned long)compat_ptr(arg);
2047 		break;
2048 	default:
2049 		arg = (compat_ulong_t)arg;
2050 		break;
2051 	}
2052 
2053 	/*
2054 	 * compat_ifreq is shorter than ifreq, so we must not access beyond
2055 	 * the end of that structure. All fields that are used in this
2056 	 * driver are compatible though, so we don't need to convert the
2057 	 * contents.
2058 	 */
2059 	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
2060 }
2061 #endif /* CONFIG_COMPAT */
2062 
2063 static int tun_chr_fasync(int fd, struct file *file, int on)
2064 {
2065 	struct tun_file *tfile = file->private_data;
2066 	int ret;
2067 
2068 	if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
2069 		goto out;
2070 
2071 	if (on) {
2072 		ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0);
2073 		if (ret)
2074 			goto out;
2075 		tfile->flags |= TUN_FASYNC;
2076 	} else
2077 		tfile->flags &= ~TUN_FASYNC;
2078 	ret = 0;
2079 out:
2080 	return ret;
2081 }
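/* Note: userspace arms the fasync path above by requesting SIGIO-style
 * notification on the tun fd, e.g. (illustrative only):
 *
 *	fcntl(fd, F_SETOWN, getpid());
 *	fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
 */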
2082 
2083 static int tun_chr_open(struct inode *inode, struct file *file)
2084 {
2085 	struct tun_file *tfile;
2086 
2087 	DBG1(KERN_INFO, "tunX: tun_chr_open\n");
2088 
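	/* The cast below relies on struct sock being the first member of
	 * struct tun_file, so the sock returned by sk_alloc() is also the
	 * start of the containing tun_file.
	 */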
2089 	tfile = (struct tun_file *)sk_alloc(&init_net, AF_UNSPEC, GFP_KERNEL,
2090 					    &tun_proto);
2091 	if (!tfile)
2092 		return -ENOMEM;
2093 	rcu_assign_pointer(tfile->tun, NULL);
2094 	tfile->net = get_net(current->nsproxy->net_ns);
2095 	tfile->flags = 0;
2096 
2097 	rcu_assign_pointer(tfile->socket.wq, &tfile->wq);
2098 	init_waitqueue_head(&tfile->wq.wait);
2099 
2100 	tfile->socket.file = file;
2101 	tfile->socket.ops = &tun_socket_ops;
2102 
2103 	sock_init_data(&tfile->socket, &tfile->sk);
2104 	sk_change_net(&tfile->sk, tfile->net);
2105 
2106 	tfile->sk.sk_write_space = tun_sock_write_space;
2107 	tfile->sk.sk_sndbuf = INT_MAX;
2108 
2109 	file->private_data = tfile;
2110 	set_bit(SOCK_EXTERNALLY_ALLOCATED, &tfile->socket.flags);
2111 	INIT_LIST_HEAD(&tfile->next);
2112 
2113 	return 0;
2114 }
2115 
2116 static int tun_chr_close(struct inode *inode, struct file *file)
2117 {
2118 	struct tun_file *tfile = file->private_data;
2119 	struct net *net = tfile->net;
2120 
2121 	tun_detach(tfile, true);
2122 	put_net(net);
2123 
2124 	return 0;
2125 }
2126 
2127 static const struct file_operations tun_fops = {
2128 	.owner	= THIS_MODULE,
2129 	.llseek = no_llseek,
2130 	.read  = do_sync_read,
2131 	.aio_read  = tun_chr_aio_read,
2132 	.write = do_sync_write,
2133 	.aio_write = tun_chr_aio_write,
2134 	.poll	= tun_chr_poll,
2135 	.unlocked_ioctl	= tun_chr_ioctl,
2136 #ifdef CONFIG_COMPAT
2137 	.compat_ioctl = tun_chr_compat_ioctl,
2138 #endif
2139 	.open	= tun_chr_open,
2140 	.release = tun_chr_close,
2141 	.fasync = tun_chr_fasync
2142 };
2143 
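/* Registering this misc device creates the /dev/net/tun character node
 * (misc minor TUN_MINOR); together with the "devname:net/tun" module alias
 * below, it allows the node to exist before the module is loaded so that a
 * first open can trigger module autoloading.
 */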
2144 static struct miscdevice tun_miscdev = {
2145 	.minor = TUN_MINOR,
2146 	.name = "tun",
2147 	.nodename = "net/tun",
2148 	.fops = &tun_fops,
2149 };
2150 
2151 /* ethtool interface */
2152 
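/* There is no real PHY behind a tun/tap device, so the link settings
 * reported below are fixed, nominal values kept only so that ethtool
 * queries succeed.
 */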
2153 static int tun_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
2154 {
2155 	cmd->supported		= 0;
2156 	cmd->advertising	= 0;
2157 	ethtool_cmd_speed_set(cmd, SPEED_10);
2158 	cmd->duplex		= DUPLEX_FULL;
2159 	cmd->port		= PORT_TP;
2160 	cmd->phy_address	= 0;
2161 	cmd->transceiver	= XCVR_INTERNAL;
2162 	cmd->autoneg		= AUTONEG_DISABLE;
2163 	cmd->maxtxpkt		= 0;
2164 	cmd->maxrxpkt		= 0;
2165 	return 0;
2166 }
2167 
2168 static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
2169 {
2170 	struct tun_struct *tun = netdev_priv(dev);
2171 
2172 	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
2173 	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
2174 
2175 	switch (tun->flags & TUN_TYPE_MASK) {
2176 	case TUN_TUN_DEV:
2177 		strlcpy(info->bus_info, "tun", sizeof(info->bus_info));
2178 		break;
2179 	case TUN_TAP_DEV:
2180 		strlcpy(info->bus_info, "tap", sizeof(info->bus_info));
2181 		break;
2182 	}
2183 }
2184 
2185 static u32 tun_get_msglevel(struct net_device *dev)
2186 {
2187 #ifdef TUN_DEBUG
2188 	struct tun_struct *tun = netdev_priv(dev);
2189 	return tun->debug;
2190 #else
2191 	return -EOPNOTSUPP;
2192 #endif
2193 }
2194 
2195 static void tun_set_msglevel(struct net_device *dev, u32 value)
2196 {
2197 #ifdef TUN_DEBUG
2198 	struct tun_struct *tun = netdev_priv(dev);
2199 	tun->debug = value;
2200 #endif
2201 }
2202 
2203 static const struct ethtool_ops tun_ethtool_ops = {
2204 	.get_settings	= tun_get_settings,
2205 	.get_drvinfo	= tun_get_drvinfo,
2206 	.get_msglevel	= tun_get_msglevel,
2207 	.set_msglevel	= tun_set_msglevel,
2208 	.get_link	= ethtool_op_get_link,
2209 };
2210 
2211 
2212 static int __init tun_init(void)
2213 {
2214 	int ret = 0;
2215 
2216 	pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
2217 	pr_info("%s\n", DRV_COPYRIGHT);
2218 
2219 	ret = rtnl_link_register(&tun_link_ops);
2220 	if (ret) {
2221 		pr_err("Can't register link_ops\n");
2222 		goto err_linkops;
2223 	}
2224 
2225 	ret = misc_register(&tun_miscdev);
2226 	if (ret) {
2227 		pr_err("Can't register misc device %d\n", TUN_MINOR);
2228 		goto err_misc;
2229 	}
2230 	return  0;
2231 err_misc:
2232 	rtnl_link_unregister(&tun_link_ops);
2233 err_linkops:
2234 	return ret;
2235 }
2236 
2237 static void tun_cleanup(void)
2238 {
2239 	misc_deregister(&tun_miscdev);
2240 	rtnl_link_unregister(&tun_link_ops);
2241 }
2242 
2243 /* Get an underlying socket object from a tun file.  Returns an error unless
2244  * the file is attached to a device.  The returned object works like a packet
2245  * socket; it can be used for sock_sendmsg/sock_recvmsg.  The caller must hold
2246  * a reference to the file for as long as the socket is in use. */
2247 struct socket *tun_get_socket(struct file *file)
2248 {
2249 	struct tun_file *tfile;
2250 	if (file->f_op != &tun_fops)
2251 		return ERR_PTR(-EINVAL);
2252 	tfile = file->private_data;
2253 	if (!tfile)
2254 		return ERR_PTR(-EBADFD);
2255 	return &tfile->socket;
2256 }
2257 EXPORT_SYMBOL_GPL(tun_get_socket);
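/* Illustrative in-kernel usage (a minimal sketch, modelled on how vhost-net
 * consumes this export; error handling trimmed):
 *
 *	struct file *file = fget(fd);
 *	struct socket *sock = tun_get_socket(file);
 *
 *	if (IS_ERR(sock))
 *		fput(file);
 *	else
 *		... use sock with sock_sendmsg()/sock_recvmsg(), keeping
 *		    the file reference until the socket is no longer used ...
 */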
2258 
2259 module_init(tun_init);
2260 module_exit(tun_cleanup);
2261 MODULE_DESCRIPTION(DRV_DESCRIPTION);
2262 MODULE_AUTHOR(DRV_COPYRIGHT);
2263 MODULE_LICENSE("GPL");
2264 MODULE_ALIAS_MISCDEV(TUN_MINOR);
2265 MODULE_ALIAS("devname:net/tun");
2266