xref: /openbmc/linux/drivers/net/tun.c (revision 360823a09426347ea8f232b0b0b5156d0aed0302)
1  // SPDX-License-Identifier: GPL-2.0-or-later
2  /*
3   *  TUN - Universal TUN/TAP device driver.
4   *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
5   *
6   *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
7   */
8  
9  /*
10   *  Changes:
11   *
12   *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
13   *    Add TUNSETLINK ioctl to set the link encapsulation
14   *
15   *  Mark Smith <markzzzsmith@yahoo.com.au>
16   *    Use eth_random_addr() for tap MAC address.
17   *
18   *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
19   *    Fixes in packet dropping, queue length setting and queue wakeup.
20   *    Increased default tx queue length.
21   *    Added ethtool API.
22   *    Minor cleanups
23   *
24   *  Daniel Podlejski <underley@underley.eu.org>
25   *    Modifications for 2.3.99-pre5 kernel.
26   */
27  
28  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
29  
30  #define DRV_NAME	"tun"
31  #define DRV_VERSION	"1.6"
32  #define DRV_DESCRIPTION	"Universal TUN/TAP device driver"
33  #define DRV_COPYRIGHT	"(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"
34  
35  #include <linux/module.h>
36  #include <linux/errno.h>
37  #include <linux/kernel.h>
38  #include <linux/sched/signal.h>
39  #include <linux/major.h>
40  #include <linux/slab.h>
41  #include <linux/poll.h>
42  #include <linux/fcntl.h>
43  #include <linux/init.h>
44  #include <linux/skbuff.h>
45  #include <linux/netdevice.h>
46  #include <linux/etherdevice.h>
47  #include <linux/miscdevice.h>
48  #include <linux/ethtool.h>
49  #include <linux/rtnetlink.h>
50  #include <linux/compat.h>
51  #include <linux/if.h>
52  #include <linux/if_arp.h>
53  #include <linux/if_ether.h>
54  #include <linux/if_tun.h>
55  #include <linux/if_vlan.h>
56  #include <linux/crc32.h>
57  #include <linux/nsproxy.h>
58  #include <linux/virtio_net.h>
59  #include <linux/rcupdate.h>
60  #include <net/net_namespace.h>
61  #include <net/netns/generic.h>
62  #include <net/rtnetlink.h>
63  #include <net/sock.h>
64  #include <net/xdp.h>
65  #include <net/ip_tunnels.h>
66  #include <linux/seq_file.h>
67  #include <linux/uio.h>
68  #include <linux/skb_array.h>
69  #include <linux/bpf.h>
70  #include <linux/bpf_trace.h>
71  #include <linux/mutex.h>
72  #include <linux/ieee802154.h>
73  #include <linux/if_ltalk.h>
74  #include <uapi/linux/if_fddi.h>
75  #include <uapi/linux/if_hippi.h>
76  #include <uapi/linux/if_fc.h>
77  #include <net/ax25.h>
78  #include <net/rose.h>
79  #include <net/6lowpan.h>
80  
81  #include <linux/uaccess.h>
82  #include <linux/proc_fs.h>
83  
84  static void tun_default_link_ksettings(struct net_device *dev,
85  				       struct ethtool_link_ksettings *cmd);
86  
87  #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
88  
89  /* TUN device flags */
90  
91  /* IFF_ATTACH_QUEUE is never stored in device flags, so it is
92   * overloaded to mean fasync when stored in a tun_file's flags.
93   */
94  #define TUN_FASYNC	IFF_ATTACH_QUEUE
95  /* High bits in the flags field are unused by IFF_*, so reuse them here. */
96  #define TUN_VNET_LE     0x80000000
97  #define TUN_VNET_BE     0x40000000
98  
99  #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
100  		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
101  
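/* Not part of this driver: a minimal userspace sketch of how a queue is
 * typically created with TUNSETIFF using a subset of the TUN_FEATURES flags
 * above. The device name and error handling are illustrative assumptions.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/if.h>
 *	#include <linux/if_tun.h>
 *
 *	int tap_open(void)
 *	{
 *		struct ifreq ifr = { 0 };
 *		int fd = open("/dev/net/tun", O_RDWR);
 *
 *		if (fd < 0)
 *			return -1;
 *		ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;
 *		strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);
 *		if (ioctl(fd, TUNSETIFF, &ifr) < 0)
 *			return -1;
 *		return fd;	// fd now acts as one queue of "tap0"
 *	}
 */
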
102  #define GOODCOPY_LEN 128
103  
104  #define FLT_EXACT_COUNT 8
105  struct tap_filter {
106  	unsigned int    count;    /* Number of addrs. Zero means disabled */
107  	u32             mask[2];  /* Mask of the hashed addrs */
108  	unsigned char	addr[FLT_EXACT_COUNT][ETH_ALEN];
109  };
110  
111  /* MAX_TAP_QUEUES 256 is chosen to allow the number of rx/tx queues to equal
112   * the max number of VCPUs in a guest. */
113  #define MAX_TAP_QUEUES 256
114  #define MAX_TAP_FLOWS  4096
115  
116  #define TUN_FLOW_EXPIRE (3 * HZ)
117  
118  /* A tun_file connects an open character device to a tuntap netdevice. It
119   * also contains all socket related structures (except sock_fprog and tap_filter)
120   * so that it can serve as one transmit queue for the tuntap device. The
121   * sock_fprog and tap_filter are kept in tun_struct since they are used for
122   * filtering on the netdevice, not on a specific queue (at least I didn't see
123   * a requirement for that).
124   *
125   * RCU usage:
126   * The tun_file and tun_struct are loosely coupled, the pointer from one to the
127   * other can only be read while rcu_read_lock or rtnl_lock is held.
128   */
129  struct tun_file {
130  	struct sock sk;
131  	struct socket socket;
132  	struct tun_struct __rcu *tun;
133  	struct fasync_struct *fasync;
134  	/* only used for fasync */
135  	unsigned int flags;
136  	union {
137  		u16 queue_index;
138  		unsigned int ifindex;
139  	};
140  	struct napi_struct napi;
141  	bool napi_enabled;
142  	bool napi_frags_enabled;
143  	struct mutex napi_mutex;	/* Protects access to the above napi */
144  	struct list_head next;
145  	struct tun_struct *detached;
146  	struct ptr_ring tx_ring;
147  	struct xdp_rxq_info xdp_rxq;
148  };
149  
150  struct tun_page {
151  	struct page *page;
152  	int count;
153  };
154  
155  struct tun_flow_entry {
156  	struct hlist_node hash_link;
157  	struct rcu_head rcu;
158  	struct tun_struct *tun;
159  
160  	u32 rxhash;
161  	u32 rps_rxhash;
162  	int queue_index;
163  	unsigned long updated ____cacheline_aligned_in_smp;
164  };
165  
166  #define TUN_NUM_FLOW_ENTRIES 1024
167  #define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)
168  
169  struct tun_prog {
170  	struct rcu_head rcu;
171  	struct bpf_prog *prog;
172  };
173  
174  /* Since the socket was moved to tun_file, to preserve the behavior of a
175   * persistent device, the socket filter, sndbuf and vnet header size are
176   * restored when a file is attached to the persistent device.
177   */
178  struct tun_struct {
179  	struct tun_file __rcu	*tfiles[MAX_TAP_QUEUES];
180  	unsigned int            numqueues;
181  	unsigned int 		flags;
182  	kuid_t			owner;
183  	kgid_t			group;
184  
185  	struct net_device	*dev;
186  	netdev_features_t	set_features;
187  #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
188  			  NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4)
189  
190  	int			align;
191  	int			vnet_hdr_sz;
192  	int			sndbuf;
193  	struct tap_filter	txflt;
194  	struct sock_fprog	fprog;
195  	/* protected by rtnl lock */
196  	bool			filter_attached;
197  	u32			msg_enable;
198  	spinlock_t lock;
199  	struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
200  	struct timer_list flow_gc_timer;
201  	unsigned long ageing_time;
202  	unsigned int numdisabled;
203  	struct list_head disabled;
204  	void *security;
205  	u32 flow_count;
206  	u32 rx_batched;
207  	atomic_long_t rx_frame_errors;
208  	struct bpf_prog __rcu *xdp_prog;
209  	struct tun_prog __rcu *steering_prog;
210  	struct tun_prog __rcu *filter_prog;
211  	struct ethtool_link_ksettings link_ksettings;
212  	/* init args */
213  	struct file *file;
214  	struct ifreq *ifr;
215  };
216  
217  struct veth {
218  	__be16 h_vlan_proto;
219  	__be16 h_vlan_TCI;
220  };
221  
222  static void tun_flow_init(struct tun_struct *tun);
223  static void tun_flow_uninit(struct tun_struct *tun);
224  
225  static int tun_napi_receive(struct napi_struct *napi, int budget)
226  {
227  	struct tun_file *tfile = container_of(napi, struct tun_file, napi);
228  	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
229  	struct sk_buff_head process_queue;
230  	struct sk_buff *skb;
231  	int received = 0;
232  
233  	__skb_queue_head_init(&process_queue);
234  
235  	spin_lock(&queue->lock);
236  	skb_queue_splice_tail_init(queue, &process_queue);
237  	spin_unlock(&queue->lock);
238  
239  	while (received < budget && (skb = __skb_dequeue(&process_queue))) {
240  		napi_gro_receive(napi, skb);
241  		++received;
242  	}
243  
244  	if (!skb_queue_empty(&process_queue)) {
245  		spin_lock(&queue->lock);
246  		skb_queue_splice(&process_queue, queue);
247  		spin_unlock(&queue->lock);
248  	}
249  
250  	return received;
251  }
252  
253  static int tun_napi_poll(struct napi_struct *napi, int budget)
254  {
255  	unsigned int received;
256  
257  	received = tun_napi_receive(napi, budget);
258  
259  	if (received < budget)
260  		napi_complete_done(napi, received);
261  
262  	return received;
263  }
264  
265  static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
266  			  bool napi_en, bool napi_frags)
267  {
268  	tfile->napi_enabled = napi_en;
269  	tfile->napi_frags_enabled = napi_en && napi_frags;
270  	if (napi_en) {
271  		netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll);
272  		napi_enable(&tfile->napi);
273  	}
274  }
275  
276  static void tun_napi_enable(struct tun_file *tfile)
277  {
278  	if (tfile->napi_enabled)
279  		napi_enable(&tfile->napi);
280  }
281  
282  static void tun_napi_disable(struct tun_file *tfile)
283  {
284  	if (tfile->napi_enabled)
285  		napi_disable(&tfile->napi);
286  }
287  
288  static void tun_napi_del(struct tun_file *tfile)
289  {
290  	if (tfile->napi_enabled)
291  		netif_napi_del(&tfile->napi);
292  }
293  
294  static bool tun_napi_frags_enabled(const struct tun_file *tfile)
295  {
296  	return tfile->napi_frags_enabled;
297  }
298  
299  #ifdef CONFIG_TUN_VNET_CROSS_LE
300  static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
301  {
302  	return tun->flags & TUN_VNET_BE ? false :
303  		virtio_legacy_is_little_endian();
304  }
305  
306  static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
307  {
308  	int be = !!(tun->flags & TUN_VNET_BE);
309  
310  	if (put_user(be, argp))
311  		return -EFAULT;
312  
313  	return 0;
314  }
315  
316  static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
317  {
318  	int be;
319  
320  	if (get_user(be, argp))
321  		return -EFAULT;
322  
323  	if (be)
324  		tun->flags |= TUN_VNET_BE;
325  	else
326  		tun->flags &= ~TUN_VNET_BE;
327  
328  	return 0;
329  }
330  #else
331  static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
332  {
333  	return virtio_legacy_is_little_endian();
334  }
335  
336  static long tun_get_vnet_be(struct tun_struct *tun, int __user *argp)
337  {
338  	return -EINVAL;
339  }
340  
341  static long tun_set_vnet_be(struct tun_struct *tun, int __user *argp)
342  {
343  	return -EINVAL;
344  }
345  #endif /* CONFIG_TUN_VNET_CROSS_LE */
346  
347  static inline bool tun_is_little_endian(struct tun_struct *tun)
348  {
349  	return tun->flags & TUN_VNET_LE ||
350  		tun_legacy_is_little_endian(tun);
351  }
352  
353  static inline u16 tun16_to_cpu(struct tun_struct *tun, __virtio16 val)
354  {
355  	return __virtio16_to_cpu(tun_is_little_endian(tun), val);
356  }
357  
358  static inline __virtio16 cpu_to_tun16(struct tun_struct *tun, u16 val)
359  {
360  	return __cpu_to_virtio16(tun_is_little_endian(tun), val);
361  }
362  
363  static inline u32 tun_hashfn(u32 rxhash)
364  {
365  	return rxhash & TUN_MASK_FLOW_ENTRIES;
366  }
367  
368  static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
369  {
370  	struct tun_flow_entry *e;
371  
372  	hlist_for_each_entry_rcu(e, head, hash_link) {
373  		if (e->rxhash == rxhash)
374  			return e;
375  	}
376  	return NULL;
377  }
378  
379  static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
380  					      struct hlist_head *head,
381  					      u32 rxhash, u16 queue_index)
382  {
383  	struct tun_flow_entry *e = kmalloc(sizeof(*e), GFP_ATOMIC);
384  
385  	if (e) {
386  		netif_info(tun, tx_queued, tun->dev,
387  			   "create flow: hash %u index %u\n",
388  			   rxhash, queue_index);
389  		e->updated = jiffies;
390  		e->rxhash = rxhash;
391  		e->rps_rxhash = 0;
392  		e->queue_index = queue_index;
393  		e->tun = tun;
394  		hlist_add_head_rcu(&e->hash_link, head);
395  		++tun->flow_count;
396  	}
397  	return e;
398  }
399  
400  static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
401  {
402  	netif_info(tun, tx_queued, tun->dev, "delete flow: hash %u index %u\n",
403  		   e->rxhash, e->queue_index);
404  	hlist_del_rcu(&e->hash_link);
405  	kfree_rcu(e, rcu);
406  	--tun->flow_count;
407  }
408  
409  static void tun_flow_flush(struct tun_struct *tun)
410  {
411  	int i;
412  
413  	spin_lock_bh(&tun->lock);
414  	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
415  		struct tun_flow_entry *e;
416  		struct hlist_node *n;
417  
418  		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
419  			tun_flow_delete(tun, e);
420  	}
421  	spin_unlock_bh(&tun->lock);
422  }
423  
424  static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
425  {
426  	int i;
427  
428  	spin_lock_bh(&tun->lock);
429  	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
430  		struct tun_flow_entry *e;
431  		struct hlist_node *n;
432  
433  		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
434  			if (e->queue_index == queue_index)
435  				tun_flow_delete(tun, e);
436  		}
437  	}
438  	spin_unlock_bh(&tun->lock);
439  }
440  
441  static void tun_flow_cleanup(struct timer_list *t)
442  {
443  	struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
444  	unsigned long delay = tun->ageing_time;
445  	unsigned long next_timer = jiffies + delay;
446  	unsigned long count = 0;
447  	int i;
448  
449  	spin_lock(&tun->lock);
450  	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
451  		struct tun_flow_entry *e;
452  		struct hlist_node *n;
453  
454  		hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link) {
455  			unsigned long this_timer;
456  
457  			this_timer = e->updated + delay;
458  			if (time_before_eq(this_timer, jiffies)) {
459  				tun_flow_delete(tun, e);
460  				continue;
461  			}
462  			count++;
463  			if (time_before(this_timer, next_timer))
464  				next_timer = this_timer;
465  		}
466  	}
467  
468  	if (count)
469  		mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));
470  	spin_unlock(&tun->lock);
471  }
472  
473  static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
474  			    struct tun_file *tfile)
475  {
476  	struct hlist_head *head;
477  	struct tun_flow_entry *e;
478  	unsigned long delay = tun->ageing_time;
479  	u16 queue_index = tfile->queue_index;
480  
481  	head = &tun->flows[tun_hashfn(rxhash)];
482  
483  	rcu_read_lock();
484  
485  	e = tun_flow_find(head, rxhash);
486  	if (likely(e)) {
487  		/* TODO: keep queueing to old queue until it's empty? */
488  		if (READ_ONCE(e->queue_index) != queue_index)
489  			WRITE_ONCE(e->queue_index, queue_index);
490  		if (e->updated != jiffies)
491  			e->updated = jiffies;
492  		sock_rps_record_flow_hash(e->rps_rxhash);
493  	} else {
494  		spin_lock_bh(&tun->lock);
495  		if (!tun_flow_find(head, rxhash) &&
496  		    tun->flow_count < MAX_TAP_FLOWS)
497  			tun_flow_create(tun, head, rxhash, queue_index);
498  
499  		if (!timer_pending(&tun->flow_gc_timer))
500  			mod_timer(&tun->flow_gc_timer,
501  				  round_jiffies_up(jiffies + delay));
502  		spin_unlock_bh(&tun->lock);
503  	}
504  
505  	rcu_read_unlock();
506  }
507  
508  /* Save the hash received in the stack receive path and update the
509   * flow_hash table accordingly.
510   */
511  static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
512  {
513  	if (unlikely(e->rps_rxhash != hash))
514  		e->rps_rxhash = hash;
515  }
516  
517  /* We try to identify a flow through its rxhash. The reason we do not
518   * check the rxq number is that some cards (e.g. the 82599) choose the
519   * rxq based on the txq where the last packet of the flow was sent. As
520   * the userspace application moves between processors, we may get a
521   * different rxq number here.
522   */
523  static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
524  {
525  	struct tun_flow_entry *e;
526  	u32 txq = 0;
527  	u32 numqueues = 0;
528  
529  	numqueues = READ_ONCE(tun->numqueues);
530  
531  	txq = __skb_get_hash_symmetric(skb);
532  	e = tun_flow_find(&tun->flows[tun_hashfn(txq)], txq);
533  	if (e) {
534  		tun_flow_save_rps_rxhash(e, txq);
535  		txq = e->queue_index;
536  	} else {
537  		/* use multiply and shift instead of expensive divide */
538  		txq = ((u64)txq * numqueues) >> 32;
539  	}
540  
541  	return txq;
542  }
543  
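/* The multiply-and-shift above maps a uniform 32-bit hash onto
 * [0, numqueues) without a divide: txq is scaled as hash * numqueues / 2^32.
 * A small worked example (values are illustrative only):
 *
 *	hash = 0xc0000000 (3/4 of the 32-bit range), numqueues = 8
 *	txq  = ((u64)0xc0000000 * 8) >> 32 = 6, i.e. 3/4 of the way into 0..7
 */
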
544  static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
545  {
546  	struct tun_prog *prog;
547  	u32 numqueues;
548  	u16 ret = 0;
549  
550  	numqueues = READ_ONCE(tun->numqueues);
551  	if (!numqueues)
552  		return 0;
553  
554  	prog = rcu_dereference(tun->steering_prog);
555  	if (prog)
556  		ret = bpf_prog_run_clear_cb(prog->prog, skb);
557  
558  	return ret % numqueues;
559  }
560  
561  static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
562  			    struct net_device *sb_dev)
563  {
564  	struct tun_struct *tun = netdev_priv(dev);
565  	u16 ret;
566  
567  	rcu_read_lock();
568  	if (rcu_dereference(tun->steering_prog))
569  		ret = tun_ebpf_select_queue(tun, skb);
570  	else
571  		ret = tun_automq_select_queue(tun, skb);
572  	rcu_read_unlock();
573  
574  	return ret;
575  }
576  
577  static inline bool tun_not_capable(struct tun_struct *tun)
578  {
579  	const struct cred *cred = current_cred();
580  	struct net *net = dev_net(tun->dev);
581  
582  	return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
583  		(gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
584  		!ns_capable(net->user_ns, CAP_NET_ADMIN);
585  }
586  
587  static void tun_set_real_num_queues(struct tun_struct *tun)
588  {
589  	netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
590  	netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
591  }
592  
593  static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
594  {
595  	tfile->detached = tun;
596  	list_add_tail(&tfile->next, &tun->disabled);
597  	++tun->numdisabled;
598  }
599  
600  static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
601  {
602  	struct tun_struct *tun = tfile->detached;
603  
604  	tfile->detached = NULL;
605  	list_del_init(&tfile->next);
606  	--tun->numdisabled;
607  	return tun;
608  }
609  
610  void tun_ptr_free(void *ptr)
611  {
612  	if (!ptr)
613  		return;
614  	if (tun_is_xdp_frame(ptr)) {
615  		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
616  
617  		xdp_return_frame(xdpf);
618  	} else {
619  		__skb_array_destroy_skb(ptr);
620  	}
621  }
622  EXPORT_SYMBOL_GPL(tun_ptr_free);
623  
624  static void tun_queue_purge(struct tun_file *tfile)
625  {
626  	void *ptr;
627  
628  	while ((ptr = ptr_ring_consume(&tfile->tx_ring)) != NULL)
629  		tun_ptr_free(ptr);
630  
631  	skb_queue_purge(&tfile->sk.sk_write_queue);
632  	skb_queue_purge(&tfile->sk.sk_error_queue);
633  }
634  
635  static void __tun_detach(struct tun_file *tfile, bool clean)
636  {
637  	struct tun_file *ntfile;
638  	struct tun_struct *tun;
639  
640  	tun = rtnl_dereference(tfile->tun);
641  
642  	if (tun && clean) {
643  		if (!tfile->detached)
644  			tun_napi_disable(tfile);
645  		tun_napi_del(tfile);
646  	}
647  
648  	if (tun && !tfile->detached) {
649  		u16 index = tfile->queue_index;
650  		BUG_ON(index >= tun->numqueues);
651  
652  		rcu_assign_pointer(tun->tfiles[index],
653  				   tun->tfiles[tun->numqueues - 1]);
654  		ntfile = rtnl_dereference(tun->tfiles[index]);
655  		ntfile->queue_index = index;
656  		ntfile->xdp_rxq.queue_index = index;
657  		rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
658  				   NULL);
659  
660  		--tun->numqueues;
661  		if (clean) {
662  			RCU_INIT_POINTER(tfile->tun, NULL);
663  			sock_put(&tfile->sk);
664  		} else {
665  			tun_disable_queue(tun, tfile);
666  			tun_napi_disable(tfile);
667  		}
668  
669  		synchronize_net();
670  		tun_flow_delete_by_queue(tun, tun->numqueues + 1);
671  		/* Drop read queue */
672  		tun_queue_purge(tfile);
673  		tun_set_real_num_queues(tun);
674  	} else if (tfile->detached && clean) {
675  		tun = tun_enable_queue(tfile);
676  		sock_put(&tfile->sk);
677  	}
678  
679  	if (clean) {
680  		if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
681  			netif_carrier_off(tun->dev);
682  
683  			if (!(tun->flags & IFF_PERSIST) &&
684  			    tun->dev->reg_state == NETREG_REGISTERED)
685  				unregister_netdevice(tun->dev);
686  		}
687  		if (tun)
688  			xdp_rxq_info_unreg(&tfile->xdp_rxq);
689  		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
690  	}
691  }
692  
693  static void tun_detach(struct tun_file *tfile, bool clean)
694  {
695  	struct tun_struct *tun;
696  	struct net_device *dev;
697  
698  	rtnl_lock();
699  	tun = rtnl_dereference(tfile->tun);
700  	dev = tun ? tun->dev : NULL;
701  	__tun_detach(tfile, clean);
702  	if (dev)
703  		netdev_state_change(dev);
704  	rtnl_unlock();
705  
706  	if (clean)
707  		sock_put(&tfile->sk);
708  }
709  
710  static void tun_detach_all(struct net_device *dev)
711  {
712  	struct tun_struct *tun = netdev_priv(dev);
713  	struct tun_file *tfile, *tmp;
714  	int i, n = tun->numqueues;
715  
716  	for (i = 0; i < n; i++) {
717  		tfile = rtnl_dereference(tun->tfiles[i]);
718  		BUG_ON(!tfile);
719  		tun_napi_disable(tfile);
720  		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
721  		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
722  		RCU_INIT_POINTER(tfile->tun, NULL);
723  		--tun->numqueues;
724  	}
725  	list_for_each_entry(tfile, &tun->disabled, next) {
726  		tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
727  		tfile->socket.sk->sk_data_ready(tfile->socket.sk);
728  		RCU_INIT_POINTER(tfile->tun, NULL);
729  	}
730  	BUG_ON(tun->numqueues != 0);
731  
732  	synchronize_net();
733  	for (i = 0; i < n; i++) {
734  		tfile = rtnl_dereference(tun->tfiles[i]);
735  		tun_napi_del(tfile);
736  		/* Drop read queue */
737  		tun_queue_purge(tfile);
738  		xdp_rxq_info_unreg(&tfile->xdp_rxq);
739  		sock_put(&tfile->sk);
740  	}
741  	list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
742  		tun_napi_del(tfile);
743  		tun_enable_queue(tfile);
744  		tun_queue_purge(tfile);
745  		xdp_rxq_info_unreg(&tfile->xdp_rxq);
746  		sock_put(&tfile->sk);
747  	}
748  	BUG_ON(tun->numdisabled != 0);
749  
750  	if (tun->flags & IFF_PERSIST)
751  		module_put(THIS_MODULE);
752  }
753  
754  static int tun_attach(struct tun_struct *tun, struct file *file,
755  		      bool skip_filter, bool napi, bool napi_frags,
756  		      bool publish_tun)
757  {
758  	struct tun_file *tfile = file->private_data;
759  	struct net_device *dev = tun->dev;
760  	int err;
761  
762  	err = security_tun_dev_attach(tfile->socket.sk, tun->security);
763  	if (err < 0)
764  		goto out;
765  
766  	err = -EINVAL;
767  	if (rtnl_dereference(tfile->tun) && !tfile->detached)
768  		goto out;
769  
770  	err = -EBUSY;
771  	if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
772  		goto out;
773  
774  	err = -E2BIG;
775  	if (!tfile->detached &&
776  	    tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
777  		goto out;
778  
779  	err = 0;
780  
781  	/* Re-attach the filter to persist device */
782  	if (!skip_filter && (tun->filter_attached == true)) {
783  		lock_sock(tfile->socket.sk);
784  		err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
785  		release_sock(tfile->socket.sk);
786  		if (!err)
787  			goto out;
788  	}
789  
790  	if (!tfile->detached &&
791  	    ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
792  			    GFP_KERNEL, tun_ptr_free)) {
793  		err = -ENOMEM;
794  		goto out;
795  	}
796  
797  	tfile->queue_index = tun->numqueues;
798  	tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
799  
800  	if (tfile->detached) {
801  		/* Re-attach detached tfile, updating XDP queue_index */
802  		WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));
803  
804  		if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
805  			tfile->xdp_rxq.queue_index = tfile->queue_index;
806  	} else {
807  		/* Setup XDP RX-queue info, for new tfile getting attached */
808  		err = xdp_rxq_info_reg(&tfile->xdp_rxq,
809  				       tun->dev, tfile->queue_index, 0);
810  		if (err < 0)
811  			goto out;
812  		err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
813  						 MEM_TYPE_PAGE_SHARED, NULL);
814  		if (err < 0) {
815  			xdp_rxq_info_unreg(&tfile->xdp_rxq);
816  			goto out;
817  		}
818  		err = 0;
819  	}
820  
821  	if (tfile->detached) {
822  		tun_enable_queue(tfile);
823  		tun_napi_enable(tfile);
824  	} else {
825  		sock_hold(&tfile->sk);
826  		tun_napi_init(tun, tfile, napi, napi_frags);
827  	}
828  
829  	if (rtnl_dereference(tun->xdp_prog))
830  		sock_set_flag(&tfile->sk, SOCK_XDP);
831  
832  	/* device is allowed to go away first, so no need to hold extra
833  	 * refcnt.
834  	 */
835  
836  	/* Publish tfile->tun and tun->tfiles only after we've fully
837  	 * initialized tfile; otherwise we risk using half-initialized
838  	 * object.
839  	 */
840  	if (publish_tun)
841  		rcu_assign_pointer(tfile->tun, tun);
842  	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
843  	tun->numqueues++;
844  	tun_set_real_num_queues(tun);
845  out:
846  	return err;
847  }
848  
849  static struct tun_struct *tun_get(struct tun_file *tfile)
850  {
851  	struct tun_struct *tun;
852  
853  	rcu_read_lock();
854  	tun = rcu_dereference(tfile->tun);
855  	if (tun)
856  		dev_hold(tun->dev);
857  	rcu_read_unlock();
858  
859  	return tun;
860  }
861  
862  static void tun_put(struct tun_struct *tun)
863  {
864  	dev_put(tun->dev);
865  }
866  
867  /* TAP filtering */
868  static void addr_hash_set(u32 *mask, const u8 *addr)
869  {
870  	int n = ether_crc(ETH_ALEN, addr) >> 26;
871  	mask[n >> 5] |= (1 << (n & 31));
872  }
873  
874  static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
875  {
876  	int n = ether_crc(ETH_ALEN, addr) >> 26;
877  	return mask[n >> 5] & (1 << (n & 31));
878  }
879  
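/* The two helpers above maintain a 64-bit hash filter: the top six bits of
 * the Ethernet CRC select one of 64 bits spread across mask[0..1]. A worked
 * example (the address value is illustrative only):
 *
 *	n = ether_crc(ETH_ALEN, addr) >> 26;	n in 0..63, say n = 45
 *	mask[45 >> 5] |= 1 << (45 & 31);	sets bit 13 of mask[1]
 *
 * Different addresses can map to the same bit, so a hash hit only means
 * "possibly wanted"; exact matches are kept separately in tap_filter.addr[].
 */
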
880  static int update_filter(struct tap_filter *filter, void __user *arg)
881  {
882  	struct { u8 u[ETH_ALEN]; } *addr;
883  	struct tun_filter uf;
884  	int err, alen, n, nexact;
885  
886  	if (copy_from_user(&uf, arg, sizeof(uf)))
887  		return -EFAULT;
888  
889  	if (!uf.count) {
890  		/* Disabled */
891  		filter->count = 0;
892  		return 0;
893  	}
894  
895  	alen = ETH_ALEN * uf.count;
896  	addr = memdup_user(arg + sizeof(uf), alen);
897  	if (IS_ERR(addr))
898  		return PTR_ERR(addr);
899  
900  	/* The filter is updated without holding any locks, which is
901  	 * perfectly safe: we disable it first, and in the worst
902  	 * case we'll accept a few undesired packets. */
903  	filter->count = 0;
904  	wmb();
905  
906  	/* Use first set of addresses as an exact filter */
907  	for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
908  		memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
909  
910  	nexact = n;
911  
912  	/* Remaining multicast addresses are hashed; any unicast
913  	 * address here leaves the filter disabled. */
914  	memset(filter->mask, 0, sizeof(filter->mask));
915  	for (; n < uf.count; n++) {
916  		if (!is_multicast_ether_addr(addr[n].u)) {
917  			err = 0; /* no filter */
918  			goto free_addr;
919  		}
920  		addr_hash_set(filter->mask, addr[n].u);
921  	}
922  
923  	/* For ALLMULTI just set the mask to all ones.
924  	 * This overrides the mask populated above. */
925  	if ((uf.flags & TUN_FLT_ALLMULTI))
926  		memset(filter->mask, ~0, sizeof(filter->mask));
927  
928  	/* Now enable the filter */
929  	wmb();
930  	filter->count = nexact;
931  
932  	/* Return the number of exact filters */
933  	err = nexact;
934  free_addr:
935  	kfree(addr);
936  	return err;
937  }
938  
939  /* Returns: 0 - drop, !=0 - accept */
940  static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
941  {
942  	/* Cannot use eth_hdr(skb) here because skb_mac_header() is incorrect
943  	 * at this point. */
944  	struct ethhdr *eh = (struct ethhdr *) skb->data;
945  	int i;
946  
947  	/* Exact match */
948  	for (i = 0; i < filter->count; i++)
949  		if (ether_addr_equal(eh->h_dest, filter->addr[i]))
950  			return 1;
951  
952  	/* Inexact match (multicast only) */
953  	if (is_multicast_ether_addr(eh->h_dest))
954  		return addr_hash_test(filter->mask, eh->h_dest);
955  
956  	return 0;
957  }
958  
959  /*
960   * Checks whether the packet is accepted or not.
961   * Returns: 0 - drop, !=0 - accept
962   */
963  static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
964  {
965  	if (!filter->count)
966  		return 1;
967  
968  	return run_filter(filter, skb);
969  }
970  
971  /* Network device part of the driver */
972  
973  static const struct ethtool_ops tun_ethtool_ops;
974  
975  static int tun_net_init(struct net_device *dev)
976  {
977  	struct tun_struct *tun = netdev_priv(dev);
978  	struct ifreq *ifr = tun->ifr;
979  	int err;
980  
981  	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
982  	if (!dev->tstats)
983  		return -ENOMEM;
984  
985  	spin_lock_init(&tun->lock);
986  
987  	err = security_tun_dev_alloc_security(&tun->security);
988  	if (err < 0) {
989  		free_percpu(dev->tstats);
990  		return err;
991  	}
992  
993  	tun_flow_init(tun);
994  
995  	dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
996  			   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
997  			   NETIF_F_HW_VLAN_STAG_TX;
998  	dev->features = dev->hw_features | NETIF_F_LLTX;
999  	dev->vlan_features = dev->features &
1000  			     ~(NETIF_F_HW_VLAN_CTAG_TX |
1001  			       NETIF_F_HW_VLAN_STAG_TX);
1002  
1003  	tun->flags = (tun->flags & ~TUN_FEATURES) |
1004  		      (ifr->ifr_flags & TUN_FEATURES);
1005  
1006  	INIT_LIST_HEAD(&tun->disabled);
1007  	err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI,
1008  			 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
1009  	if (err < 0) {
1010  		tun_flow_uninit(tun);
1011  		security_tun_dev_free_security(tun->security);
1012  		free_percpu(dev->tstats);
1013  		return err;
1014  	}
1015  	return 0;
1016  }
1017  
1018  /* Net device detach from fd. */
1019  static void tun_net_uninit(struct net_device *dev)
1020  {
1021  	tun_detach_all(dev);
1022  }
1023  
1024  /* Net device open. */
1025  static int tun_net_open(struct net_device *dev)
1026  {
1027  	netif_tx_start_all_queues(dev);
1028  
1029  	return 0;
1030  }
1031  
1032  /* Net device close. */
1033  static int tun_net_close(struct net_device *dev)
1034  {
1035  	netif_tx_stop_all_queues(dev);
1036  	return 0;
1037  }
1038  
1039  /* Net device start xmit */
1040  static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
1041  {
1042  #ifdef CONFIG_RPS
1043  	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
1044  		/* Select queue was not called for the skbuff, so we extract the
1045  		 * RPS hash and save it into the flow_table here.
1046  		 */
1047  		struct tun_flow_entry *e;
1048  		__u32 rxhash;
1049  
1050  		rxhash = __skb_get_hash_symmetric(skb);
1051  		e = tun_flow_find(&tun->flows[tun_hashfn(rxhash)], rxhash);
1052  		if (e)
1053  			tun_flow_save_rps_rxhash(e, rxhash);
1054  	}
1055  #endif
1056  }
1057  
1058  static unsigned int run_ebpf_filter(struct tun_struct *tun,
1059  				    struct sk_buff *skb,
1060  				    int len)
1061  {
1062  	struct tun_prog *prog = rcu_dereference(tun->filter_prog);
1063  
1064  	if (prog)
1065  		len = bpf_prog_run_clear_cb(prog->prog, skb);
1066  
1067  	return len;
1068  }
1069  
1070  /* Net device start xmit */
1071  static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
1072  {
1073  	struct tun_struct *tun = netdev_priv(dev);
1074  	enum skb_drop_reason drop_reason;
1075  	int txq = skb->queue_mapping;
1076  	struct netdev_queue *queue;
1077  	struct tun_file *tfile;
1078  	int len = skb->len;
1079  
1080  	rcu_read_lock();
1081  	tfile = rcu_dereference(tun->tfiles[txq]);
1082  
1083  	/* Drop packet if interface is not attached */
1084  	if (!tfile) {
1085  		drop_reason = SKB_DROP_REASON_DEV_READY;
1086  		goto drop;
1087  	}
1088  
1089  	if (!rcu_dereference(tun->steering_prog))
1090  		tun_automq_xmit(tun, skb);
1091  
1092  	netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);
1093  
1094  	/* Drop if the filter does not like it.
1095  	 * This is a noop if the filter is disabled.
1096  	 * The filter can be enabled only for TAP devices. */
1097  	if (!check_filter(&tun->txflt, skb)) {
1098  		drop_reason = SKB_DROP_REASON_TAP_TXFILTER;
1099  		goto drop;
1100  	}
1101  
1102  	if (tfile->socket.sk->sk_filter &&
1103  	    sk_filter(tfile->socket.sk, skb)) {
1104  		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
1105  		goto drop;
1106  	}
1107  
1108  	len = run_ebpf_filter(tun, skb, len);
1109  	if (len == 0) {
1110  		drop_reason = SKB_DROP_REASON_TAP_FILTER;
1111  		goto drop;
1112  	}
1113  
1114  	if (pskb_trim(skb, len)) {
1115  		drop_reason = SKB_DROP_REASON_NOMEM;
1116  		goto drop;
1117  	}
1118  
1119  	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
1120  		drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
1121  		goto drop;
1122  	}
1123  
1124  	skb_tx_timestamp(skb);
1125  
1126  	/* Orphan the skb - required as we might hang on to it
1127  	 * for an indefinite time.
1128  	 */
1129  	skb_orphan(skb);
1130  
1131  	nf_reset_ct(skb);
1132  
1133  	if (ptr_ring_produce(&tfile->tx_ring, skb)) {
1134  		drop_reason = SKB_DROP_REASON_FULL_RING;
1135  		goto drop;
1136  	}
1137  
1138  	/* NETIF_F_LLTX requires us to do our own update of trans_start */
1139  	queue = netdev_get_tx_queue(dev, txq);
1140  	txq_trans_cond_update(queue);
1141  
1142  	/* Notify and wake up reader process */
1143  	if (tfile->flags & TUN_FASYNC)
1144  		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1145  	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1146  
1147  	rcu_read_unlock();
1148  	return NETDEV_TX_OK;
1149  
1150  drop:
1151  	dev_core_stats_tx_dropped_inc(dev);
1152  	skb_tx_error(skb);
1153  	kfree_skb_reason(skb, drop_reason);
1154  	rcu_read_unlock();
1155  	return NET_XMIT_DROP;
1156  }
1157  
1158  static void tun_net_mclist(struct net_device *dev)
1159  {
1160  	/*
1161  	 * This callback is supposed to deal with mc filter in
1162  	 * _rx_ path and has nothing to do with the _tx_ path.
1163  	 * In rx path we always accept everything userspace gives us.
1164  	 */
1165  }
1166  
1167  static netdev_features_t tun_net_fix_features(struct net_device *dev,
1168  	netdev_features_t features)
1169  {
1170  	struct tun_struct *tun = netdev_priv(dev);
1171  
1172  	return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
1173  }
1174  
1175  static void tun_set_headroom(struct net_device *dev, int new_hr)
1176  {
1177  	struct tun_struct *tun = netdev_priv(dev);
1178  
1179  	if (new_hr < NET_SKB_PAD)
1180  		new_hr = NET_SKB_PAD;
1181  
1182  	tun->align = new_hr;
1183  }
1184  
1185  static void
1186  tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
1187  {
1188  	struct tun_struct *tun = netdev_priv(dev);
1189  
1190  	dev_get_tstats64(dev, stats);
1191  
1192  	stats->rx_frame_errors +=
1193  		(unsigned long)atomic_long_read(&tun->rx_frame_errors);
1194  }
1195  
1196  static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
1197  		       struct netlink_ext_ack *extack)
1198  {
1199  	struct tun_struct *tun = netdev_priv(dev);
1200  	struct tun_file *tfile;
1201  	struct bpf_prog *old_prog;
1202  	int i;
1203  
1204  	old_prog = rtnl_dereference(tun->xdp_prog);
1205  	rcu_assign_pointer(tun->xdp_prog, prog);
1206  	if (old_prog)
1207  		bpf_prog_put(old_prog);
1208  
1209  	for (i = 0; i < tun->numqueues; i++) {
1210  		tfile = rtnl_dereference(tun->tfiles[i]);
1211  		if (prog)
1212  			sock_set_flag(&tfile->sk, SOCK_XDP);
1213  		else
1214  			sock_reset_flag(&tfile->sk, SOCK_XDP);
1215  	}
1216  	list_for_each_entry(tfile, &tun->disabled, next) {
1217  		if (prog)
1218  			sock_set_flag(&tfile->sk, SOCK_XDP);
1219  		else
1220  			sock_reset_flag(&tfile->sk, SOCK_XDP);
1221  	}
1222  
1223  	return 0;
1224  }
1225  
1226  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
1227  {
1228  	switch (xdp->command) {
1229  	case XDP_SETUP_PROG:
1230  		return tun_xdp_set(dev, xdp->prog, xdp->extack);
1231  	default:
1232  		return -EINVAL;
1233  	}
1234  }
1235  
1236  static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
1237  {
1238  	if (new_carrier) {
1239  		struct tun_struct *tun = netdev_priv(dev);
1240  
1241  		if (!tun->numqueues)
1242  			return -EPERM;
1243  
1244  		netif_carrier_on(dev);
1245  	} else {
1246  		netif_carrier_off(dev);
1247  	}
1248  	return 0;
1249  }
1250  
1251  static const struct net_device_ops tun_netdev_ops = {
1252  	.ndo_init		= tun_net_init,
1253  	.ndo_uninit		= tun_net_uninit,
1254  	.ndo_open		= tun_net_open,
1255  	.ndo_stop		= tun_net_close,
1256  	.ndo_start_xmit		= tun_net_xmit,
1257  	.ndo_fix_features	= tun_net_fix_features,
1258  	.ndo_select_queue	= tun_select_queue,
1259  	.ndo_set_rx_headroom	= tun_set_headroom,
1260  	.ndo_get_stats64	= tun_net_get_stats64,
1261  	.ndo_change_carrier	= tun_net_change_carrier,
1262  };
1263  
1264  static void __tun_xdp_flush_tfile(struct tun_file *tfile)
1265  {
1266  	/* Notify and wake up reader process */
1267  	if (tfile->flags & TUN_FASYNC)
1268  		kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
1269  	tfile->socket.sk->sk_data_ready(tfile->socket.sk);
1270  }
1271  
1272  static int tun_xdp_xmit(struct net_device *dev, int n,
1273  			struct xdp_frame **frames, u32 flags)
1274  {
1275  	struct tun_struct *tun = netdev_priv(dev);
1276  	struct tun_file *tfile;
1277  	u32 numqueues;
1278  	int nxmit = 0;
1279  	int i;
1280  
1281  	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
1282  		return -EINVAL;
1283  
1284  	rcu_read_lock();
1285  
1286  resample:
1287  	numqueues = READ_ONCE(tun->numqueues);
1288  	if (!numqueues) {
1289  		rcu_read_unlock();
1290  		return -ENXIO; /* Caller will free/return all frames */
1291  	}
1292  
1293  	tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
1294  					    numqueues]);
1295  	if (unlikely(!tfile))
1296  		goto resample;
1297  
1298  	spin_lock(&tfile->tx_ring.producer_lock);
1299  	for (i = 0; i < n; i++) {
1300  		struct xdp_frame *xdp = frames[i];
1301  		/* Encode the XDP flag into the lowest bit so the consumer can
1302  		 * distinguish an XDP frame from an sk_buff.
1303  		 */
1304  		void *frame = tun_xdp_to_ptr(xdp);
1305  
1306  		if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
1307  			dev_core_stats_tx_dropped_inc(dev);
1308  			break;
1309  		}
1310  		nxmit++;
1311  	}
1312  	spin_unlock(&tfile->tx_ring.producer_lock);
1313  
1314  	if (flags & XDP_XMIT_FLUSH)
1315  		__tun_xdp_flush_tfile(tfile);
1316  
1317  	rcu_read_unlock();
1318  	return nxmit;
1319  }
1320  
1321  static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
1322  {
1323  	struct xdp_frame *frame = xdp_convert_buff_to_frame(xdp);
1324  	int nxmit;
1325  
1326  	if (unlikely(!frame))
1327  		return -EOVERFLOW;
1328  
1329  	nxmit = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
1330  	if (!nxmit)
1331  		xdp_return_frame_rx_napi(frame);
1332  	return nxmit;
1333  }
1334  
1335  static const struct net_device_ops tap_netdev_ops = {
1336  	.ndo_init		= tun_net_init,
1337  	.ndo_uninit		= tun_net_uninit,
1338  	.ndo_open		= tun_net_open,
1339  	.ndo_stop		= tun_net_close,
1340  	.ndo_start_xmit		= tun_net_xmit,
1341  	.ndo_fix_features	= tun_net_fix_features,
1342  	.ndo_set_rx_mode	= tun_net_mclist,
1343  	.ndo_set_mac_address	= eth_mac_addr,
1344  	.ndo_validate_addr	= eth_validate_addr,
1345  	.ndo_select_queue	= tun_select_queue,
1346  	.ndo_features_check	= passthru_features_check,
1347  	.ndo_set_rx_headroom	= tun_set_headroom,
1348  	.ndo_get_stats64	= dev_get_tstats64,
1349  	.ndo_bpf		= tun_xdp,
1350  	.ndo_xdp_xmit		= tun_xdp_xmit,
1351  	.ndo_change_carrier	= tun_net_change_carrier,
1352  };
1353  
1354  static void tun_flow_init(struct tun_struct *tun)
1355  {
1356  	int i;
1357  
1358  	for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++)
1359  		INIT_HLIST_HEAD(&tun->flows[i]);
1360  
1361  	tun->ageing_time = TUN_FLOW_EXPIRE;
1362  	timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
1363  	mod_timer(&tun->flow_gc_timer,
1364  		  round_jiffies_up(jiffies + tun->ageing_time));
1365  }
1366  
1367  static void tun_flow_uninit(struct tun_struct *tun)
1368  {
1369  	del_timer_sync(&tun->flow_gc_timer);
1370  	tun_flow_flush(tun);
1371  }
1372  
1373  #define MIN_MTU 68
1374  #define MAX_MTU 65535
1375  
1376  /* Initialize net device. */
1377  static void tun_net_initialize(struct net_device *dev)
1378  {
1379  	struct tun_struct *tun = netdev_priv(dev);
1380  
1381  	switch (tun->flags & TUN_TYPE_MASK) {
1382  	case IFF_TUN:
1383  		dev->netdev_ops = &tun_netdev_ops;
1384  		dev->header_ops = &ip_tunnel_header_ops;
1385  
1386  		/* Point-to-Point TUN Device */
1387  		dev->hard_header_len = 0;
1388  		dev->addr_len = 0;
1389  		dev->mtu = 1500;
1390  
1391  		/* Zero header length */
1392  		dev->type = ARPHRD_NONE;
1393  		dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
1394  		break;
1395  
1396  	case IFF_TAP:
1397  		dev->netdev_ops = &tap_netdev_ops;
1398  		/* Ethernet TAP Device */
1399  		ether_setup(dev);
1400  		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1401  		dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1402  
1403  		eth_hw_addr_random(dev);
1404  
1405  		/* Currently tun does not support XDP, only tap does. */
1406  		dev->xdp_features = NETDEV_XDP_ACT_BASIC |
1407  				    NETDEV_XDP_ACT_REDIRECT |
1408  				    NETDEV_XDP_ACT_NDO_XMIT;
1409  
1410  		break;
1411  	}
1412  
1413  	dev->min_mtu = MIN_MTU;
1414  	dev->max_mtu = MAX_MTU - dev->hard_header_len;
1415  }
1416  
1417  static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
1418  {
1419  	struct sock *sk = tfile->socket.sk;
1420  
1421  	return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
1422  }
1423  
1424  /* Character device part */
1425  
1426  /* Poll */
1427  static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
1428  {
1429  	struct tun_file *tfile = file->private_data;
1430  	struct tun_struct *tun = tun_get(tfile);
1431  	struct sock *sk;
1432  	__poll_t mask = 0;
1433  
1434  	if (!tun)
1435  		return EPOLLERR;
1436  
1437  	sk = tfile->socket.sk;
1438  
1439  	poll_wait(file, sk_sleep(sk), wait);
1440  
1441  	if (!ptr_ring_empty(&tfile->tx_ring))
1442  		mask |= EPOLLIN | EPOLLRDNORM;
1443  
1444  	/* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to
1445  	 * guarantee that EPOLLOUT is raised either here or by
1446  	 * tun_sock_write_space(), so the process can still get a
1447  	 * notification after it writes to a down device and gets -EIO.
1448  	 */
1449  	if (tun_sock_writeable(tun, tfile) ||
1450  	    (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
1451  	     tun_sock_writeable(tun, tfile)))
1452  		mask |= EPOLLOUT | EPOLLWRNORM;
1453  
1454  	if (tun->dev->reg_state != NETREG_REGISTERED)
1455  		mask = EPOLLERR;
1456  
1457  	tun_put(tun);
1458  	return mask;
1459  }
1460  
1461  static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
1462  					    size_t len,
1463  					    const struct iov_iter *it)
1464  {
1465  	struct sk_buff *skb;
1466  	size_t linear;
1467  	int err;
1468  	int i;
1469  
1470  	if (it->nr_segs > MAX_SKB_FRAGS + 1 ||
1471  	    len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN))
1472  		return ERR_PTR(-EMSGSIZE);
1473  
1474  	local_bh_disable();
1475  	skb = napi_get_frags(&tfile->napi);
1476  	local_bh_enable();
1477  	if (!skb)
1478  		return ERR_PTR(-ENOMEM);
1479  
1480  	linear = iov_iter_single_seg_count(it);
1481  	err = __skb_grow(skb, linear);
1482  	if (err)
1483  		goto free;
1484  
1485  	skb->len = len;
1486  	skb->data_len = len - linear;
1487  	skb->truesize += skb->data_len;
1488  
1489  	for (i = 1; i < it->nr_segs; i++) {
1490  		const struct iovec *iov = iter_iov(it) + i;
1491  		size_t fragsz = iov->iov_len;
1492  		struct page *page;
1493  		void *frag;
1494  
1495  		if (fragsz == 0 || fragsz > PAGE_SIZE) {
1496  			err = -EINVAL;
1497  			goto free;
1498  		}
1499  		frag = netdev_alloc_frag(fragsz);
1500  		if (!frag) {
1501  			err = -ENOMEM;
1502  			goto free;
1503  		}
1504  		page = virt_to_head_page(frag);
1505  		skb_fill_page_desc(skb, i - 1, page,
1506  				   frag - page_address(page), fragsz);
1507  	}
1508  
1509  	return skb;
1510  free:
1511  	/* frees skb and all frags allocated with napi_alloc_frag() */
1512  	napi_free_frags(&tfile->napi);
1513  	return ERR_PTR(err);
1514  }
1515  
1516  /* prepad is the amount to reserve at front.  len is length after that.
1517   * linear is a hint as to how much to copy (usually headers). */
1518  static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
1519  				     size_t prepad, size_t len,
1520  				     size_t linear, int noblock)
1521  {
1522  	struct sock *sk = tfile->socket.sk;
1523  	struct sk_buff *skb;
1524  	int err;
1525  
1526  	/* Under a page?  Don't bother with paged skb. */
1527  	if (prepad + len < PAGE_SIZE)
1528  		linear = len;
1529  
1530  	if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
1531  		linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
1532  	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1533  				   &err, PAGE_ALLOC_COSTLY_ORDER);
1534  	if (!skb)
1535  		return ERR_PTR(err);
1536  
1537  	skb_reserve(skb, prepad);
1538  	skb_put(skb, linear);
1539  	skb->data_len = len - linear;
1540  	skb->len += len - linear;
1541  
1542  	return skb;
1543  }
1544  
1545  static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
1546  			   struct sk_buff *skb, int more)
1547  {
1548  	struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1549  	struct sk_buff_head process_queue;
1550  	u32 rx_batched = tun->rx_batched;
1551  	bool rcv = false;
1552  
1553  	if (!rx_batched || (!more && skb_queue_empty(queue))) {
1554  		local_bh_disable();
1555  		skb_record_rx_queue(skb, tfile->queue_index);
1556  		netif_receive_skb(skb);
1557  		local_bh_enable();
1558  		return;
1559  	}
1560  
1561  	spin_lock(&queue->lock);
1562  	if (!more || skb_queue_len(queue) == rx_batched) {
1563  		__skb_queue_head_init(&process_queue);
1564  		skb_queue_splice_tail_init(queue, &process_queue);
1565  		rcv = true;
1566  	} else {
1567  		__skb_queue_tail(queue, skb);
1568  	}
1569  	spin_unlock(&queue->lock);
1570  
1571  	if (rcv) {
1572  		struct sk_buff *nskb;
1573  
1574  		local_bh_disable();
1575  		while ((nskb = __skb_dequeue(&process_queue))) {
1576  			skb_record_rx_queue(nskb, tfile->queue_index);
1577  			netif_receive_skb(nskb);
1578  		}
1579  		skb_record_rx_queue(skb, tfile->queue_index);
1580  		netif_receive_skb(skb);
1581  		local_bh_enable();
1582  	}
1583  }
1584  
1585  static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
1586  			      int len, int noblock, bool zerocopy)
1587  {
1588  	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
1589  		return false;
1590  
1591  	if (tfile->socket.sk->sk_sndbuf != INT_MAX)
1592  		return false;
1593  
1594  	if (!noblock)
1595  		return false;
1596  
1597  	if (zerocopy)
1598  		return false;
1599  
1600  	if (SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) +
1601  	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
1602  		return false;
1603  
1604  	return true;
1605  }
1606  
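/* Rough budget behind the size check above, assuming a 4 KiB page and
 * typical x86_64 values (NET_SKB_PAD 64, NET_IP_ALIGN 0, XDP_PACKET_HEADROOM
 * 256, skb_shared_info about 320 bytes after alignment); the exact numbers
 * are configuration dependent:
 *
 *	SKB_DATA_ALIGN(len + 320) + 320 <= 4096  =>  len up to roughly 3.4 KiB
 *
 * Larger frames fall back to the tun_alloc_skb()/tun_napi_alloc_frags()
 * paths instead of being built directly in a page fragment.
 */
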
1607  static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
1608  				       struct page_frag *alloc_frag, char *buf,
1609  				       int buflen, int len, int pad)
1610  {
1611  	struct sk_buff *skb = build_skb(buf, buflen);
1612  
1613  	if (!skb)
1614  		return ERR_PTR(-ENOMEM);
1615  
1616  	skb_reserve(skb, pad);
1617  	skb_put(skb, len);
1618  	skb_set_owner_w(skb, tfile->socket.sk);
1619  
1620  	get_page(alloc_frag->page);
1621  	alloc_frag->offset += buflen;
1622  
1623  	return skb;
1624  }
1625  
1626  static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
1627  		       struct xdp_buff *xdp, u32 act)
1628  {
1629  	int err;
1630  
1631  	switch (act) {
1632  	case XDP_REDIRECT:
1633  		err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
1634  		if (err) {
1635  			dev_core_stats_rx_dropped_inc(tun->dev);
1636  			return err;
1637  		}
1638  		dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
1639  		break;
1640  	case XDP_TX:
1641  		err = tun_xdp_tx(tun->dev, xdp);
1642  		if (err < 0) {
1643  			dev_core_stats_rx_dropped_inc(tun->dev);
1644  			return err;
1645  		}
1646  		dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
1647  		break;
1648  	case XDP_PASS:
1649  		break;
1650  	default:
1651  		bpf_warn_invalid_xdp_action(tun->dev, xdp_prog, act);
1652  		fallthrough;
1653  	case XDP_ABORTED:
1654  		trace_xdp_exception(tun->dev, xdp_prog, act);
1655  		fallthrough;
1656  	case XDP_DROP:
1657  		dev_core_stats_rx_dropped_inc(tun->dev);
1658  		break;
1659  	}
1660  
1661  	return act;
1662  }
1663  
1664  static struct sk_buff *tun_build_skb(struct tun_struct *tun,
1665  				     struct tun_file *tfile,
1666  				     struct iov_iter *from,
1667  				     struct virtio_net_hdr *hdr,
1668  				     int len, int *skb_xdp)
1669  {
1670  	struct page_frag *alloc_frag = &current->task_frag;
1671  	struct bpf_prog *xdp_prog;
1672  	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
1673  	char *buf;
1674  	size_t copied;
1675  	int pad = TUN_RX_PAD;
1676  	int err = 0;
1677  
1678  	rcu_read_lock();
1679  	xdp_prog = rcu_dereference(tun->xdp_prog);
1680  	if (xdp_prog)
1681  		pad += XDP_PACKET_HEADROOM;
1682  	buflen += SKB_DATA_ALIGN(len + pad);
1683  	rcu_read_unlock();
1684  
1685  	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
1686  	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
1687  		return ERR_PTR(-ENOMEM);
1688  
1689  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
1690  	copied = copy_page_from_iter(alloc_frag->page,
1691  				     alloc_frag->offset + pad,
1692  				     len, from);
1693  	if (copied != len)
1694  		return ERR_PTR(-EFAULT);
1695  
1696  	/* There's a small window in which an XDP program may be attached after
1697  	 * the check of xdp_prog above; this should be rare, and for simplicity
1698  	 * we run XDP on the skb in case the headroom is not enough.
1699  	 */
1700  	if (hdr->gso_type || !xdp_prog) {
1701  		*skb_xdp = 1;
1702  		return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
1703  				       pad);
1704  	}
1705  
1706  	*skb_xdp = 0;
1707  
1708  	local_bh_disable();
1709  	rcu_read_lock();
1710  	xdp_prog = rcu_dereference(tun->xdp_prog);
1711  	if (xdp_prog) {
1712  		struct xdp_buff xdp;
1713  		u32 act;
1714  
1715  		xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq);
1716  		xdp_prepare_buff(&xdp, buf, pad, len, false);
1717  
1718  		act = bpf_prog_run_xdp(xdp_prog, &xdp);
1719  		if (act == XDP_REDIRECT || act == XDP_TX) {
1720  			get_page(alloc_frag->page);
1721  			alloc_frag->offset += buflen;
1722  		}
1723  		err = tun_xdp_act(tun, xdp_prog, &xdp, act);
1724  		if (err < 0) {
1725  			if (act == XDP_REDIRECT || act == XDP_TX)
1726  				put_page(alloc_frag->page);
1727  			goto out;
1728  		}
1729  
1730  		if (err == XDP_REDIRECT)
1731  			xdp_do_flush();
1732  		if (err != XDP_PASS)
1733  			goto out;
1734  
1735  		pad = xdp.data - xdp.data_hard_start;
1736  		len = xdp.data_end - xdp.data;
1737  	}
1738  	rcu_read_unlock();
1739  	local_bh_enable();
1740  
1741  	return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
1742  
1743  out:
1744  	rcu_read_unlock();
1745  	local_bh_enable();
1746  	return NULL;
1747  }
1748  
1749  /* Get packet from user space buffer */
1750  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
1751  			    void *msg_control, struct iov_iter *from,
1752  			    int noblock, bool more)
1753  {
1754  	struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
1755  	struct sk_buff *skb;
1756  	size_t total_len = iov_iter_count(from);
1757  	size_t len = total_len, align = tun->align, linear;
1758  	struct virtio_net_hdr gso = { 0 };
1759  	int good_linear;
1760  	int copylen;
1761  	bool zerocopy = false;
1762  	int err;
1763  	u32 rxhash = 0;
1764  	int skb_xdp = 1;
1765  	bool frags = tun_napi_frags_enabled(tfile);
1766  	enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1767  
1768  	if (!(tun->flags & IFF_NO_PI)) {
1769  		if (len < sizeof(pi))
1770  			return -EINVAL;
1771  		len -= sizeof(pi);
1772  
1773  		if (!copy_from_iter_full(&pi, sizeof(pi), from))
1774  			return -EFAULT;
1775  	}
1776  
1777  	if (tun->flags & IFF_VNET_HDR) {
1778  		int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
1779  
1780  		if (len < vnet_hdr_sz)
1781  			return -EINVAL;
1782  		len -= vnet_hdr_sz;
1783  
1784  		if (!copy_from_iter_full(&gso, sizeof(gso), from))
1785  			return -EFAULT;
1786  
1787  		if ((gso.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1788  		    tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2 > tun16_to_cpu(tun, gso.hdr_len))
1789  			gso.hdr_len = cpu_to_tun16(tun, tun16_to_cpu(tun, gso.csum_start) + tun16_to_cpu(tun, gso.csum_offset) + 2);
1790  
1791  		if (tun16_to_cpu(tun, gso.hdr_len) > len)
1792  			return -EINVAL;
1793  		iov_iter_advance(from, vnet_hdr_sz - sizeof(gso));
1794  	}
1795  
1796  	if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
1797  		align += NET_IP_ALIGN;
1798  		if (unlikely(len < ETH_HLEN ||
1799  			     (gso.hdr_len && tun16_to_cpu(tun, gso.hdr_len) < ETH_HLEN)))
1800  			return -EINVAL;
1801  	}
1802  
1803  	good_linear = SKB_MAX_HEAD(align);
1804  
1805  	if (msg_control) {
1806  		struct iov_iter i = *from;
1807  
1808  		/* There are 256 bytes to be copied into the skb, so there is
1809  		 * enough room to expand the skb head in case it is needed.
1810  		 * The rest of the buffer is mapped from userspace.
1811  		 */
1812  		copylen = gso.hdr_len ? tun16_to_cpu(tun, gso.hdr_len) : GOODCOPY_LEN;
1813  		if (copylen > good_linear)
1814  			copylen = good_linear;
1815  		linear = copylen;
1816  		iov_iter_advance(&i, copylen);
1817  		if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
1818  			zerocopy = true;
1819  	}
1820  
1821  	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
1822  		/* For packets that are not easy to process here
1823  		 * (e.g. GSO or jumbo packets), XDP is done after the
1824  		 * skb has been created, via the generic XDP routine.
1825  		 */
1826  		skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
1827  		err = PTR_ERR_OR_ZERO(skb);
1828  		if (err)
1829  			goto drop;
1830  		if (!skb)
1831  			return total_len;
1832  	} else {
1833  		if (!zerocopy) {
1834  			copylen = len;
1835  			if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
1836  				linear = good_linear;
1837  			else
1838  				linear = tun16_to_cpu(tun, gso.hdr_len);
1839  		}
1840  
1841  		if (frags) {
1842  			mutex_lock(&tfile->napi_mutex);
1843  			skb = tun_napi_alloc_frags(tfile, copylen, from);
1844  			/* tun_napi_alloc_frags() enforces a layout for the skb.
1845  			 * If zerocopy is enabled, then this layout will be
1846  			 * overwritten by zerocopy_sg_from_iter().
1847  			 */
1848  			zerocopy = false;
1849  		} else {
1850  			if (!linear)
1851  				linear = min_t(size_t, good_linear, copylen);
1852  
1853  			skb = tun_alloc_skb(tfile, align, copylen, linear,
1854  					    noblock);
1855  		}
1856  
1857  		err = PTR_ERR_OR_ZERO(skb);
1858  		if (err)
1859  			goto drop;
1860  
1861  		if (zerocopy)
1862  			err = zerocopy_sg_from_iter(skb, from);
1863  		else
1864  			err = skb_copy_datagram_from_iter(skb, 0, from, len);
1865  
1866  		if (err) {
1867  			err = -EFAULT;
1868  			drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
1869  			goto drop;
1870  		}
1871  	}
1872  
1873  	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
1874  		atomic_long_inc(&tun->rx_frame_errors);
1875  		err = -EINVAL;
1876  		goto free_skb;
1877  	}
1878  
1879  	switch (tun->flags & TUN_TYPE_MASK) {
1880  	case IFF_TUN:
1881  		if (tun->flags & IFF_NO_PI) {
1882  			u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0;
1883  
1884  			switch (ip_version) {
1885  			case 4:
1886  				pi.proto = htons(ETH_P_IP);
1887  				break;
1888  			case 6:
1889  				pi.proto = htons(ETH_P_IPV6);
1890  				break;
1891  			default:
1892  				err = -EINVAL;
1893  				goto drop;
1894  			}
1895  		}
1896  
1897  		skb_reset_mac_header(skb);
1898  		skb->protocol = pi.proto;
1899  		skb->dev = tun->dev;
1900  		break;
1901  	case IFF_TAP:
1902  		if (frags && !pskb_may_pull(skb, ETH_HLEN)) {
1903  			err = -ENOMEM;
1904  			drop_reason = SKB_DROP_REASON_HDR_TRUNC;
1905  			goto drop;
1906  		}
1907  		skb->protocol = eth_type_trans(skb, tun->dev);
1908  		break;
1909  	}
1910  
1911  	/* copy skb_ubuf_info for callback when skb has no error */
1912  	if (zerocopy) {
1913  		skb_zcopy_init(skb, msg_control);
1914  	} else if (msg_control) {
1915  		struct ubuf_info *uarg = msg_control;
1916  		uarg->callback(NULL, uarg, false);
1917  	}
1918  
1919  	skb_reset_network_header(skb);
1920  	skb_probe_transport_header(skb);
1921  	skb_record_rx_queue(skb, tfile->queue_index);
1922  
1923  	if (skb_xdp) {
1924  		struct bpf_prog *xdp_prog;
1925  		int ret;
1926  
1927  		local_bh_disable();
1928  		rcu_read_lock();
1929  		xdp_prog = rcu_dereference(tun->xdp_prog);
1930  		if (xdp_prog) {
1931  			ret = do_xdp_generic(xdp_prog, skb);
1932  			if (ret != XDP_PASS) {
1933  				rcu_read_unlock();
1934  				local_bh_enable();
1935  				goto unlock_frags;
1936  			}
1937  		}
1938  		rcu_read_unlock();
1939  		local_bh_enable();
1940  	}
1941  
1942  	/* Compute the costly rx hash only if needed for flow updates.
1943  	 * There is a small possibility of out-of-order delivery while
1944  	 * switching queues, but it is not worth optimizing for.
1945  	 */
1946  	if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
1947  	    !tfile->detached)
1948  		rxhash = __skb_get_hash_symmetric(skb);
1949  
1950  	rcu_read_lock();
1951  	if (unlikely(!(tun->dev->flags & IFF_UP))) {
1952  		err = -EIO;
1953  		rcu_read_unlock();
1954  		drop_reason = SKB_DROP_REASON_DEV_READY;
1955  		goto drop;
1956  	}
1957  
1958  	if (frags) {
1959  		u32 headlen;
1960  
1961  		/* Exercise flow dissector code path. */
1962  		skb_push(skb, ETH_HLEN);
1963  		headlen = eth_get_headlen(tun->dev, skb->data,
1964  					  skb_headlen(skb));
1965  
1966  		if (unlikely(headlen > skb_headlen(skb))) {
1967  			WARN_ON_ONCE(1);
1968  			err = -ENOMEM;
1969  			dev_core_stats_rx_dropped_inc(tun->dev);
1970  napi_busy:
1971  			napi_free_frags(&tfile->napi);
1972  			rcu_read_unlock();
1973  			mutex_unlock(&tfile->napi_mutex);
1974  			return err;
1975  		}
1976  
1977  		if (likely(napi_schedule_prep(&tfile->napi))) {
1978  			local_bh_disable();
1979  			napi_gro_frags(&tfile->napi);
1980  			napi_complete(&tfile->napi);
1981  			local_bh_enable();
1982  		} else {
1983  			err = -EBUSY;
1984  			goto napi_busy;
1985  		}
1986  		mutex_unlock(&tfile->napi_mutex);
1987  	} else if (tfile->napi_enabled) {
1988  		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
1989  		int queue_len;
1990  
1991  		spin_lock_bh(&queue->lock);
1992  
1993  		if (unlikely(tfile->detached)) {
1994  			spin_unlock_bh(&queue->lock);
1995  			rcu_read_unlock();
1996  			err = -EBUSY;
1997  			goto free_skb;
1998  		}
1999  
2000  		__skb_queue_tail(queue, skb);
2001  		queue_len = skb_queue_len(queue);
2002  		spin_unlock(&queue->lock);
2003  
2004  		if (!more || queue_len > NAPI_POLL_WEIGHT)
2005  			napi_schedule(&tfile->napi);
2006  
2007  		local_bh_enable();
2008  	} else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
2009  		tun_rx_batched(tun, tfile, skb, more);
2010  	} else {
2011  		netif_rx(skb);
2012  	}
2013  	rcu_read_unlock();
2014  
2015  	preempt_disable();
2016  	dev_sw_netstats_rx_add(tun->dev, len);
2017  	preempt_enable();
2018  
2019  	if (rxhash)
2020  		tun_flow_update(tun, rxhash, tfile);
2021  
2022  	return total_len;
2023  
2024  drop:
2025  	if (err != -EAGAIN)
2026  		dev_core_stats_rx_dropped_inc(tun->dev);
2027  
2028  free_skb:
2029  	if (!IS_ERR_OR_NULL(skb))
2030  		kfree_skb_reason(skb, drop_reason);
2031  
2032  unlock_frags:
2033  	if (frags) {
2034  		tfile->napi.skb = NULL;
2035  		mutex_unlock(&tfile->napi_mutex);
2036  	}
2037  
2038  	return err ?: total_len;
2039  }
2040  
2041  static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
2042  {
2043  	struct file *file = iocb->ki_filp;
2044  	struct tun_file *tfile = file->private_data;
2045  	struct tun_struct *tun = tun_get(tfile);
2046  	ssize_t result;
2047  	int noblock = 0;
2048  
2049  	if (!tun)
2050  		return -EBADFD;
2051  
2052  	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
2053  		noblock = 1;
2054  
2055  	result = tun_get_user(tun, tfile, NULL, from, noblock, false);
2056  
2057  	tun_put(tun);
2058  	return result;
2059  }
2060  
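/* Copy an XDP frame to the user iov; when IFF_VNET_HDR is set, a zeroed
 * virtio_net_hdr is prepended and padded out to the configured vnet_hdr_sz.
 */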
2061  static ssize_t tun_put_user_xdp(struct tun_struct *tun,
2062  				struct tun_file *tfile,
2063  				struct xdp_frame *xdp_frame,
2064  				struct iov_iter *iter)
2065  {
2066  	int vnet_hdr_sz = 0;
2067  	size_t size = xdp_frame->len;
2068  	size_t ret;
2069  
2070  	if (tun->flags & IFF_VNET_HDR) {
2071  		struct virtio_net_hdr gso = { 0 };
2072  
2073  		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2074  		if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
2075  			return -EINVAL;
2076  		if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
2077  			     sizeof(gso)))
2078  			return -EFAULT;
2079  		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
2080  	}
2081  
2082  	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
2083  
2084  	preempt_disable();
2085  	dev_sw_netstats_tx_add(tun->dev, 1, ret);
2086  	preempt_enable();
2087  
2088  	return ret;
2089  }
2090  
2091  /* Put packet to the user space buffer */
2092  static ssize_t tun_put_user(struct tun_struct *tun,
2093  			    struct tun_file *tfile,
2094  			    struct sk_buff *skb,
2095  			    struct iov_iter *iter)
2096  {
2097  	struct tun_pi pi = { 0, skb->protocol };
2098  	ssize_t total;
2099  	int vlan_offset = 0;
2100  	int vlan_hlen = 0;
2101  	int vnet_hdr_sz = 0;
2102  
2103  	if (skb_vlan_tag_present(skb))
2104  		vlan_hlen = VLAN_HLEN;
2105  
2106  	if (tun->flags & IFF_VNET_HDR)
2107  		vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
2108  
2109  	total = skb->len + vlan_hlen + vnet_hdr_sz;
2110  
2111  	if (!(tun->flags & IFF_NO_PI)) {
2112  		if (iov_iter_count(iter) < sizeof(pi))
2113  			return -EINVAL;
2114  
2115  		total += sizeof(pi);
2116  		if (iov_iter_count(iter) < total) {
2117  			/* Packet will be stripped */
2118  			pi.flags |= TUN_PKT_STRIP;
2119  		}
2120  
2121  		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
2122  			return -EFAULT;
2123  	}
2124  
2125  	if (vnet_hdr_sz) {
2126  		struct virtio_net_hdr gso;
2127  
2128  		if (iov_iter_count(iter) < vnet_hdr_sz)
2129  			return -EINVAL;
2130  
2131  		if (virtio_net_hdr_from_skb(skb, &gso,
2132  					    tun_is_little_endian(tun), true,
2133  					    vlan_hlen)) {
2134  			struct skb_shared_info *sinfo = skb_shinfo(skb);
2135  
2136  			if (net_ratelimit()) {
2137  				netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
2138  					   sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
2139  					   tun16_to_cpu(tun, gso.hdr_len));
2140  				print_hex_dump(KERN_ERR, "tun: ",
2141  					       DUMP_PREFIX_NONE,
2142  					       16, 1, skb->head,
2143  					       min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
2144  			}
2145  			WARN_ON_ONCE(1);
2146  			return -EINVAL;
2147  		}
2148  
2149  		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
2150  			return -EFAULT;
2151  
2152  		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
2153  	}
2154  
2155  	if (vlan_hlen) {
2156  		int ret;
2157  		struct veth veth;
2158  
2159  		veth.h_vlan_proto = skb->vlan_proto;
2160  		veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
2161  
2162  		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
2163  
2164  		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
2165  		if (ret || !iov_iter_count(iter))
2166  			goto done;
2167  
2168  		ret = copy_to_iter(&veth, sizeof(veth), iter);
2169  		if (ret != sizeof(veth) || !iov_iter_count(iter))
2170  			goto done;
2171  	}
2172  
2173  	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
2174  
2175  done:
2176  	/* caller is in process context, so preempt_disable() suffices for the stats update */
2177  	preempt_disable();
2178  	dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen);
2179  	preempt_enable();
2180  
2181  	return total;
2182  }
2183  
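/* Consume one pointer from the per-queue tx_ring, sleeping on the socket
 * wait queue when the ring is empty and noblock is not set.
 */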
2184  static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
2185  {
2186  	DECLARE_WAITQUEUE(wait, current);
2187  	void *ptr = NULL;
2188  	int error = 0;
2189  
2190  	ptr = ptr_ring_consume(&tfile->tx_ring);
2191  	if (ptr)
2192  		goto out;
2193  	if (noblock) {
2194  		error = -EAGAIN;
2195  		goto out;
2196  	}
2197  
2198  	add_wait_queue(&tfile->socket.wq.wait, &wait);
2199  
2200  	while (1) {
2201  		set_current_state(TASK_INTERRUPTIBLE);
2202  		ptr = ptr_ring_consume(&tfile->tx_ring);
2203  		if (ptr)
2204  			break;
2205  		if (signal_pending(current)) {
2206  			error = -ERESTARTSYS;
2207  			break;
2208  		}
2209  		if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
2210  			error = -EFAULT;
2211  			break;
2212  		}
2213  
2214  		schedule();
2215  	}
2216  
2217  	__set_current_state(TASK_RUNNING);
2218  	remove_wait_queue(&tfile->socket.wq.wait, &wait);
2219  
2220  out:
2221  	*err = error;
2222  	return ptr;
2223  }
2224  
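/* Hand one queued entry to user space: XDP frames go through
 * tun_put_user_xdp(), sk_buffs through tun_put_user().
 */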
2225  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
2226  			   struct iov_iter *to,
2227  			   int noblock, void *ptr)
2228  {
2229  	ssize_t ret;
2230  	int err;
2231  
2232  	if (!iov_iter_count(to)) {
2233  		tun_ptr_free(ptr);
2234  		return 0;
2235  	}
2236  
2237  	if (!ptr) {
2238  		/* Read frames from ring */
2239  		ptr = tun_ring_recv(tfile, noblock, &err);
2240  		if (!ptr)
2241  			return err;
2242  	}
2243  
2244  	if (tun_is_xdp_frame(ptr)) {
2245  		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2246  
2247  		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
2248  		xdp_return_frame(xdpf);
2249  	} else {
2250  		struct sk_buff *skb = ptr;
2251  
2252  		ret = tun_put_user(tun, tfile, skb, to);
2253  		if (unlikely(ret < 0))
2254  			kfree_skb(skb);
2255  		else
2256  			consume_skb(skb);
2257  	}
2258  
2259  	return ret;
2260  }
2261  
2262  static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
2263  {
2264  	struct file *file = iocb->ki_filp;
2265  	struct tun_file *tfile = file->private_data;
2266  	struct tun_struct *tun = tun_get(tfile);
2267  	ssize_t len = iov_iter_count(to), ret;
2268  	int noblock = 0;
2269  
2270  	if (!tun)
2271  		return -EBADFD;
2272  
2273  	if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
2274  		noblock = 1;
2275  
2276  	ret = tun_do_read(tun, tfile, to, noblock, NULL);
2277  	ret = min_t(ssize_t, ret, len);
2278  	if (ret > 0)
2279  		iocb->ki_pos = ret;
2280  	tun_put(tun);
2281  	return ret;
2282  }
2283  
2284  static void tun_prog_free(struct rcu_head *rcu)
2285  {
2286  	struct tun_prog *prog = container_of(rcu, struct tun_prog, rcu);
2287  
2288  	bpf_prog_destroy(prog->prog);
2289  	kfree(prog);
2290  }
2291  
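/* Swap in a new steering/filter program under tun->lock and free the old
 * one after an RCU grace period via tun_prog_free().
 */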
2292  static int __tun_set_ebpf(struct tun_struct *tun,
2293  			  struct tun_prog __rcu **prog_p,
2294  			  struct bpf_prog *prog)
2295  {
2296  	struct tun_prog *old, *new = NULL;
2297  
2298  	if (prog) {
2299  		new = kmalloc(sizeof(*new), GFP_KERNEL);
2300  		if (!new)
2301  			return -ENOMEM;
2302  		new->prog = prog;
2303  	}
2304  
2305  	spin_lock_bh(&tun->lock);
2306  	old = rcu_dereference_protected(*prog_p,
2307  					lockdep_is_held(&tun->lock));
2308  	rcu_assign_pointer(*prog_p, new);
2309  	spin_unlock_bh(&tun->lock);
2310  
2311  	if (old)
2312  		call_rcu(&old->rcu, tun_prog_free);
2313  
2314  	return 0;
2315  }
2316  
2317  static void tun_free_netdev(struct net_device *dev)
2318  {
2319  	struct tun_struct *tun = netdev_priv(dev);
2320  
2321  	BUG_ON(!(list_empty(&tun->disabled)));
2322  
2323  	free_percpu(dev->tstats);
2324  	tun_flow_uninit(tun);
2325  	security_tun_dev_free_security(tun->security);
2326  	__tun_set_ebpf(tun, &tun->steering_prog, NULL);
2327  	__tun_set_ebpf(tun, &tun->filter_prog, NULL);
2328  }
2329  
2330  static void tun_setup(struct net_device *dev)
2331  {
2332  	struct tun_struct *tun = netdev_priv(dev);
2333  
2334  	tun->owner = INVALID_UID;
2335  	tun->group = INVALID_GID;
2336  	tun_default_link_ksettings(dev, &tun->link_ksettings);
2337  
2338  	dev->ethtool_ops = &tun_ethtool_ops;
2339  	dev->needs_free_netdev = true;
2340  	dev->priv_destructor = tun_free_netdev;
2341  	/* We prefer our own queue length */
2342  	dev->tx_queue_len = TUN_READQ_SIZE;
2343  }
2344  
2345  /* Trivial set of netlink ops to allow deleting tun or tap
2346   * device with netlink.
2347   */
2348  static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
2349  			struct netlink_ext_ack *extack)
2350  {
2351  	NL_SET_ERR_MSG(extack,
2352  		       "tun/tap creation via rtnetlink is not supported.");
2353  	return -EOPNOTSUPP;
2354  }
2355  
2356  static size_t tun_get_size(const struct net_device *dev)
2357  {
2358  	BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
2359  	BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));
2360  
2361  	return nla_total_size(sizeof(uid_t)) + /* OWNER */
2362  	       nla_total_size(sizeof(gid_t)) + /* GROUP */
2363  	       nla_total_size(sizeof(u8)) + /* TYPE */
2364  	       nla_total_size(sizeof(u8)) + /* PI */
2365  	       nla_total_size(sizeof(u8)) + /* VNET_HDR */
2366  	       nla_total_size(sizeof(u8)) + /* PERSIST */
2367  	       nla_total_size(sizeof(u8)) + /* MULTI_QUEUE */
2368  	       nla_total_size(sizeof(u32)) + /* NUM_QUEUES */
2369  	       nla_total_size(sizeof(u32)) + /* NUM_DISABLED_QUEUES */
2370  	       0;
2371  }
2372  
2373  static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
2374  {
2375  	struct tun_struct *tun = netdev_priv(dev);
2376  
2377  	if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
2378  		goto nla_put_failure;
2379  	if (uid_valid(tun->owner) &&
2380  	    nla_put_u32(skb, IFLA_TUN_OWNER,
2381  			from_kuid_munged(current_user_ns(), tun->owner)))
2382  		goto nla_put_failure;
2383  	if (gid_valid(tun->group) &&
2384  	    nla_put_u32(skb, IFLA_TUN_GROUP,
2385  			from_kgid_munged(current_user_ns(), tun->group)))
2386  		goto nla_put_failure;
2387  	if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
2388  		goto nla_put_failure;
2389  	if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
2390  		goto nla_put_failure;
2391  	if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
2392  		goto nla_put_failure;
2393  	if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
2394  		       !!(tun->flags & IFF_MULTI_QUEUE)))
2395  		goto nla_put_failure;
2396  	if (tun->flags & IFF_MULTI_QUEUE) {
2397  		if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
2398  			goto nla_put_failure;
2399  		if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
2400  				tun->numdisabled))
2401  			goto nla_put_failure;
2402  	}
2403  
2404  	return 0;
2405  
2406  nla_put_failure:
2407  	return -EMSGSIZE;
2408  }
2409  
2410  static struct rtnl_link_ops tun_link_ops __read_mostly = {
2411  	.kind		= DRV_NAME,
2412  	.priv_size	= sizeof(struct tun_struct),
2413  	.setup		= tun_setup,
2414  	.validate	= tun_validate,
2415  	.get_size       = tun_get_size,
2416  	.fill_info      = tun_fill_info,
2417  };
2418  
2419  static void tun_sock_write_space(struct sock *sk)
2420  {
2421  	struct tun_file *tfile;
2422  	wait_queue_head_t *wqueue;
2423  
2424  	if (!sock_writeable(sk))
2425  		return;
2426  
2427  	if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
2428  		return;
2429  
2430  	wqueue = sk_sleep(sk);
2431  	if (wqueue && waitqueue_active(wqueue))
2432  		wake_up_interruptible_sync_poll(wqueue, EPOLLOUT |
2433  						EPOLLWRNORM | EPOLLWRBAND);
2434  
2435  	tfile = container_of(sk, struct tun_file, sk);
2436  	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
2437  }
2438  
2439  static void tun_put_page(struct tun_page *tpage)
2440  {
2441  	if (tpage->page)
2442  		__page_frag_cache_drain(tpage->page, tpage->count);
2443  }
2444  
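/* Process one xdp_buff handed in via sendmsg(TUN_MSG_PTR): run the
 * attached XDP program if any, then build an skb and feed it to the
 * stack (or queue it for NAPI).  Returns the number of skbs queued for
 * NAPI (0 or 1), or a negative errno on failure.
 */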
2445  static int tun_xdp_one(struct tun_struct *tun,
2446  		       struct tun_file *tfile,
2447  		       struct xdp_buff *xdp, int *flush,
2448  		       struct tun_page *tpage)
2449  {
2450  	unsigned int datasize = xdp->data_end - xdp->data;
2451  	struct tun_xdp_hdr *hdr = xdp->data_hard_start;
2452  	struct virtio_net_hdr *gso = &hdr->gso;
2453  	struct bpf_prog *xdp_prog;
2454  	struct sk_buff *skb = NULL;
2455  	struct sk_buff_head *queue;
2456  	u32 rxhash = 0, act;
2457  	int buflen = hdr->buflen;
2458  	int ret = 0;
2459  	bool skb_xdp = false;
2460  	struct page *page;
2461  
2462  	if (unlikely(datasize < ETH_HLEN))
2463  		return -EINVAL;
2464  
2465  	xdp_prog = rcu_dereference(tun->xdp_prog);
2466  	if (xdp_prog) {
2467  		if (gso->gso_type) {
2468  			skb_xdp = true;
2469  			goto build;
2470  		}
2471  
2472  		xdp_init_buff(xdp, buflen, &tfile->xdp_rxq);
2473  		xdp_set_data_meta_invalid(xdp);
2474  
2475  		act = bpf_prog_run_xdp(xdp_prog, xdp);
2476  		ret = tun_xdp_act(tun, xdp_prog, xdp, act);
2477  		if (ret < 0) {
2478  			put_page(virt_to_head_page(xdp->data));
2479  			return ret;
2480  		}
2481  
2482  		switch (ret) {
2483  		case XDP_REDIRECT:
2484  			*flush = true;
2485  			fallthrough;
2486  		case XDP_TX:
2487  			return 0;
2488  		case XDP_PASS:
2489  			break;
2490  		default:
2491  			page = virt_to_head_page(xdp->data);
2492  			if (tpage->page == page) {
2493  				++tpage->count;
2494  			} else {
2495  				tun_put_page(tpage);
2496  				tpage->page = page;
2497  				tpage->count = 1;
2498  			}
2499  			return 0;
2500  		}
2501  	}
2502  
2503  build:
2504  	skb = build_skb(xdp->data_hard_start, buflen);
2505  	if (!skb) {
2506  		ret = -ENOMEM;
2507  		goto out;
2508  	}
2509  
2510  	skb_reserve(skb, xdp->data - xdp->data_hard_start);
2511  	skb_put(skb, xdp->data_end - xdp->data);
2512  
2513  	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
2514  		atomic_long_inc(&tun->rx_frame_errors);
2515  		kfree_skb(skb);
2516  		ret = -EINVAL;
2517  		goto out;
2518  	}
2519  
2520  	skb->protocol = eth_type_trans(skb, tun->dev);
2521  	skb_reset_network_header(skb);
2522  	skb_probe_transport_header(skb);
2523  	skb_record_rx_queue(skb, tfile->queue_index);
2524  
2525  	if (skb_xdp) {
2526  		ret = do_xdp_generic(xdp_prog, skb);
2527  		if (ret != XDP_PASS) {
2528  			ret = 0;
2529  			goto out;
2530  		}
2531  	}
2532  
2533  	if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
2534  	    !tfile->detached)
2535  		rxhash = __skb_get_hash_symmetric(skb);
2536  
2537  	if (tfile->napi_enabled) {
2538  		queue = &tfile->sk.sk_write_queue;
2539  		spin_lock(&queue->lock);
2540  
2541  		if (unlikely(tfile->detached)) {
2542  			spin_unlock(&queue->lock);
2543  			kfree_skb(skb);
2544  			return -EBUSY;
2545  		}
2546  
2547  		__skb_queue_tail(queue, skb);
2548  		spin_unlock(&queue->lock);
2549  		ret = 1;
2550  	} else {
2551  		netif_receive_skb(skb);
2552  		ret = 0;
2553  	}
2554  
2555  	/* No need to disable preemption here since this function is
2556  	 * always called with bh disabled
2557  	 */
2558  	dev_sw_netstats_rx_add(tun->dev, datasize);
2559  
2560  	if (rxhash)
2561  		tun_flow_update(tun, rxhash, tfile);
2562  
2563  out:
2564  	return ret;
2565  }
2566  
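/* sendmsg() on the tun socket (as used by vhost-net): TUN_MSG_PTR carries
 * a batch of xdp_buffs which are fed to tun_xdp_one(); anything else
 * falls back to tun_get_user().
 */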
2567  static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
2568  {
2569  	int ret, i;
2570  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2571  	struct tun_struct *tun = tun_get(tfile);
2572  	struct tun_msg_ctl *ctl = m->msg_control;
2573  	struct xdp_buff *xdp;
2574  
2575  	if (!tun)
2576  		return -EBADFD;
2577  
2578  	if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
2579  	    ctl && ctl->type == TUN_MSG_PTR) {
2580  		struct tun_page tpage;
2581  		int n = ctl->num;
2582  		int flush = 0, queued = 0;
2583  
2584  		memset(&tpage, 0, sizeof(tpage));
2585  
2586  		local_bh_disable();
2587  		rcu_read_lock();
2588  
2589  		for (i = 0; i < n; i++) {
2590  			xdp = &((struct xdp_buff *)ctl->ptr)[i];
2591  			ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
2592  			if (ret > 0)
2593  				queued += ret;
2594  		}
2595  
2596  		if (flush)
2597  			xdp_do_flush();
2598  
2599  		if (tfile->napi_enabled && queued > 0)
2600  			napi_schedule(&tfile->napi);
2601  
2602  		rcu_read_unlock();
2603  		local_bh_enable();
2604  
2605  		tun_put_page(&tpage);
2606  
2607  		ret = total_len;
2608  		goto out;
2609  	}
2610  
2611  	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
2612  			   m->msg_flags & MSG_DONTWAIT,
2613  			   m->msg_flags & MSG_MORE);
2614  out:
2615  	tun_put(tun);
2616  	return ret;
2617  }
2618  
2619  static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
2620  		       int flags)
2621  {
2622  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2623  	struct tun_struct *tun = tun_get(tfile);
2624  	void *ptr = m->msg_control;
2625  	int ret;
2626  
2627  	if (!tun) {
2628  		ret = -EBADFD;
2629  		goto out_free;
2630  	}
2631  
2632  	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
2633  		ret = -EINVAL;
2634  		goto out_put_tun;
2635  	}
2636  	if (flags & MSG_ERRQUEUE) {
2637  		ret = sock_recv_errqueue(sock->sk, m, total_len,
2638  					 SOL_PACKET, TUN_TX_TIMESTAMP);
2639  		goto out;
2640  	}
2641  	ret = tun_do_read(tun, tfile, &m->msg_iter, flags & MSG_DONTWAIT, ptr);
2642  	if (ret > (ssize_t)total_len) {
2643  		m->msg_flags |= MSG_TRUNC;
2644  		ret = flags & MSG_TRUNC ? ret : total_len;
2645  	}
2646  out:
2647  	tun_put(tun);
2648  	return ret;
2649  
2650  out_put_tun:
2651  	tun_put(tun);
2652  out_free:
2653  	tun_ptr_free(ptr);
2654  	return ret;
2655  }
2656  
2657  static int tun_ptr_peek_len(void *ptr)
2658  {
2659  	if (likely(ptr)) {
2660  		if (tun_is_xdp_frame(ptr)) {
2661  			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
2662  
2663  			return xdpf->len;
2664  		}
2665  		return __skb_array_len_with_tag(ptr);
2666  	} else {
2667  		return 0;
2668  	}
2669  }
2670  
2671  static int tun_peek_len(struct socket *sock)
2672  {
2673  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
2674  	struct tun_struct *tun;
2675  	int ret = 0;
2676  
2677  	tun = tun_get(tfile);
2678  	if (!tun)
2679  		return 0;
2680  
2681  	ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
2682  	tun_put(tun);
2683  
2684  	return ret;
2685  }
2686  
2687  /* Ops structure to mimic raw sockets with tun */
2688  static const struct proto_ops tun_socket_ops = {
2689  	.peek_len = tun_peek_len,
2690  	.sendmsg = tun_sendmsg,
2691  	.recvmsg = tun_recvmsg,
2692  };
2693  
2694  static struct proto tun_proto = {
2695  	.name		= "tun",
2696  	.owner		= THIS_MODULE,
2697  	.obj_size	= sizeof(struct tun_file),
2698  };
2699  
2700  static int tun_flags(struct tun_struct *tun)
2701  {
2702  	return tun->flags & (TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP);
2703  }
2704  
2705  static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr,
2706  			      char *buf)
2707  {
2708  	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2709  	return sysfs_emit(buf, "0x%x\n", tun_flags(tun));
2710  }
2711  
2712  static ssize_t owner_show(struct device *dev, struct device_attribute *attr,
2713  			  char *buf)
2714  {
2715  	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2716  	return uid_valid(tun->owner)?
2717  		sysfs_emit(buf, "%u\n",
2718  			   from_kuid_munged(current_user_ns(), tun->owner)) :
2719  		sysfs_emit(buf, "-1\n");
2720  }
2721  
2722  static ssize_t group_show(struct device *dev, struct device_attribute *attr,
2723  			  char *buf)
2724  {
2725  	struct tun_struct *tun = netdev_priv(to_net_dev(dev));
2726  	return gid_valid(tun->group) ?
2727  		sysfs_emit(buf, "%u\n",
2728  			   from_kgid_munged(current_user_ns(), tun->group)) :
2729  		sysfs_emit(buf, "-1\n");
2730  }
2731  
2732  static DEVICE_ATTR_RO(tun_flags);
2733  static DEVICE_ATTR_RO(owner);
2734  static DEVICE_ATTR_RO(group);
2735  
2736  static struct attribute *tun_dev_attrs[] = {
2737  	&dev_attr_tun_flags.attr,
2738  	&dev_attr_owner.attr,
2739  	&dev_attr_group.attr,
2740  	NULL
2741  };
2742  
2743  static const struct attribute_group tun_attr_group = {
2744  	.attrs = tun_dev_attrs
2745  };
2746  
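/* TUNSETIFF: attach the caller's fd to an existing tun/tap device or
 * create a new one.  A minimal userspace sketch (illustrative only; the
 * interface name "tap0" is an example):
 *
 *	struct ifreq ifr;
 *	int fd = open("/dev/net/tun", O_RDWR);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
 *	strncpy(ifr.ifr_name, "tap0", IFNAMSIZ);
 *	if (ioctl(fd, TUNSETIFF, &ifr) < 0)
 *		... handle the error ...
 *
 * After this, read()/write() on fd carry frames for the device.
 */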
2747  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
2748  {
2749  	struct tun_struct *tun;
2750  	struct tun_file *tfile = file->private_data;
2751  	struct net_device *dev;
2752  	int err;
2753  
2754  	if (tfile->detached)
2755  		return -EINVAL;
2756  
2757  	if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
2758  		if (!capable(CAP_NET_ADMIN))
2759  			return -EPERM;
2760  
2761  		if (!(ifr->ifr_flags & IFF_NAPI) ||
2762  		    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
2763  			return -EINVAL;
2764  	}
2765  
2766  	dev = __dev_get_by_name(net, ifr->ifr_name);
2767  	if (dev) {
2768  		if (ifr->ifr_flags & IFF_TUN_EXCL)
2769  			return -EBUSY;
2770  		if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
2771  			tun = netdev_priv(dev);
2772  		else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
2773  			tun = netdev_priv(dev);
2774  		else
2775  			return -EINVAL;
2776  
2777  		if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
2778  		    !!(tun->flags & IFF_MULTI_QUEUE))
2779  			return -EINVAL;
2780  
2781  		if (tun_not_capable(tun))
2782  			return -EPERM;
2783  		err = security_tun_dev_open(tun->security);
2784  		if (err < 0)
2785  			return err;
2786  
2787  		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
2788  				 ifr->ifr_flags & IFF_NAPI,
2789  				 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
2790  		if (err < 0)
2791  			return err;
2792  
2793  		if (tun->flags & IFF_MULTI_QUEUE &&
2794  		    (tun->numqueues + tun->numdisabled > 1)) {
2795  			/* One or more queues have already been attached; no need
2796  			 * to initialize the device again.
2797  			 */
2798  			netdev_state_change(dev);
2799  			return 0;
2800  		}
2801  
2802  		tun->flags = (tun->flags & ~TUN_FEATURES) |
2803  			      (ifr->ifr_flags & TUN_FEATURES);
2804  
2805  		netdev_state_change(dev);
2806  	} else {
2807  		char *name;
2808  		unsigned long flags = 0;
2809  		int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
2810  			     MAX_TAP_QUEUES : 1;
2811  
2812  		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2813  			return -EPERM;
2814  		err = security_tun_dev_create();
2815  		if (err < 0)
2816  			return err;
2817  
2818  		/* Set dev type */
2819  		if (ifr->ifr_flags & IFF_TUN) {
2820  			/* TUN device */
2821  			flags |= IFF_TUN;
2822  			name = "tun%d";
2823  		} else if (ifr->ifr_flags & IFF_TAP) {
2824  			/* TAP device */
2825  			flags |= IFF_TAP;
2826  			name = "tap%d";
2827  		} else
2828  			return -EINVAL;
2829  
2830  		if (*ifr->ifr_name)
2831  			name = ifr->ifr_name;
2832  
2833  		dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
2834  				       NET_NAME_UNKNOWN, tun_setup, queues,
2835  				       queues);
2836  
2837  		if (!dev)
2838  			return -ENOMEM;
2839  
2840  		dev_net_set(dev, net);
2841  		dev->rtnl_link_ops = &tun_link_ops;
2842  		dev->ifindex = tfile->ifindex;
2843  		dev->sysfs_groups[0] = &tun_attr_group;
2844  
2845  		tun = netdev_priv(dev);
2846  		tun->dev = dev;
2847  		tun->flags = flags;
2848  		tun->txflt.count = 0;
2849  		tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
2850  
2851  		tun->align = NET_SKB_PAD;
2852  		tun->filter_attached = false;
2853  		tun->sndbuf = tfile->socket.sk->sk_sndbuf;
2854  		tun->rx_batched = 0;
2855  		RCU_INIT_POINTER(tun->steering_prog, NULL);
2856  
2857  		tun->ifr = ifr;
2858  		tun->file = file;
2859  
2860  		tun_net_initialize(dev);
2861  
2862  		err = register_netdevice(tun->dev);
2863  		if (err < 0) {
2864  			free_netdev(dev);
2865  			return err;
2866  		}
2867  		/* free_netdev() won't check the refcnt; to avoid a race
2868  		 * with dev_put() we must publish tun only after registration.
2869  		 */
2870  		rcu_assign_pointer(tfile->tun, tun);
2871  	}
2872  
2873  	if (ifr->ifr_flags & IFF_NO_CARRIER)
2874  		netif_carrier_off(tun->dev);
2875  	else
2876  		netif_carrier_on(tun->dev);
2877  
2878  	/* Make sure persistent devices do not get stuck in
2879  	 * xoff state.
2880  	 */
2881  	if (netif_running(tun->dev))
2882  		netif_tx_wake_all_queues(tun->dev);
2883  
2884  	strcpy(ifr->ifr_name, tun->dev->name);
2885  	return 0;
2886  }
2887  
2888  static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
2889  {
2890  	strcpy(ifr->ifr_name, tun->dev->name);
2891  
2892  	ifr->ifr_flags = tun_flags(tun);
2893  
2894  }
2895  
2896  /* This is like a cut-down ethtool ops, except done via tun fd so no
2897   * privs required. */
2898  static int set_offload(struct tun_struct *tun, unsigned long arg)
2899  {
2900  	netdev_features_t features = 0;
2901  
2902  	if (arg & TUN_F_CSUM) {
2903  		features |= NETIF_F_HW_CSUM;
2904  		arg &= ~TUN_F_CSUM;
2905  
2906  		if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
2907  			if (arg & TUN_F_TSO_ECN) {
2908  				features |= NETIF_F_TSO_ECN;
2909  				arg &= ~TUN_F_TSO_ECN;
2910  			}
2911  			if (arg & TUN_F_TSO4)
2912  				features |= NETIF_F_TSO;
2913  			if (arg & TUN_F_TSO6)
2914  				features |= NETIF_F_TSO6;
2915  			arg &= ~(TUN_F_TSO4|TUN_F_TSO6);
2916  		}
2917  
2918  		arg &= ~TUN_F_UFO;
2919  
2920  		/* TODO: for now USO4 and USO6 must be requested together */
2921  		if (arg & TUN_F_USO4 && arg & TUN_F_USO6) {
2922  			features |= NETIF_F_GSO_UDP_L4;
2923  			arg &= ~(TUN_F_USO4 | TUN_F_USO6);
2924  		}
2925  	}
2926  
2927  	/* This gives the user a way to test for new features in future by
2928  	 * trying to set them. */
2929  	if (arg)
2930  		return -EINVAL;
2931  
2932  	tun->set_features = features;
2933  	tun->dev->wanted_features &= ~TUN_USER_FEATURES;
2934  	tun->dev->wanted_features |= features;
2935  	netdev_update_features(tun->dev);
2936  
2937  	return 0;
2938  }
2939  
2940  static void tun_detach_filter(struct tun_struct *tun, int n)
2941  {
2942  	int i;
2943  	struct tun_file *tfile;
2944  
2945  	for (i = 0; i < n; i++) {
2946  		tfile = rtnl_dereference(tun->tfiles[i]);
2947  		lock_sock(tfile->socket.sk);
2948  		sk_detach_filter(tfile->socket.sk);
2949  		release_sock(tfile->socket.sk);
2950  	}
2951  
2952  	tun->filter_attached = false;
2953  }
2954  
2955  static int tun_attach_filter(struct tun_struct *tun)
2956  {
2957  	int i, ret = 0;
2958  	struct tun_file *tfile;
2959  
2960  	for (i = 0; i < tun->numqueues; i++) {
2961  		tfile = rtnl_dereference(tun->tfiles[i]);
2962  		lock_sock(tfile->socket.sk);
2963  		ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
2964  		release_sock(tfile->socket.sk);
2965  		if (ret) {
2966  			tun_detach_filter(tun, i);
2967  			return ret;
2968  		}
2969  	}
2970  
2971  	tun->filter_attached = true;
2972  	return ret;
2973  }
2974  
2975  static void tun_set_sndbuf(struct tun_struct *tun)
2976  {
2977  	struct tun_file *tfile;
2978  	int i;
2979  
2980  	for (i = 0; i < tun->numqueues; i++) {
2981  		tfile = rtnl_dereference(tun->tfiles[i]);
2982  		tfile->socket.sk->sk_sndbuf = tun->sndbuf;
2983  	}
2984  }
2985  
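/* TUNSETQUEUE: re-attach a detached queue (IFF_ATTACH_QUEUE) or detach an
 * attached one (IFF_DETACH_QUEUE) on a multi-queue device.
 */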
2986  static int tun_set_queue(struct file *file, struct ifreq *ifr)
2987  {
2988  	struct tun_file *tfile = file->private_data;
2989  	struct tun_struct *tun;
2990  	int ret = 0;
2991  
2992  	rtnl_lock();
2993  
2994  	if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
2995  		tun = tfile->detached;
2996  		if (!tun) {
2997  			ret = -EINVAL;
2998  			goto unlock;
2999  		}
3000  		ret = security_tun_dev_attach_queue(tun->security);
3001  		if (ret < 0)
3002  			goto unlock;
3003  		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
3004  				 tun->flags & IFF_NAPI_FRAGS, true);
3005  	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
3006  		tun = rtnl_dereference(tfile->tun);
3007  		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
3008  			ret = -EINVAL;
3009  		else
3010  			__tun_detach(tfile, false);
3011  	} else
3012  		ret = -EINVAL;
3013  
3014  	if (ret >= 0)
3015  		netdev_state_change(tun->dev);
3016  
3017  unlock:
3018  	rtnl_unlock();
3019  	return ret;
3020  }
3021  
3022  static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
3023  			void __user *data)
3024  {
3025  	struct bpf_prog *prog;
3026  	int fd;
3027  
3028  	if (copy_from_user(&fd, data, sizeof(fd)))
3029  		return -EFAULT;
3030  
3031  	if (fd == -1) {
3032  		prog = NULL;
3033  	} else {
3034  		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
3035  		if (IS_ERR(prog))
3036  			return PTR_ERR(prog);
3037  	}
3038  
3039  	return __tun_set_ebpf(tun, prog_p, prog);
3040  }
3041  
3042  /* Return correct value for tun->dev->addr_len based on tun->dev->type. */
3043  static unsigned char tun_get_addr_len(unsigned short type)
3044  {
3045  	switch (type) {
3046  	case ARPHRD_IP6GRE:
3047  	case ARPHRD_TUNNEL6:
3048  		return sizeof(struct in6_addr);
3049  	case ARPHRD_IPGRE:
3050  	case ARPHRD_TUNNEL:
3051  	case ARPHRD_SIT:
3052  		return 4;
3053  	case ARPHRD_ETHER:
3054  		return ETH_ALEN;
3055  	case ARPHRD_IEEE802154:
3056  	case ARPHRD_IEEE802154_MONITOR:
3057  		return IEEE802154_EXTENDED_ADDR_LEN;
3058  	case ARPHRD_PHONET_PIPE:
3059  	case ARPHRD_PPP:
3060  	case ARPHRD_NONE:
3061  		return 0;
3062  	case ARPHRD_6LOWPAN:
3063  		return EUI64_ADDR_LEN;
3064  	case ARPHRD_FDDI:
3065  		return FDDI_K_ALEN;
3066  	case ARPHRD_HIPPI:
3067  		return HIPPI_ALEN;
3068  	case ARPHRD_IEEE802:
3069  		return FC_ALEN;
3070  	case ARPHRD_ROSE:
3071  		return ROSE_ADDR_LEN;
3072  	case ARPHRD_NETROM:
3073  		return AX25_ADDR_LEN;
3074  	case ARPHRD_LOCALTLK:
3075  		return LTALK_ALEN;
3076  	default:
3077  		return 0;
3078  	}
3079  }
3080  
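/* Common ioctl handler shared by the native and compat entry points;
 * ifreq_len distinguishes struct ifreq from struct compat_ifreq.
 */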
3081  static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
3082  			    unsigned long arg, int ifreq_len)
3083  {
3084  	struct tun_file *tfile = file->private_data;
3085  	struct net *net = sock_net(&tfile->sk);
3086  	struct tun_struct *tun;
3087  	void __user* argp = (void __user*)arg;
3088  	unsigned int carrier;
3089  	struct ifreq ifr;
3090  	kuid_t owner;
3091  	kgid_t group;
3092  	int ifindex;
3093  	int sndbuf;
3094  	int vnet_hdr_sz;
3095  	int le;
3096  	int ret;
3097  	bool do_notify = false;
3098  
3099  	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
3100  	    (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
3101  		if (copy_from_user(&ifr, argp, ifreq_len))
3102  			return -EFAULT;
3103  	} else {
3104  		memset(&ifr, 0, sizeof(ifr));
3105  	}
3106  	if (cmd == TUNGETFEATURES) {
3107  		/* Currently this just means: "what IFF flags are valid?".
3108  		 * This is needed because we never checked for invalid flags on
3109  		 * TUNSETIFF.
3110  		 */
3111  		return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER |
3112  				TUN_FEATURES, (unsigned int __user*)argp);
3113  	} else if (cmd == TUNSETQUEUE) {
3114  		return tun_set_queue(file, &ifr);
3115  	} else if (cmd == SIOCGSKNS) {
3116  		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3117  			return -EPERM;
3118  		return open_related_ns(&net->ns, get_net_ns);
3119  	}
3120  
3121  	rtnl_lock();
3122  
3123  	tun = tun_get(tfile);
3124  	if (cmd == TUNSETIFF) {
3125  		ret = -EEXIST;
3126  		if (tun)
3127  			goto unlock;
3128  
3129  		ifr.ifr_name[IFNAMSIZ-1] = '\0';
3130  
3131  		ret = tun_set_iff(net, file, &ifr);
3132  
3133  		if (ret)
3134  			goto unlock;
3135  
3136  		if (copy_to_user(argp, &ifr, ifreq_len))
3137  			ret = -EFAULT;
3138  		goto unlock;
3139  	}
3140  	if (cmd == TUNSETIFINDEX) {
3141  		ret = -EPERM;
3142  		if (tun)
3143  			goto unlock;
3144  
3145  		ret = -EFAULT;
3146  		if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
3147  			goto unlock;
3148  		ret = -EINVAL;
3149  		if (ifindex < 0)
3150  			goto unlock;
3151  		ret = 0;
3152  		tfile->ifindex = ifindex;
3153  		goto unlock;
3154  	}
3155  
3156  	ret = -EBADFD;
3157  	if (!tun)
3158  		goto unlock;
3159  
3160  	netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);
3161  
3162  	net = dev_net(tun->dev);
3163  	ret = 0;
3164  	switch (cmd) {
3165  	case TUNGETIFF:
3166  		tun_get_iff(tun, &ifr);
3167  
3168  		if (tfile->detached)
3169  			ifr.ifr_flags |= IFF_DETACH_QUEUE;
3170  		if (!tfile->socket.sk->sk_filter)
3171  			ifr.ifr_flags |= IFF_NOFILTER;
3172  
3173  		if (copy_to_user(argp, &ifr, ifreq_len))
3174  			ret = -EFAULT;
3175  		break;
3176  
3177  	case TUNSETNOCSUM:
3178  		/* Disable/Enable checksum */
3179  
3180  		/* [unimplemented] */
3181  		netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
3182  			   arg ? "disabled" : "enabled");
3183  		break;
3184  
3185  	case TUNSETPERSIST:
3186  		/* Disable/Enable persist mode. Keep an extra reference to the
3187  		 * module to prevent the module from being unloaded.
3188  		 */
3189  		if (arg && !(tun->flags & IFF_PERSIST)) {
3190  			tun->flags |= IFF_PERSIST;
3191  			__module_get(THIS_MODULE);
3192  			do_notify = true;
3193  		}
3194  		if (!arg && (tun->flags & IFF_PERSIST)) {
3195  			tun->flags &= ~IFF_PERSIST;
3196  			module_put(THIS_MODULE);
3197  			do_notify = true;
3198  		}
3199  
3200  		netif_info(tun, drv, tun->dev, "persist %s\n",
3201  			   arg ? "enabled" : "disabled");
3202  		break;
3203  
3204  	case TUNSETOWNER:
3205  		/* Set owner of the device */
3206  		owner = make_kuid(current_user_ns(), arg);
3207  		if (!uid_valid(owner)) {
3208  			ret = -EINVAL;
3209  			break;
3210  		}
3211  		tun->owner = owner;
3212  		do_notify = true;
3213  		netif_info(tun, drv, tun->dev, "owner set to %u\n",
3214  			   from_kuid(&init_user_ns, tun->owner));
3215  		break;
3216  
3217  	case TUNSETGROUP:
3218  		/* Set group of the device */
3219  		group = make_kgid(current_user_ns(), arg);
3220  		if (!gid_valid(group)) {
3221  			ret = -EINVAL;
3222  			break;
3223  		}
3224  		tun->group = group;
3225  		do_notify = true;
3226  		netif_info(tun, drv, tun->dev, "group set to %u\n",
3227  			   from_kgid(&init_user_ns, tun->group));
3228  		break;
3229  
3230  	case TUNSETLINK:
3231  		/* Only allow setting the type when the interface is down */
3232  		if (tun->dev->flags & IFF_UP) {
3233  			netif_info(tun, drv, tun->dev,
3234  				   "Linktype set failed because interface is up\n");
3235  			ret = -EBUSY;
3236  		} else {
3237  			ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
3238  						       tun->dev);
3239  			ret = notifier_to_errno(ret);
3240  			if (ret) {
3241  				netif_info(tun, drv, tun->dev,
3242  					   "Refused to change device type\n");
3243  				break;
3244  			}
3245  			tun->dev->type = (int) arg;
3246  			tun->dev->addr_len = tun_get_addr_len(tun->dev->type);
3247  			netif_info(tun, drv, tun->dev, "linktype set to %d\n",
3248  				   tun->dev->type);
3249  			call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
3250  						 tun->dev);
3251  		}
3252  		break;
3253  
3254  	case TUNSETDEBUG:
3255  		tun->msg_enable = (u32)arg;
3256  		break;
3257  
3258  	case TUNSETOFFLOAD:
3259  		ret = set_offload(tun, arg);
3260  		break;
3261  
3262  	case TUNSETTXFILTER:
3263  		/* Can be set only for TAPs */
3264  		ret = -EINVAL;
3265  		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3266  			break;
3267  		ret = update_filter(&tun->txflt, (void __user *)arg);
3268  		break;
3269  
3270  	case SIOCGIFHWADDR:
3271  		/* Get hw address */
3272  		dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
3273  		if (copy_to_user(argp, &ifr, ifreq_len))
3274  			ret = -EFAULT;
3275  		break;
3276  
3277  	case SIOCSIFHWADDR:
3278  		/* Set hw address */
3279  		ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
3280  		break;
3281  
3282  	case TUNGETSNDBUF:
3283  		sndbuf = tfile->socket.sk->sk_sndbuf;
3284  		if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
3285  			ret = -EFAULT;
3286  		break;
3287  
3288  	case TUNSETSNDBUF:
3289  		if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
3290  			ret = -EFAULT;
3291  			break;
3292  		}
3293  		if (sndbuf <= 0) {
3294  			ret = -EINVAL;
3295  			break;
3296  		}
3297  
3298  		tun->sndbuf = sndbuf;
3299  		tun_set_sndbuf(tun);
3300  		break;
3301  
3302  	case TUNGETVNETHDRSZ:
3303  		vnet_hdr_sz = tun->vnet_hdr_sz;
3304  		if (copy_to_user(argp, &vnet_hdr_sz, sizeof(vnet_hdr_sz)))
3305  			ret = -EFAULT;
3306  		break;
3307  
3308  	case TUNSETVNETHDRSZ:
3309  		if (copy_from_user(&vnet_hdr_sz, argp, sizeof(vnet_hdr_sz))) {
3310  			ret = -EFAULT;
3311  			break;
3312  		}
3313  		if (vnet_hdr_sz < (int)sizeof(struct virtio_net_hdr)) {
3314  			ret = -EINVAL;
3315  			break;
3316  		}
3317  
3318  		tun->vnet_hdr_sz = vnet_hdr_sz;
3319  		break;
3320  
3321  	case TUNGETVNETLE:
3322  		le = !!(tun->flags & TUN_VNET_LE);
3323  		if (put_user(le, (int __user *)argp))
3324  			ret = -EFAULT;
3325  		break;
3326  
3327  	case TUNSETVNETLE:
3328  		if (get_user(le, (int __user *)argp)) {
3329  			ret = -EFAULT;
3330  			break;
3331  		}
3332  		if (le)
3333  			tun->flags |= TUN_VNET_LE;
3334  		else
3335  			tun->flags &= ~TUN_VNET_LE;
3336  		break;
3337  
3338  	case TUNGETVNETBE:
3339  		ret = tun_get_vnet_be(tun, argp);
3340  		break;
3341  
3342  	case TUNSETVNETBE:
3343  		ret = tun_set_vnet_be(tun, argp);
3344  		break;
3345  
3346  	case TUNATTACHFILTER:
3347  		/* Can be set only for TAPs */
3348  		ret = -EINVAL;
3349  		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3350  			break;
3351  		ret = -EFAULT;
3352  		if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
3353  			break;
3354  
3355  		ret = tun_attach_filter(tun);
3356  		break;
3357  
3358  	case TUNDETACHFILTER:
3359  		/* Can be set only for TAPs */
3360  		ret = -EINVAL;
3361  		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3362  			break;
3363  		ret = 0;
3364  		tun_detach_filter(tun, tun->numqueues);
3365  		break;
3366  
3367  	case TUNGETFILTER:
3368  		ret = -EINVAL;
3369  		if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
3370  			break;
3371  		ret = -EFAULT;
3372  		if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
3373  			break;
3374  		ret = 0;
3375  		break;
3376  
3377  	case TUNSETSTEERINGEBPF:
3378  		ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
3379  		break;
3380  
3381  	case TUNSETFILTEREBPF:
3382  		ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
3383  		break;
3384  
3385  	case TUNSETCARRIER:
3386  		ret = -EFAULT;
3387  		if (copy_from_user(&carrier, argp, sizeof(carrier)))
3388  			goto unlock;
3389  
3390  		ret = tun_net_change_carrier(tun->dev, (bool)carrier);
3391  		break;
3392  
3393  	case TUNGETDEVNETNS:
3394  		ret = -EPERM;
3395  		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3396  			goto unlock;
3397  		ret = open_related_ns(&net->ns, get_net_ns);
3398  		break;
3399  
3400  	default:
3401  		ret = -EINVAL;
3402  		break;
3403  	}
3404  
3405  	if (do_notify)
3406  		netdev_state_change(tun->dev);
3407  
3408  unlock:
3409  	rtnl_unlock();
3410  	if (tun)
3411  		tun_put(tun);
3412  	return ret;
3413  }
3414  
3415  static long tun_chr_ioctl(struct file *file,
3416  			  unsigned int cmd, unsigned long arg)
3417  {
3418  	return __tun_chr_ioctl(file, cmd, arg, sizeof (struct ifreq));
3419  }
3420  
3421  #ifdef CONFIG_COMPAT
3422  static long tun_chr_compat_ioctl(struct file *file,
3423  			 unsigned int cmd, unsigned long arg)
3424  {
3425  	switch (cmd) {
3426  	case TUNSETIFF:
3427  	case TUNGETIFF:
3428  	case TUNSETTXFILTER:
3429  	case TUNGETSNDBUF:
3430  	case TUNSETSNDBUF:
3431  	case SIOCGIFHWADDR:
3432  	case SIOCSIFHWADDR:
3433  		arg = (unsigned long)compat_ptr(arg);
3434  		break;
3435  	default:
3436  		arg = (compat_ulong_t)arg;
3437  		break;
3438  	}
3439  
3440  	/*
3441  	 * compat_ifreq is shorter than ifreq, so we must not access beyond
3442  	 * the end of that structure. All fields that are used in this
3443  	 * driver are compatible though, we don't need to convert the
3444  	 * contents.
3445  	 */
3446  	return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
3447  }
3448  #endif /* CONFIG_COMPAT */
3449  
3450  static int tun_chr_fasync(int fd, struct file *file, int on)
3451  {
3452  	struct tun_file *tfile = file->private_data;
3453  	int ret;
3454  
3455  	if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0)
3456  		goto out;
3457  
3458  	if (on) {
3459  		__f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
3460  		tfile->flags |= TUN_FASYNC;
3461  	} else
3462  		tfile->flags &= ~TUN_FASYNC;
3463  	ret = 0;
3464  out:
3465  	return ret;
3466  }
3467  
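/* Allocate the per-fd tun_file (embedded socket, tx ptr_ring, NAPI
 * state); the fd is not bound to a device until TUNSETIFF/TUNSETQUEUE.
 */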
3468  static int tun_chr_open(struct inode *inode, struct file * file)
3469  {
3470  	struct net *net = current->nsproxy->net_ns;
3471  	struct tun_file *tfile;
3472  
3473  	tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
3474  					    &tun_proto, 0);
3475  	if (!tfile)
3476  		return -ENOMEM;
3477  	if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL)) {
3478  		sk_free(&tfile->sk);
3479  		return -ENOMEM;
3480  	}
3481  
3482  	mutex_init(&tfile->napi_mutex);
3483  	RCU_INIT_POINTER(tfile->tun, NULL);
3484  	tfile->flags = 0;
3485  	tfile->ifindex = 0;
3486  
3487  	init_waitqueue_head(&tfile->socket.wq.wait);
3488  
3489  	tfile->socket.file = file;
3490  	tfile->socket.ops = &tun_socket_ops;
3491  
3492  	sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid());
3493  
3494  	tfile->sk.sk_write_space = tun_sock_write_space;
3495  	tfile->sk.sk_sndbuf = INT_MAX;
3496  
3497  	file->private_data = tfile;
3498  	INIT_LIST_HEAD(&tfile->next);
3499  
3500  	sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);
3501  
3502  	/* tun groks IOCB_NOWAIT just fine, mark it as such */
3503  	file->f_mode |= FMODE_NOWAIT;
3504  	return 0;
3505  }
3506  
3507  static int tun_chr_close(struct inode *inode, struct file *file)
3508  {
3509  	struct tun_file *tfile = file->private_data;
3510  
3511  	tun_detach(tfile, true);
3512  
3513  	return 0;
3514  }
3515  
3516  #ifdef CONFIG_PROC_FS
3517  static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
3518  {
3519  	struct tun_file *tfile = file->private_data;
3520  	struct tun_struct *tun;
3521  	struct ifreq ifr;
3522  
3523  	memset(&ifr, 0, sizeof(ifr));
3524  
3525  	rtnl_lock();
3526  	tun = tun_get(tfile);
3527  	if (tun)
3528  		tun_get_iff(tun, &ifr);
3529  	rtnl_unlock();
3530  
3531  	if (tun)
3532  		tun_put(tun);
3533  
3534  	seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
3535  }
3536  #endif
3537  
3538  static const struct file_operations tun_fops = {
3539  	.owner	= THIS_MODULE,
3540  	.llseek = no_llseek,
3541  	.read_iter  = tun_chr_read_iter,
3542  	.write_iter = tun_chr_write_iter,
3543  	.poll	= tun_chr_poll,
3544  	.unlocked_ioctl	= tun_chr_ioctl,
3545  #ifdef CONFIG_COMPAT
3546  	.compat_ioctl = tun_chr_compat_ioctl,
3547  #endif
3548  	.open	= tun_chr_open,
3549  	.release = tun_chr_close,
3550  	.fasync = tun_chr_fasync,
3551  #ifdef CONFIG_PROC_FS
3552  	.show_fdinfo = tun_chr_show_fdinfo,
3553  #endif
3554  };
3555  
3556  static struct miscdevice tun_miscdev = {
3557  	.minor = TUN_MINOR,
3558  	.name = "tun",
3559  	.nodename = "net/tun",
3560  	.fops = &tun_fops,
3561  };
3562  
3563  /* ethtool interface */
3564  
3565  static void tun_default_link_ksettings(struct net_device *dev,
3566  				       struct ethtool_link_ksettings *cmd)
3567  {
3568  	ethtool_link_ksettings_zero_link_mode(cmd, supported);
3569  	ethtool_link_ksettings_zero_link_mode(cmd, advertising);
3570  	cmd->base.speed		= SPEED_10000;
3571  	cmd->base.duplex	= DUPLEX_FULL;
3572  	cmd->base.port		= PORT_TP;
3573  	cmd->base.phy_address	= 0;
3574  	cmd->base.autoneg	= AUTONEG_DISABLE;
3575  }
3576  
3577  static int tun_get_link_ksettings(struct net_device *dev,
3578  				  struct ethtool_link_ksettings *cmd)
3579  {
3580  	struct tun_struct *tun = netdev_priv(dev);
3581  
3582  	memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
3583  	return 0;
3584  }
3585  
3586  static int tun_set_link_ksettings(struct net_device *dev,
3587  				  const struct ethtool_link_ksettings *cmd)
3588  {
3589  	struct tun_struct *tun = netdev_priv(dev);
3590  
3591  	memcpy(&tun->link_ksettings, cmd, sizeof(*cmd));
3592  	return 0;
3593  }
3594  
3595  static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
3596  {
3597  	struct tun_struct *tun = netdev_priv(dev);
3598  
3599  	strscpy(info->driver, DRV_NAME, sizeof(info->driver));
3600  	strscpy(info->version, DRV_VERSION, sizeof(info->version));
3601  
3602  	switch (tun->flags & TUN_TYPE_MASK) {
3603  	case IFF_TUN:
3604  		strscpy(info->bus_info, "tun", sizeof(info->bus_info));
3605  		break;
3606  	case IFF_TAP:
3607  		strscpy(info->bus_info, "tap", sizeof(info->bus_info));
3608  		break;
3609  	}
3610  }
3611  
3612  static u32 tun_get_msglevel(struct net_device *dev)
3613  {
3614  	struct tun_struct *tun = netdev_priv(dev);
3615  
3616  	return tun->msg_enable;
3617  }
3618  
3619  static void tun_set_msglevel(struct net_device *dev, u32 value)
3620  {
3621  	struct tun_struct *tun = netdev_priv(dev);
3622  
3623  	tun->msg_enable = value;
3624  }
3625  
3626  static int tun_get_coalesce(struct net_device *dev,
3627  			    struct ethtool_coalesce *ec,
3628  			    struct kernel_ethtool_coalesce *kernel_coal,
3629  			    struct netlink_ext_ack *extack)
3630  {
3631  	struct tun_struct *tun = netdev_priv(dev);
3632  
3633  	ec->rx_max_coalesced_frames = tun->rx_batched;
3634  
3635  	return 0;
3636  }
3637  
3638  static int tun_set_coalesce(struct net_device *dev,
3639  			    struct ethtool_coalesce *ec,
3640  			    struct kernel_ethtool_coalesce *kernel_coal,
3641  			    struct netlink_ext_ack *extack)
3642  {
3643  	struct tun_struct *tun = netdev_priv(dev);
3644  
3645  	if (ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT)
3646  		tun->rx_batched = NAPI_POLL_WEIGHT;
3647  	else
3648  		tun->rx_batched = ec->rx_max_coalesced_frames;
3649  
3650  	return 0;
3651  }
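
/* Illustrative only: rx_max_coalesced_frames corresponds to
 * "ethtool -C <dev> rx-frames N" from userspace and bounds how many packets
 * the receive path batches before flushing them to the network stack; values
 * above NAPI_POLL_WEIGHT are clamped.  A hedged sketch of the equivalent
 * SIOCETHTOOL ioctl ("tun0" and the AF_INET socket are example choices):
 *
 *	struct ethtool_coalesce ecoal = {
 *		.cmd = ETHTOOL_SCOALESCE,
 *		.rx_max_coalesced_frames = 32,
 *	};
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "tun0", IFNAMSIZ - 1);
 *	ifr.ifr_data = (void *)&ecoal;
 *	ioctl(fd, SIOCETHTOOL, &ifr);
 */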
3652  
3653  static const struct ethtool_ops tun_ethtool_ops = {
3654  	.supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
3655  	.get_drvinfo	= tun_get_drvinfo,
3656  	.get_msglevel	= tun_get_msglevel,
3657  	.set_msglevel	= tun_set_msglevel,
3658  	.get_link	= ethtool_op_get_link,
3659  	.get_ts_info	= ethtool_op_get_ts_info,
3660  	.get_coalesce   = tun_get_coalesce,
3661  	.set_coalesce   = tun_set_coalesce,
3662  	.get_link_ksettings = tun_get_link_ksettings,
3663  	.set_link_ksettings = tun_set_link_ksettings,
3664  };
3665  
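/* When the device's tx_queue_len changes, resize the per-queue tx ptr_rings
 * of all attached and disabled queues.  ptr_ring_resize_multiple() allocates
 * the new arrays up front, so an allocation failure leaves every ring
 * untouched rather than resizing only some of the queues.
 */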
3666  static int tun_queue_resize(struct tun_struct *tun)
3667  {
3668  	struct net_device *dev = tun->dev;
3669  	struct tun_file *tfile;
3670  	struct ptr_ring **rings;
3671  	int n = tun->numqueues + tun->numdisabled;
3672  	int ret, i;
3673  
3674  	rings = kmalloc_array(n, sizeof(*rings), GFP_KERNEL);
3675  	if (!rings)
3676  		return -ENOMEM;
3677  
3678  	for (i = 0; i < tun->numqueues; i++) {
3679  		tfile = rtnl_dereference(tun->tfiles[i]);
3680  		rings[i] = &tfile->tx_ring;
3681  	}
3682  	list_for_each_entry(tfile, &tun->disabled, next)
3683  		rings[i++] = &tfile->tx_ring;
3684  
3685  	ret = ptr_ring_resize_multiple(rings, n,
3686  				       dev->tx_queue_len, GFP_KERNEL,
3687  				       tun_ptr_free);
3688  
3689  	kfree(rings);
3690  	return ret;
3691  }
3692  
3693  static int tun_device_event(struct notifier_block *unused,
3694  			    unsigned long event, void *ptr)
3695  {
3696  	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3697  	struct tun_struct *tun = netdev_priv(dev);
3698  	int i;
3699  
3700  	if (dev->rtnl_link_ops != &tun_link_ops)
3701  		return NOTIFY_DONE;
3702  
3703  	switch (event) {
3704  	case NETDEV_CHANGE_TX_QUEUE_LEN:
3705  		if (tun_queue_resize(tun))
3706  			return NOTIFY_BAD;
3707  		break;
3708  	case NETDEV_UP:
3709  		for (i = 0; i < tun->numqueues; i++) {
3710  			struct tun_file *tfile;
3711  
3712  			tfile = rtnl_dereference(tun->tfiles[i]);
3713  			tfile->socket.sk->sk_write_space(tfile->socket.sk);
3714  		}
3715  		break;
3716  	default:
3717  		break;
3718  	}
3719  
3720  	return NOTIFY_DONE;
3721  }
3722  
3723  static struct notifier_block tun_notifier_block __read_mostly = {
3724  	.notifier_call	= tun_device_event,
3725  };
3726  
3727  static int __init tun_init(void)
3728  {
3729  	int ret = 0;
3730  
3731  	pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
3732  
3733  	ret = rtnl_link_register(&tun_link_ops);
3734  	if (ret) {
3735  		pr_err("Can't register link_ops\n");
3736  		goto err_linkops;
3737  	}
3738  
3739  	ret = misc_register(&tun_miscdev);
3740  	if (ret) {
3741  		pr_err("Can't register misc device %d\n", TUN_MINOR);
3742  		goto err_misc;
3743  	}
3744  
3745  	ret = register_netdevice_notifier(&tun_notifier_block);
3746  	if (ret) {
3747  		pr_err("Can't register netdevice notifier\n");
3748  		goto err_notifier;
3749  	}
3750  
3751  	return 0;
3752  
3753  err_notifier:
3754  	misc_deregister(&tun_miscdev);
3755  err_misc:
3756  	rtnl_link_unregister(&tun_link_ops);
3757  err_linkops:
3758  	return ret;
3759  }
3760  
3761  static void __exit tun_cleanup(void)
3762  {
3763  	misc_deregister(&tun_miscdev);
3764  	rtnl_link_unregister(&tun_link_ops);
3765  	unregister_netdevice_notifier(&tun_notifier_block);
3766  }
3767  
3768  /* Get an underlying socket object from tun file.  Returns error unless file is
3769   * attached to a device.  The returned object works like a packet socket, it
3770   * can be used for sock_sendmsg/sock_recvmsg.  The caller is responsible for
3771   * holding a reference to the file for as long as the socket is in use. */
3772  struct socket *tun_get_socket(struct file *file)
3773  {
3774  	struct tun_file *tfile;
3775  	if (file->f_op != &tun_fops)
3776  		return ERR_PTR(-EINVAL);
3777  	tfile = file->private_data;
3778  	if (!tfile)
3779  		return ERR_PTR(-EBADFD);
3780  	return &tfile->socket;
3781  }
3782  EXPORT_SYMBOL_GPL(tun_get_socket);
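
/* Illustrative only: a hedged sketch of pairing tun_get_socket() with
 * kernel_sendmsg() from another kernel module (vhost-net is the in-tree
 * user).  The helper name example_xmit() and its buffer handling are
 * hypothetical; as noted above, the caller must keep its reference to the
 * tun file for as long as the socket is in use.
 *
 *	static int example_xmit(struct file *tun_file, void *buf, size_t len)
 *	{
 *		struct socket *sock = tun_get_socket(tun_file);
 *		struct kvec iov = { .iov_base = buf, .iov_len = len };
 *		struct msghdr msg = { };
 *
 *		if (IS_ERR(sock))
 *			return PTR_ERR(sock);
 *		// Inject one packet as if it had been written to the tun fd
 *		return kernel_sendmsg(sock, &msg, &iov, 1, len);
 *	}
 */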
3783  
3784  struct ptr_ring *tun_get_tx_ring(struct file *file)
3785  {
3786  	struct tun_file *tfile;
3787  
3788  	if (file->f_op != &tun_fops)
3789  		return ERR_PTR(-EINVAL);
3790  	tfile = file->private_data;
3791  	if (!tfile)
3792  		return ERR_PTR(-EBADFD);
3793  	return &tfile->tx_ring;
3794  }
3795  EXPORT_SYMBOL_GPL(tun_get_tx_ring);
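
/* Illustrative only: a hedged sketch of draining the ring returned by
 * tun_get_tx_ring().  The entries are the tagged skb/XDP pointers produced
 * by this driver; a real consumer (vhost-net is the in-tree user) decodes
 * them, while this hypothetical example_drain() simply releases them with
 * the tun_ptr_free() helper defined earlier in this file.
 *
 *	static void example_drain(struct file *tun_file)
 *	{
 *		struct ptr_ring *ring = tun_get_tx_ring(tun_file);
 *		void *ptr;
 *
 *		if (IS_ERR(ring))
 *			return;
 *		while ((ptr = ptr_ring_consume(ring)) != NULL)
 *			tun_ptr_free(ptr);
 *	}
 */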
3796  
3797  module_init(tun_init);
3798  module_exit(tun_cleanup);
3799  MODULE_DESCRIPTION(DRV_DESCRIPTION);
3800  MODULE_AUTHOR(DRV_COPYRIGHT);
3801  MODULE_LICENSE("GPL");
3802  MODULE_ALIAS_MISCDEV(TUN_MINOR);
3803  MODULE_ALIAS("devname:net/tun");
3804