xref: /openbmc/linux/drivers/vhost/net.c (revision 1ac731c529cd4d6adbce134754b51ff7d822b145)
17a338472SThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
23a4d5c94SMichael S. Tsirkin /* Copyright (C) 2009 Red Hat, Inc.
33a4d5c94SMichael S. Tsirkin  * Author: Michael S. Tsirkin <mst@redhat.com>
43a4d5c94SMichael S. Tsirkin  *
53a4d5c94SMichael S. Tsirkin  * virtio-net server in host kernel.
63a4d5c94SMichael S. Tsirkin  */
73a4d5c94SMichael S. Tsirkin 
83a4d5c94SMichael S. Tsirkin #include <linux/compat.h>
93a4d5c94SMichael S. Tsirkin #include <linux/eventfd.h>
103a4d5c94SMichael S. Tsirkin #include <linux/vhost.h>
113a4d5c94SMichael S. Tsirkin #include <linux/virtio_net.h>
123a4d5c94SMichael S. Tsirkin #include <linux/miscdevice.h>
133a4d5c94SMichael S. Tsirkin #include <linux/module.h>
14bab632d6SMichael S. Tsirkin #include <linux/moduleparam.h>
153a4d5c94SMichael S. Tsirkin #include <linux/mutex.h>
163a4d5c94SMichael S. Tsirkin #include <linux/workqueue.h>
173a4d5c94SMichael S. Tsirkin #include <linux/file.h>
185a0e3ad6STejun Heo #include <linux/slab.h>
19e6017571SIngo Molnar #include <linux/sched/clock.h>
20174cd4b1SIngo Molnar #include <linux/sched/signal.h>
2123cc5a99SMichael S. Tsirkin #include <linux/vmalloc.h>
223a4d5c94SMichael S. Tsirkin 
233a4d5c94SMichael S. Tsirkin #include <linux/net.h>
243a4d5c94SMichael S. Tsirkin #include <linux/if_packet.h>
253a4d5c94SMichael S. Tsirkin #include <linux/if_arp.h>
263a4d5c94SMichael S. Tsirkin #include <linux/if_tun.h>
27501c774cSArnd Bergmann #include <linux/if_macvlan.h>
28635b8c8eSSainath Grandhi #include <linux/if_tap.h>
29c53cff5eSBasil Gor #include <linux/if_vlan.h>
30c67df11fSJason Wang #include <linux/skb_array.h>
31c67df11fSJason Wang #include <linux/skbuff.h>
323a4d5c94SMichael S. Tsirkin 
333a4d5c94SMichael S. Tsirkin #include <net/sock.h>
341ffcbc85SJesper Dangaard Brouer #include <net/xdp.h>
353a4d5c94SMichael S. Tsirkin 
363a4d5c94SMichael S. Tsirkin #include "vhost.h"
373a4d5c94SMichael S. Tsirkin 
38098eadceSJason Wang static int experimental_zcopytx = 0;
39bab632d6SMichael S. Tsirkin module_param(experimental_zcopytx, int, 0444);
40f9611c43SMichael S. Tsirkin MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
41f9611c43SMichael S. Tsirkin 		                       " 1 - Enable; 0 - Disable");
42bab632d6SMichael S. Tsirkin 
433a4d5c94SMichael S. Tsirkin /* Max number of bytes transferred before requeueing the job.
443a4d5c94SMichael S. Tsirkin  * Using this limit prevents one virtqueue from starving others. */
453a4d5c94SMichael S. Tsirkin #define VHOST_NET_WEIGHT 0x80000
463a4d5c94SMichael S. Tsirkin 
47a2ac9990Shaibinzhang(张海斌) /* Max number of packets transferred before requeueing the job.
48db688c24SPaolo Abeni  * Using this limit prevents one virtqueue from starving others with small
49db688c24SPaolo Abeni  * pkts.
50db688c24SPaolo Abeni  */
51db688c24SPaolo Abeni #define VHOST_NET_PKT_WEIGHT 256
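/* Note: both limits are consumed by the handle_tx*()/handle_rx() loops below,
 * which stop and requeue the work once vhost_exceeds_weight() reports that
 * either the byte budget (VHOST_NET_WEIGHT) or the packet budget
 * (VHOST_NET_PKT_WEIGHT) has been spent; the budgets themselves are handed
 * to the vhost core when the device is opened (not shown in this excerpt).
 */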
52a2ac9990Shaibinzhang(张海斌) 
53bab632d6SMichael S. Tsirkin /* MAX number of TX used buffers for outstanding zerocopy */
54bab632d6SMichael S. Tsirkin #define VHOST_MAX_PEND 128
55bab632d6SMichael S. Tsirkin #define VHOST_GOODCOPY_LEN 256
56bab632d6SMichael S. Tsirkin 
57eaae8132SMichael S. Tsirkin /*
58eaae8132SMichael S. Tsirkin  * For transmit, used buffer len is unused; we override it to track buffer
59eaae8132SMichael S. Tsirkin  * status internally; used for zerocopy tx only.
60eaae8132SMichael S. Tsirkin  */
61eaae8132SMichael S. Tsirkin /* Lower device DMA failed */
62bf995734SMichael S. Tsirkin #define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
63eaae8132SMichael S. Tsirkin /* Lower device DMA done */
64bf995734SMichael S. Tsirkin #define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
65eaae8132SMichael S. Tsirkin /* Lower device DMA in progress */
66bf995734SMichael S. Tsirkin #define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
67eaae8132SMichael S. Tsirkin /* Buffer unused */
68bf995734SMichael S. Tsirkin #define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)
69eaae8132SMichael S. Tsirkin 
70bf995734SMichael S. Tsirkin #define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)
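/* In other words, for a zerocopy TX descriptor vq->heads[i].len acts as a
 * small state machine rather than a length: 0 = slot unused, 1 = DMA still
 * in flight, 2 = DMA completed, 3 = DMA failed. VHOST_DMA_IS_DONE() treats
 * both "done" and "failed" (>= 2) as complete so the entry can be recycled.
 */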
71eaae8132SMichael S. Tsirkin 
723a4d5c94SMichael S. Tsirkin enum {
738570a6e7SAsias He 	VHOST_NET_FEATURES = VHOST_FEATURES |
748570a6e7SAsias He 			 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
756b1e6cc7SJason Wang 			 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
76313389beSKangjie Xu 			 (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
77313389beSKangjie Xu 			 (1ULL << VIRTIO_F_RING_RESET)
788570a6e7SAsias He };
798570a6e7SAsias He 
808570a6e7SAsias He enum {
81429711aeSJason Wang 	VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
82429711aeSJason Wang };
83429711aeSJason Wang 
84429711aeSJason Wang enum {
853a4d5c94SMichael S. Tsirkin 	VHOST_NET_VQ_RX = 0,
863a4d5c94SMichael S. Tsirkin 	VHOST_NET_VQ_TX = 1,
873a4d5c94SMichael S. Tsirkin 	VHOST_NET_VQ_MAX = 2,
883a4d5c94SMichael S. Tsirkin };
893a4d5c94SMichael S. Tsirkin 
90fe729a57SAsias He struct vhost_net_ubuf_ref {
910ad8b480SMichael S. Tsirkin 	/* refcount follows semantics similar to kref:
920ad8b480SMichael S. Tsirkin 	 *  0: object is released
930ad8b480SMichael S. Tsirkin 	 *  1: no outstanding ubufs
940ad8b480SMichael S. Tsirkin 	 * >1: outstanding ubufs
950ad8b480SMichael S. Tsirkin 	 */
960ad8b480SMichael S. Tsirkin 	atomic_t refcount;
972839400fSAsias He 	wait_queue_head_t wait;
982839400fSAsias He 	struct vhost_virtqueue *vq;
992839400fSAsias He };
1002839400fSAsias He 
101d0d86971SJason Wang #define VHOST_NET_BATCH 64
102c67df11fSJason Wang struct vhost_net_buf {
1035990a305SJason Wang 	void **queue;
104c67df11fSJason Wang 	int tail;
105c67df11fSJason Wang 	int head;
106c67df11fSJason Wang };
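/* rxq acts as a small consumer-side cache: vhost_net_buf_produce() pulls up
 * to VHOST_NET_BATCH pointers out of the backend's ptr_ring in one go, and
 * head/tail delimit the entries of queue[] that have not been consumed yet.
 */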
107c67df11fSJason Wang 
1083ab2e420SAsias He struct vhost_net_virtqueue {
1093ab2e420SAsias He 	struct vhost_virtqueue vq;
11081f95a55SMichael S. Tsirkin 	size_t vhost_hlen;
11181f95a55SMichael S. Tsirkin 	size_t sock_hlen;
1122839400fSAsias He 	/* vhost zerocopy support fields below: */
1132839400fSAsias He 	/* last used idx for outstanding DMA zerocopy buffers */
1142839400fSAsias He 	int upend_idx;
115f5a4941aSJason Wang 	/* For TX, first used idx for DMA done zerocopy buffers
116f5a4941aSJason Wang 	 * For RX, number of batched heads
117f5a4941aSJason Wang 	 */
1182839400fSAsias He 	int done_idx;
1190a0be13bSJason Wang 	/* Number of XDP frames batched */
1200a0be13bSJason Wang 	int batched_xdp;
1212839400fSAsias He 	/* an array of userspace buffers info */
122dfff202bSPavel Begunkov 	struct ubuf_info_msgzc *ubuf_info;
1232839400fSAsias He 	/* Reference counting for outstanding ubufs.
1242839400fSAsias He 	 * Protected by vq mutex. Writers must also take device mutex. */
125fe729a57SAsias He 	struct vhost_net_ubuf_ref *ubufs;
1265990a305SJason Wang 	struct ptr_ring *rx_ring;
127c67df11fSJason Wang 	struct vhost_net_buf rxq;
1280a0be13bSJason Wang 	/* Batched XDP buffs */
1290a0be13bSJason Wang 	struct xdp_buff *xdp;
1303ab2e420SAsias He };
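/* For zerocopy TX, ubuf_info[] is sized UIO_MAXIOV (see
 * vhost_net_set_ubuf_info()) and used as a ring: entries in
 * [done_idx, upend_idx) describe transmits that have been submitted but not
 * yet reported back to the guest.
 */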
1313ab2e420SAsias He 
1323a4d5c94SMichael S. Tsirkin struct vhost_net {
1333a4d5c94SMichael S. Tsirkin 	struct vhost_dev dev;
1343ab2e420SAsias He 	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
1353a4d5c94SMichael S. Tsirkin 	struct vhost_poll poll[VHOST_NET_VQ_MAX];
136eaae8132SMichael S. Tsirkin 	/* Number of TX recently submitted.
137eaae8132SMichael S. Tsirkin 	 * Protected by tx vq lock. */
138eaae8132SMichael S. Tsirkin 	unsigned tx_packets;
139eaae8132SMichael S. Tsirkin 	/* Number of times zerocopy TX recently failed.
140eaae8132SMichael S. Tsirkin 	 * Protected by tx vq lock. */
141eaae8132SMichael S. Tsirkin 	unsigned tx_zcopy_err;
1421280c27fSMichael S. Tsirkin 	/* Flush in progress. Protected by tx vq lock. */
1431280c27fSMichael S. Tsirkin 	bool tx_flush;
144e4dab1e6SJason Wang 	/* Private page frag */
145e4dab1e6SJason Wang 	struct page_frag page_frag;
146e4dab1e6SJason Wang 	/* Refcount bias of page frag */
147e4dab1e6SJason Wang 	int refcnt_bias;
1483a4d5c94SMichael S. Tsirkin };
1493a4d5c94SMichael S. Tsirkin 
150fe729a57SAsias He static unsigned vhost_net_zcopy_mask __read_mostly;
1512839400fSAsias He 
152c67df11fSJason Wang static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
153c67df11fSJason Wang {
154c67df11fSJason Wang 	if (rxq->tail != rxq->head)
155c67df11fSJason Wang 		return rxq->queue[rxq->head];
156c67df11fSJason Wang 	else
157c67df11fSJason Wang 		return NULL;
158c67df11fSJason Wang }
159c67df11fSJason Wang 
160c67df11fSJason Wang static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
161c67df11fSJason Wang {
162c67df11fSJason Wang 	return rxq->tail - rxq->head;
163c67df11fSJason Wang }
164c67df11fSJason Wang 
165c67df11fSJason Wang static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
166c67df11fSJason Wang {
167c67df11fSJason Wang 	return rxq->tail == rxq->head;
168c67df11fSJason Wang }
169c67df11fSJason Wang 
170c67df11fSJason Wang static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
171c67df11fSJason Wang {
172c67df11fSJason Wang 	void *ret = vhost_net_buf_get_ptr(rxq);
173c67df11fSJason Wang 	++rxq->head;
174c67df11fSJason Wang 	return ret;
175c67df11fSJason Wang }
176c67df11fSJason Wang 
177c67df11fSJason Wang static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
178c67df11fSJason Wang {
179c67df11fSJason Wang 	struct vhost_net_buf *rxq = &nvq->rxq;
180c67df11fSJason Wang 
181c67df11fSJason Wang 	rxq->head = 0;
1825990a305SJason Wang 	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
183d0d86971SJason Wang 					      VHOST_NET_BATCH);
184c67df11fSJason Wang 	return rxq->tail;
185c67df11fSJason Wang }
186c67df11fSJason Wang 
187c67df11fSJason Wang static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
188c67df11fSJason Wang {
189c67df11fSJason Wang 	struct vhost_net_buf *rxq = &nvq->rxq;
190c67df11fSJason Wang 
1915990a305SJason Wang 	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
1925990a305SJason Wang 		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
1935990a305SJason Wang 				   vhost_net_buf_get_size(rxq),
1943a403076SJason Wang 				   tun_ptr_free);
195c67df11fSJason Wang 		rxq->head = rxq->tail = 0;
196c67df11fSJason Wang 	}
197c67df11fSJason Wang }
198c67df11fSJason Wang 
199fc72d1d5SJason Wang static int vhost_net_buf_peek_len(void *ptr)
200fc72d1d5SJason Wang {
2011ffcbc85SJesper Dangaard Brouer 	if (tun_is_xdp_frame(ptr)) {
2021ffcbc85SJesper Dangaard Brouer 		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
203fc72d1d5SJason Wang 
2041ffcbc85SJesper Dangaard Brouer 		return xdpf->len;
205fc72d1d5SJason Wang 	}
206fc72d1d5SJason Wang 
207fc72d1d5SJason Wang 	return __skb_array_len_with_tag(ptr);
208fc72d1d5SJason Wang }
209fc72d1d5SJason Wang 
210c67df11fSJason Wang static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
211c67df11fSJason Wang {
212c67df11fSJason Wang 	struct vhost_net_buf *rxq = &nvq->rxq;
213c67df11fSJason Wang 
214c67df11fSJason Wang 	if (!vhost_net_buf_is_empty(rxq))
215c67df11fSJason Wang 		goto out;
216c67df11fSJason Wang 
217c67df11fSJason Wang 	if (!vhost_net_buf_produce(nvq))
218c67df11fSJason Wang 		return 0;
219c67df11fSJason Wang 
220c67df11fSJason Wang out:
221fc72d1d5SJason Wang 	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
222c67df11fSJason Wang }
223c67df11fSJason Wang 
224c67df11fSJason Wang static void vhost_net_buf_init(struct vhost_net_buf *rxq)
225c67df11fSJason Wang {
226c67df11fSJason Wang 	rxq->head = rxq->tail = 0;
227c67df11fSJason Wang }
228c67df11fSJason Wang 
229fe729a57SAsias He static void vhost_net_enable_zcopy(int vq)
2302839400fSAsias He {
231fe729a57SAsias He 	vhost_net_zcopy_mask |= 0x1 << vq;
2322839400fSAsias He }
2332839400fSAsias He 
234fe729a57SAsias He static struct vhost_net_ubuf_ref *
235fe729a57SAsias He vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
2362839400fSAsias He {
237fe729a57SAsias He 	struct vhost_net_ubuf_ref *ubufs;
2382839400fSAsias He 	/* No zero copy backend? Nothing to count. */
2392839400fSAsias He 	if (!zcopy)
2402839400fSAsias He 		return NULL;
2412839400fSAsias He 	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
2422839400fSAsias He 	if (!ubufs)
2432839400fSAsias He 		return ERR_PTR(-ENOMEM);
2440ad8b480SMichael S. Tsirkin 	atomic_set(&ubufs->refcount, 1);
2452839400fSAsias He 	init_waitqueue_head(&ubufs->wait);
2462839400fSAsias He 	ubufs->vq = vq;
2472839400fSAsias He 	return ubufs;
2482839400fSAsias He }
2492839400fSAsias He 
2500ad8b480SMichael S. Tsirkin static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
2512839400fSAsias He {
2520ad8b480SMichael S. Tsirkin 	int r = atomic_sub_return(1, &ubufs->refcount);
2530ad8b480SMichael S. Tsirkin 	if (unlikely(!r))
2540ad8b480SMichael S. Tsirkin 		wake_up(&ubufs->wait);
2550ad8b480SMichael S. Tsirkin 	return r;
2562839400fSAsias He }
2572839400fSAsias He 
258fe729a57SAsias He static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
2592839400fSAsias He {
2600ad8b480SMichael S. Tsirkin 	vhost_net_ubuf_put(ubufs);
2610ad8b480SMichael S. Tsirkin 	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
262c38e39c3SMichael S. Tsirkin }
263c38e39c3SMichael S. Tsirkin 
264c38e39c3SMichael S. Tsirkin static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
265c38e39c3SMichael S. Tsirkin {
266c38e39c3SMichael S. Tsirkin 	vhost_net_ubuf_put_and_wait(ubufs);
2672839400fSAsias He 	kfree(ubufs);
2682839400fSAsias He }
2692839400fSAsias He 
270b1ad8496SAsias He static void vhost_net_clear_ubuf_info(struct vhost_net *n)
271b1ad8496SAsias He {
272b1ad8496SAsias He 	int i;
273b1ad8496SAsias He 
274288cfe78SMichael S. Tsirkin 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
275b1ad8496SAsias He 		kfree(n->vqs[i].ubuf_info);
276288cfe78SMichael S. Tsirkin 		n->vqs[i].ubuf_info = NULL;
277b1ad8496SAsias He 	}
278b1ad8496SAsias He }
279b1ad8496SAsias He 
2800a1febf7SAsias He static int vhost_net_set_ubuf_info(struct vhost_net *n)
2812839400fSAsias He {
2822839400fSAsias He 	bool zcopy;
2832839400fSAsias He 	int i;
2842839400fSAsias He 
285288cfe78SMichael S. Tsirkin 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
286fe729a57SAsias He 		zcopy = vhost_net_zcopy_mask & (0x1 << i);
2872839400fSAsias He 		if (!zcopy)
2882839400fSAsias He 			continue;
2896da2ec56SKees Cook 		n->vqs[i].ubuf_info =
2906da2ec56SKees Cook 			kmalloc_array(UIO_MAXIOV,
2916da2ec56SKees Cook 				      sizeof(*n->vqs[i].ubuf_info),
2926da2ec56SKees Cook 				      GFP_KERNEL);
2932839400fSAsias He 		if  (!n->vqs[i].ubuf_info)
2942839400fSAsias He 			goto err;
2952839400fSAsias He 	}
2962839400fSAsias He 	return 0;
2972839400fSAsias He 
2982839400fSAsias He err:
299288cfe78SMichael S. Tsirkin 	vhost_net_clear_ubuf_info(n);
3002839400fSAsias He 	return -ENOMEM;
3012839400fSAsias He }
3022839400fSAsias He 
3030a1febf7SAsias He static void vhost_net_vq_reset(struct vhost_net *n)
3042839400fSAsias He {
3052839400fSAsias He 	int i;
3062839400fSAsias He 
307288cfe78SMichael S. Tsirkin 	vhost_net_clear_ubuf_info(n);
308288cfe78SMichael S. Tsirkin 
3092839400fSAsias He 	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
3102839400fSAsias He 		n->vqs[i].done_idx = 0;
3112839400fSAsias He 		n->vqs[i].upend_idx = 0;
3122839400fSAsias He 		n->vqs[i].ubufs = NULL;
31381f95a55SMichael S. Tsirkin 		n->vqs[i].vhost_hlen = 0;
31481f95a55SMichael S. Tsirkin 		n->vqs[i].sock_hlen = 0;
315c67df11fSJason Wang 		vhost_net_buf_init(&n->vqs[i].rxq);
3162839400fSAsias He 	}
3172839400fSAsias He 
3182839400fSAsias He }
3192839400fSAsias He 
320eaae8132SMichael S. Tsirkin static void vhost_net_tx_packet(struct vhost_net *net)
321eaae8132SMichael S. Tsirkin {
322eaae8132SMichael S. Tsirkin 	++net->tx_packets;
323eaae8132SMichael S. Tsirkin 	if (net->tx_packets < 1024)
324eaae8132SMichael S. Tsirkin 		return;
325eaae8132SMichael S. Tsirkin 	net->tx_packets = 0;
326eaae8132SMichael S. Tsirkin 	net->tx_zcopy_err = 0;
327eaae8132SMichael S. Tsirkin }
328eaae8132SMichael S. Tsirkin 
329eaae8132SMichael S. Tsirkin static void vhost_net_tx_err(struct vhost_net *net)
330eaae8132SMichael S. Tsirkin {
331eaae8132SMichael S. Tsirkin 	++net->tx_zcopy_err;
332eaae8132SMichael S. Tsirkin }
333eaae8132SMichael S. Tsirkin 
334eaae8132SMichael S. Tsirkin static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
335eaae8132SMichael S. Tsirkin {
3361280c27fSMichael S. Tsirkin 	/* TX flush waits for outstanding DMAs to be done.
3371280c27fSMichael S. Tsirkin 	 * Don't start new DMAs.
3381280c27fSMichael S. Tsirkin 	 */
3391280c27fSMichael S. Tsirkin 	return !net->tx_flush &&
3401280c27fSMichael S. Tsirkin 		net->tx_packets / 64 >= net->tx_zcopy_err;
341eaae8132SMichael S. Tsirkin }
342eaae8132SMichael S. Tsirkin 
343bab632d6SMichael S. Tsirkin static bool vhost_sock_zcopy(struct socket *sock)
344bab632d6SMichael S. Tsirkin {
345bab632d6SMichael S. Tsirkin 	return unlikely(experimental_zcopytx) &&
346bab632d6SMichael S. Tsirkin 		sock_flag(sock->sk, SOCK_ZEROCOPY);
347bab632d6SMichael S. Tsirkin }
348bab632d6SMichael S. Tsirkin 
3490a0be13bSJason Wang static bool vhost_sock_xdp(struct socket *sock)
3500a0be13bSJason Wang {
3510a0be13bSJason Wang 	return sock_flag(sock->sk, SOCK_XDP);
3520a0be13bSJason Wang }
3530a0be13bSJason Wang 
354b211616dSMichael S. Tsirkin /* The lower device driver may complete DMA out of order. upend_idx tracks
355b211616dSMichael S. Tsirkin  * the end of the used idx range, done_idx tracks its head. Once the lower
356b211616dSMichael S. Tsirkin  * device has completed DMA contiguously, we signal the used idx to the
357b211616dSMichael S. Tsirkin  * guest.
358b211616dSMichael S. Tsirkin  */
359094afe7dSJason Wang static void vhost_zerocopy_signal_used(struct vhost_net *net,
360eaae8132SMichael S. Tsirkin 				       struct vhost_virtqueue *vq)
361b211616dSMichael S. Tsirkin {
3622839400fSAsias He 	struct vhost_net_virtqueue *nvq =
3632839400fSAsias He 		container_of(vq, struct vhost_net_virtqueue, vq);
364c92112aeSJason Wang 	int i, add;
365b211616dSMichael S. Tsirkin 	int j = 0;
366b211616dSMichael S. Tsirkin 
3672839400fSAsias He 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
368eaae8132SMichael S. Tsirkin 		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
369eaae8132SMichael S. Tsirkin 			vhost_net_tx_err(net);
370b211616dSMichael S. Tsirkin 		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
371b211616dSMichael S. Tsirkin 			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
372b211616dSMichael S. Tsirkin 			++j;
373b211616dSMichael S. Tsirkin 		} else
374b211616dSMichael S. Tsirkin 			break;
375b211616dSMichael S. Tsirkin 	}
376c92112aeSJason Wang 	while (j) {
377c92112aeSJason Wang 		add = min(UIO_MAXIOV - nvq->done_idx, j);
378c92112aeSJason Wang 		vhost_add_used_and_signal_n(vq->dev, vq,
379c92112aeSJason Wang 					    &vq->heads[nvq->done_idx], add);
380c92112aeSJason Wang 		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
381c92112aeSJason Wang 		j -= add;
382c92112aeSJason Wang 	}
383b211616dSMichael S. Tsirkin }
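/* Note on vhost_zerocopy_signal_used(): completions are pushed to the guest
 * only up to the first entry still marked VHOST_DMA_IN_PROGRESS, so used
 * buffers are reported in submission order even when the lower device
 * finishes them out of order. The add is split into chunks because the
 * heads[] ring wraps at UIO_MAXIOV.
 */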
384b211616dSMichael S. Tsirkin 
38536177832SJonathan Lemon static void vhost_zerocopy_callback(struct sk_buff *skb,
386dfff202bSPavel Begunkov 				    struct ubuf_info *ubuf_base, bool success)
387b211616dSMichael S. Tsirkin {
388dfff202bSPavel Begunkov 	struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base);
389fe729a57SAsias He 	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
390b211616dSMichael S. Tsirkin 	struct vhost_virtqueue *vq = ubufs->vq;
3910ad8b480SMichael S. Tsirkin 	int cnt;
392b211616dSMichael S. Tsirkin 
393b0c057caSMichael S. Tsirkin 	rcu_read_lock_bh();
394b0c057caSMichael S. Tsirkin 
39519c73b3eSJason Wang 	/* set len to mark this descriptor's buffers as done with DMA */
39619c73b3eSJason Wang 	vq->heads[ubuf->desc].len = success ?
39719c73b3eSJason Wang 		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
3980ad8b480SMichael S. Tsirkin 	cnt = vhost_net_ubuf_put(ubufs);
39919c73b3eSJason Wang 
40024eb21a1SMichael S. Tsirkin 	/*
40124eb21a1SMichael S. Tsirkin 	 * Trigger polling thread if guest stopped submitting new buffers:
4020ad8b480SMichael S. Tsirkin 	 * in this case, the refcount after decrement will eventually reach 1.
40324eb21a1SMichael S. Tsirkin 	 * We also trigger polling periodically after each 16 packets
40424eb21a1SMichael S. Tsirkin 	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
40524eb21a1SMichael S. Tsirkin 	 * less than 10% of times).
40624eb21a1SMichael S. Tsirkin 	 */
4070ad8b480SMichael S. Tsirkin 	if (cnt <= 1 || !(cnt % 16))
408b211616dSMichael S. Tsirkin 		vhost_poll_queue(&vq->poll);
409b0c057caSMichael S. Tsirkin 
410b0c057caSMichael S. Tsirkin 	rcu_read_unlock_bh();
411b211616dSMichael S. Tsirkin }
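/* vhost_zerocopy_callback() above runs when the lower device releases a
 * zerocopy buffer: it records success or failure in heads[].len, drops the
 * ubufs refcount, and kicks the TX poll handler either when no buffers
 * remain outstanding (refcount at or below 1) or every 16 completions, so
 * used entries keep flowing to the guest.
 */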
412b211616dSMichael S. Tsirkin 
41303088137SJason Wang static inline unsigned long busy_clock(void)
41403088137SJason Wang {
41503088137SJason Wang 	return local_clock() >> 10;
41603088137SJason Wang }
41703088137SJason Wang 
418027b1760SToshiaki Makita static bool vhost_can_busy_poll(unsigned long endtime)
41903088137SJason Wang {
420027b1760SToshiaki Makita 	return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
421027b1760SToshiaki Makita 		      !signal_pending(current));
42203088137SJason Wang }
42303088137SJason Wang 
4248241a1e4SJason Wang static void vhost_net_disable_vq(struct vhost_net *n,
4258241a1e4SJason Wang 				 struct vhost_virtqueue *vq)
4268241a1e4SJason Wang {
4278241a1e4SJason Wang 	struct vhost_net_virtqueue *nvq =
4288241a1e4SJason Wang 		container_of(vq, struct vhost_net_virtqueue, vq);
4298241a1e4SJason Wang 	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
430247643f8SEugenio Pérez 	if (!vhost_vq_get_backend(vq))
4318241a1e4SJason Wang 		return;
4328241a1e4SJason Wang 	vhost_poll_stop(poll);
4338241a1e4SJason Wang }
4348241a1e4SJason Wang 
4358241a1e4SJason Wang static int vhost_net_enable_vq(struct vhost_net *n,
4368241a1e4SJason Wang 				struct vhost_virtqueue *vq)
4378241a1e4SJason Wang {
4388241a1e4SJason Wang 	struct vhost_net_virtqueue *nvq =
4398241a1e4SJason Wang 		container_of(vq, struct vhost_net_virtqueue, vq);
4408241a1e4SJason Wang 	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
4418241a1e4SJason Wang 	struct socket *sock;
4428241a1e4SJason Wang 
443247643f8SEugenio Pérez 	sock = vhost_vq_get_backend(vq);
4448241a1e4SJason Wang 	if (!sock)
4458241a1e4SJason Wang 		return 0;
4468241a1e4SJason Wang 
4478241a1e4SJason Wang 	return vhost_poll_start(poll, sock->file);
4488241a1e4SJason Wang }
4498241a1e4SJason Wang 
4504afb52c2SJason Wang static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
4514afb52c2SJason Wang {
4524afb52c2SJason Wang 	struct vhost_virtqueue *vq = &nvq->vq;
4534afb52c2SJason Wang 	struct vhost_dev *dev = vq->dev;
4544afb52c2SJason Wang 
4554afb52c2SJason Wang 	if (!nvq->done_idx)
4564afb52c2SJason Wang 		return;
4574afb52c2SJason Wang 
4584afb52c2SJason Wang 	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
4594afb52c2SJason Wang 	nvq->done_idx = 0;
4604afb52c2SJason Wang }
4614afb52c2SJason Wang 
4620a0be13bSJason Wang static void vhost_tx_batch(struct vhost_net *net,
4630a0be13bSJason Wang 			   struct vhost_net_virtqueue *nvq,
4640a0be13bSJason Wang 			   struct socket *sock,
4650a0be13bSJason Wang 			   struct msghdr *msghdr)
4660a0be13bSJason Wang {
4670a0be13bSJason Wang 	struct tun_msg_ctl ctl = {
4680a0be13bSJason Wang 		.type = TUN_MSG_PTR,
4690a0be13bSJason Wang 		.num = nvq->batched_xdp,
4700a0be13bSJason Wang 		.ptr = nvq->xdp,
4710a0be13bSJason Wang 	};
4723c4cea8fSPaolo Abeni 	int i, err;
4730a0be13bSJason Wang 
4740a0be13bSJason Wang 	if (nvq->batched_xdp == 0)
4750a0be13bSJason Wang 		goto signal_used;
4760a0be13bSJason Wang 
4770a0be13bSJason Wang 	msghdr->msg_control = &ctl;
47874a335a0SHarold Huang 	msghdr->msg_controllen = sizeof(ctl);
4790a0be13bSJason Wang 	err = sock->ops->sendmsg(sock, msghdr, 0);
4800a0be13bSJason Wang 	if (unlikely(err < 0)) {
4810a0be13bSJason Wang 		vq_err(&nvq->vq, "Fail to batch sending packets\n");
4823c4cea8fSPaolo Abeni 
4833c4cea8fSPaolo Abeni 		/* free pages owned by XDP; since this is an unlikely error path,
4843c4cea8fSPaolo Abeni 		 * keep it simple and avoid more complex bulk update for the
4853c4cea8fSPaolo Abeni 		 * used pages
4863c4cea8fSPaolo Abeni 		 */
4873c4cea8fSPaolo Abeni 		for (i = 0; i < nvq->batched_xdp; ++i)
4883c4cea8fSPaolo Abeni 			put_page(virt_to_head_page(nvq->xdp[i].data));
4893c4cea8fSPaolo Abeni 		nvq->batched_xdp = 0;
4903c4cea8fSPaolo Abeni 		nvq->done_idx = 0;
4910a0be13bSJason Wang 		return;
4920a0be13bSJason Wang 	}
4930a0be13bSJason Wang 
4940a0be13bSJason Wang signal_used:
4950a0be13bSJason Wang 	vhost_net_signal_used(nvq);
4960a0be13bSJason Wang 	nvq->batched_xdp = 0;
4970a0be13bSJason Wang }
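/* vhost_tx_batch() flushes the XDP buffers accumulated by
 * vhost_net_build_xdp(): a single sendmsg() with a TUN_MSG_PTR control hands
 * up to VHOST_NET_BATCH frames to the backend. On failure the pages backing
 * the batched frames are released and done_idx is cleared, so no used
 * entries are reported for the dropped packets.
 */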
4980a0be13bSJason Wang 
499dc151282STonghao Zhang static int sock_has_rx_data(struct socket *sock)
500dc151282STonghao Zhang {
501dc151282STonghao Zhang 	if (unlikely(!sock))
502dc151282STonghao Zhang 		return 0;
503dc151282STonghao Zhang 
504dc151282STonghao Zhang 	if (sock->ops->peek_len)
505dc151282STonghao Zhang 		return sock->ops->peek_len(sock);
506dc151282STonghao Zhang 
507dc151282STonghao Zhang 	return skb_queue_empty(&sock->sk->sk_receive_queue);
508dc151282STonghao Zhang }
509dc151282STonghao Zhang 
510dc151282STonghao Zhang static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
511dc151282STonghao Zhang 					  struct vhost_virtqueue *vq)
512dc151282STonghao Zhang {
513dc151282STonghao Zhang 	if (!vhost_vq_avail_empty(&net->dev, vq)) {
514dc151282STonghao Zhang 		vhost_poll_queue(&vq->poll);
515dc151282STonghao Zhang 	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
516dc151282STonghao Zhang 		vhost_disable_notify(&net->dev, vq);
517dc151282STonghao Zhang 		vhost_poll_queue(&vq->poll);
518dc151282STonghao Zhang 	}
519dc151282STonghao Zhang }
520dc151282STonghao Zhang 
521dc151282STonghao Zhang static void vhost_net_busy_poll(struct vhost_net *net,
522dc151282STonghao Zhang 				struct vhost_virtqueue *rvq,
523dc151282STonghao Zhang 				struct vhost_virtqueue *tvq,
524dc151282STonghao Zhang 				bool *busyloop_intr,
525dc151282STonghao Zhang 				bool poll_rx)
526dc151282STonghao Zhang {
527dc151282STonghao Zhang 	unsigned long busyloop_timeout;
528dc151282STonghao Zhang 	unsigned long endtime;
529dc151282STonghao Zhang 	struct socket *sock;
530dc151282STonghao Zhang 	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
531dc151282STonghao Zhang 
532476e8ba7SJason Wang 	/* Try to hold the vq mutex of the paired virtqueue. We can't
533476e8ba7SJason Wang 	 * use mutex_lock() here since we cannot guarantee a
534476e8ba7SJason Wang 	 * consistent lock ordering.
535476e8ba7SJason Wang 	 */
536476e8ba7SJason Wang 	if (!mutex_trylock(&vq->mutex))
537476e8ba7SJason Wang 		return;
538476e8ba7SJason Wang 
539dc151282STonghao Zhang 	vhost_disable_notify(&net->dev, vq);
540247643f8SEugenio Pérez 	sock = vhost_vq_get_backend(rvq);
541dc151282STonghao Zhang 
542dc151282STonghao Zhang 	busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
543dc151282STonghao Zhang 				     tvq->busyloop_timeout;
544dc151282STonghao Zhang 
545dc151282STonghao Zhang 	preempt_disable();
546dc151282STonghao Zhang 	endtime = busy_clock() + busyloop_timeout;
547dc151282STonghao Zhang 
548dc151282STonghao Zhang 	while (vhost_can_busy_poll(endtime)) {
549dc151282STonghao Zhang 		if (vhost_vq_has_work(vq)) {
550dc151282STonghao Zhang 			*busyloop_intr = true;
551dc151282STonghao Zhang 			break;
552dc151282STonghao Zhang 		}
553dc151282STonghao Zhang 
554dc151282STonghao Zhang 		if ((sock_has_rx_data(sock) &&
555dc151282STonghao Zhang 		     !vhost_vq_avail_empty(&net->dev, rvq)) ||
556dc151282STonghao Zhang 		    !vhost_vq_avail_empty(&net->dev, tvq))
557dc151282STonghao Zhang 			break;
558dc151282STonghao Zhang 
559dc151282STonghao Zhang 		cpu_relax();
560dc151282STonghao Zhang 	}
561dc151282STonghao Zhang 
562dc151282STonghao Zhang 	preempt_enable();
563dc151282STonghao Zhang 
564dc151282STonghao Zhang 	if (poll_rx || sock_has_rx_data(sock))
565dc151282STonghao Zhang 		vhost_net_busy_poll_try_queue(net, vq);
566dc151282STonghao Zhang 	else if (!poll_rx) /* On tx here, sock has no rx data. */
567dc151282STonghao Zhang 		vhost_enable_notify(&net->dev, rvq);
568dc151282STonghao Zhang 
569dc151282STonghao Zhang 	mutex_unlock(&vq->mutex);
570dc151282STonghao Zhang }
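/* vhost_net_busy_poll() spins with preemption disabled until the busyloop
 * timeout expires, other vhost work becomes pending, or data/avail buffers
 * show up on either side. The paired vq mutex is taken with mutex_trylock()
 * only, since a consistent lock order between the two virtqueues cannot be
 * guaranteed here (see the comment above).
 */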
571dc151282STonghao Zhang 
57203088137SJason Wang static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
573441abde4STonghao Zhang 				    struct vhost_net_virtqueue *tnvq,
574027b1760SToshiaki Makita 				    unsigned int *out_num, unsigned int *in_num,
5750a0be13bSJason Wang 				    struct msghdr *msghdr, bool *busyloop_intr)
57603088137SJason Wang {
577441abde4STonghao Zhang 	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
578441abde4STonghao Zhang 	struct vhost_virtqueue *rvq = &rnvq->vq;
579441abde4STonghao Zhang 	struct vhost_virtqueue *tvq = &tnvq->vq;
580441abde4STonghao Zhang 
581441abde4STonghao Zhang 	int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
58203088137SJason Wang 				  out_num, in_num, NULL, NULL);
58303088137SJason Wang 
584441abde4STonghao Zhang 	if (r == tvq->num && tvq->busyloop_timeout) {
5850a0be13bSJason Wang 		/* Flush batched packets first */
586247643f8SEugenio Pérez 		if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
587247643f8SEugenio Pérez 			vhost_tx_batch(net, tnvq,
588247643f8SEugenio Pérez 				       vhost_vq_get_backend(tvq),
589247643f8SEugenio Pérez 				       msghdr);
590441abde4STonghao Zhang 
591441abde4STonghao Zhang 		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
592441abde4STonghao Zhang 
593441abde4STonghao Zhang 		r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
59403088137SJason Wang 				      out_num, in_num, NULL, NULL);
59503088137SJason Wang 	}
59603088137SJason Wang 
59703088137SJason Wang 	return r;
59803088137SJason Wang }
59903088137SJason Wang 
6000ed005ceSJason Wang static bool vhost_exceeds_maxpend(struct vhost_net *net)
6010ed005ceSJason Wang {
6020ed005ceSJason Wang 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
6030ed005ceSJason Wang 	struct vhost_virtqueue *vq = &nvq->vq;
6040ed005ceSJason Wang 
6051e6f7453SWillem de Bruijn 	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
6061e6f7453SWillem de Bruijn 	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
6070ed005ceSJason Wang }
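/* The modular expression above counts the [done_idx, upend_idx) entries that
 * are still in flight; zerocopy is bypassed once more than
 * min(VHOST_MAX_PEND, vq->num / 4) transmits are outstanding.
 */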
6080ed005ceSJason Wang 
609b0d0ea50SJason Wang static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
610b0d0ea50SJason Wang 			    size_t hdr_size, int out)
611b0d0ea50SJason Wang {
612b0d0ea50SJason Wang 	/* Skip header. TODO: support TSO. */
613b0d0ea50SJason Wang 	size_t len = iov_length(vq->iov, out);
614b0d0ea50SJason Wang 
615de4eda9dSAl Viro 	iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len);
616b0d0ea50SJason Wang 	iov_iter_advance(iter, hdr_size);
617b0d0ea50SJason Wang 
618b0d0ea50SJason Wang 	return iov_iter_count(iter);
619b0d0ea50SJason Wang }
620b0d0ea50SJason Wang 
621a2a91a13SJason Wang static int get_tx_bufs(struct vhost_net *net,
622a2a91a13SJason Wang 		       struct vhost_net_virtqueue *nvq,
623a2a91a13SJason Wang 		       struct msghdr *msg,
624a2a91a13SJason Wang 		       unsigned int *out, unsigned int *in,
625a2a91a13SJason Wang 		       size_t *len, bool *busyloop_intr)
626a2a91a13SJason Wang {
627a2a91a13SJason Wang 	struct vhost_virtqueue *vq = &nvq->vq;
628a2a91a13SJason Wang 	int ret;
629a2a91a13SJason Wang 
6300a0be13bSJason Wang 	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);
6314afb52c2SJason Wang 
632a2a91a13SJason Wang 	if (ret < 0 || ret == vq->num)
633a2a91a13SJason Wang 		return ret;
634a2a91a13SJason Wang 
635a2a91a13SJason Wang 	if (*in) {
636a2a91a13SJason Wang 		vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n",
637a2a91a13SJason Wang 			*out, *in);
638a2a91a13SJason Wang 		return -EFAULT;
639a2a91a13SJason Wang 	}
640a2a91a13SJason Wang 
641a2a91a13SJason Wang 	/* Sanity check */
642a2a91a13SJason Wang 	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
643a2a91a13SJason Wang 	if (*len == 0) {
644a2a91a13SJason Wang 		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
645a2a91a13SJason Wang 			*len, nvq->vhost_hlen);
646a2a91a13SJason Wang 		return -EFAULT;
647a2a91a13SJason Wang 	}
648a2a91a13SJason Wang 
649a2a91a13SJason Wang 	return ret;
650a2a91a13SJason Wang }
651a2a91a13SJason Wang 
652c92a8a8cSJason Wang static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
653c92a8a8cSJason Wang {
654c92a8a8cSJason Wang 	return total_len < VHOST_NET_WEIGHT &&
655c92a8a8cSJason Wang 	       !vhost_vq_avail_empty(vq->dev, vq);
656c92a8a8cSJason Wang }
657c92a8a8cSJason Wang 
658e4dab1e6SJason Wang static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
659e4dab1e6SJason Wang 				       struct page_frag *pfrag, gfp_t gfp)
660e4dab1e6SJason Wang {
661e4dab1e6SJason Wang 	if (pfrag->page) {
662e4dab1e6SJason Wang 		if (pfrag->offset + sz <= pfrag->size)
663e4dab1e6SJason Wang 			return true;
664e4dab1e6SJason Wang 		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
665e4dab1e6SJason Wang 	}
666e4dab1e6SJason Wang 
667e4dab1e6SJason Wang 	pfrag->offset = 0;
668e4dab1e6SJason Wang 	net->refcnt_bias = 0;
669e4dab1e6SJason Wang 	if (SKB_FRAG_PAGE_ORDER) {
670e4dab1e6SJason Wang 		/* Avoid direct reclaim but allow kswapd to wake */
671e4dab1e6SJason Wang 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
672e4dab1e6SJason Wang 					  __GFP_COMP | __GFP_NOWARN |
673e4dab1e6SJason Wang 					  __GFP_NORETRY,
674e4dab1e6SJason Wang 					  SKB_FRAG_PAGE_ORDER);
675e4dab1e6SJason Wang 		if (likely(pfrag->page)) {
676e4dab1e6SJason Wang 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
677e4dab1e6SJason Wang 			goto done;
678e4dab1e6SJason Wang 		}
679e4dab1e6SJason Wang 	}
680e4dab1e6SJason Wang 	pfrag->page = alloc_page(gfp);
681e4dab1e6SJason Wang 	if (likely(pfrag->page)) {
682e4dab1e6SJason Wang 		pfrag->size = PAGE_SIZE;
683e4dab1e6SJason Wang 		goto done;
684e4dab1e6SJason Wang 	}
685e4dab1e6SJason Wang 	return false;
686e4dab1e6SJason Wang 
687e4dab1e6SJason Wang done:
688e4dab1e6SJason Wang 	net->refcnt_bias = USHRT_MAX;
689e4dab1e6SJason Wang 	page_ref_add(pfrag->page, USHRT_MAX - 1);
690e4dab1e6SJason Wang 	return true;
691e4dab1e6SJason Wang }
692e4dab1e6SJason Wang 
6930a0be13bSJason Wang #define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
6940a0be13bSJason Wang 
6950a0be13bSJason Wang static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
6960a0be13bSJason Wang 			       struct iov_iter *from)
6970a0be13bSJason Wang {
6980a0be13bSJason Wang 	struct vhost_virtqueue *vq = &nvq->vq;
699e4dab1e6SJason Wang 	struct vhost_net *net = container_of(vq->dev, struct vhost_net,
700e4dab1e6SJason Wang 					     dev);
701247643f8SEugenio Pérez 	struct socket *sock = vhost_vq_get_backend(vq);
702e4dab1e6SJason Wang 	struct page_frag *alloc_frag = &net->page_frag;
7030a0be13bSJason Wang 	struct virtio_net_hdr *gso;
7040a0be13bSJason Wang 	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
7050a0be13bSJason Wang 	struct tun_xdp_hdr *hdr;
7060a0be13bSJason Wang 	size_t len = iov_iter_count(from);
7070a0be13bSJason Wang 	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
7080a0be13bSJason Wang 	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
7090a0be13bSJason Wang 	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
7100a0be13bSJason Wang 	int sock_hlen = nvq->sock_hlen;
7110a0be13bSJason Wang 	void *buf;
7120a0be13bSJason Wang 	int copied;
7130a0be13bSJason Wang 
7140a0be13bSJason Wang 	if (unlikely(len < nvq->sock_hlen))
7150a0be13bSJason Wang 		return -EFAULT;
7160a0be13bSJason Wang 
7170a0be13bSJason Wang 	if (SKB_DATA_ALIGN(len + pad) +
7180a0be13bSJason Wang 	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
7190a0be13bSJason Wang 		return -ENOSPC;
7200a0be13bSJason Wang 
7210a0be13bSJason Wang 	buflen += SKB_DATA_ALIGN(len + pad);
7220a0be13bSJason Wang 	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
723e4dab1e6SJason Wang 	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
724e4dab1e6SJason Wang 						 alloc_frag, GFP_KERNEL)))
7250a0be13bSJason Wang 		return -ENOMEM;
7260a0be13bSJason Wang 
7270a0be13bSJason Wang 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
7280a0be13bSJason Wang 	copied = copy_page_from_iter(alloc_frag->page,
7290a0be13bSJason Wang 				     alloc_frag->offset +
7300a0be13bSJason Wang 				     offsetof(struct tun_xdp_hdr, gso),
7310a0be13bSJason Wang 				     sock_hlen, from);
7320a0be13bSJason Wang 	if (copied != sock_hlen)
7330a0be13bSJason Wang 		return -EFAULT;
7340a0be13bSJason Wang 
7350a0be13bSJason Wang 	hdr = buf;
7360a0be13bSJason Wang 	gso = &hdr->gso;
7370a0be13bSJason Wang 
7380a0be13bSJason Wang 	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
7390a0be13bSJason Wang 	    vhost16_to_cpu(vq, gso->csum_start) +
7400a0be13bSJason Wang 	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
7410a0be13bSJason Wang 	    vhost16_to_cpu(vq, gso->hdr_len)) {
7420a0be13bSJason Wang 		gso->hdr_len = cpu_to_vhost16(vq,
7430a0be13bSJason Wang 			       vhost16_to_cpu(vq, gso->csum_start) +
7440a0be13bSJason Wang 			       vhost16_to_cpu(vq, gso->csum_offset) + 2);
7450a0be13bSJason Wang 
7460a0be13bSJason Wang 		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
7470a0be13bSJason Wang 			return -EINVAL;
7480a0be13bSJason Wang 	}
7490a0be13bSJason Wang 
7500a0be13bSJason Wang 	len -= sock_hlen;
7510a0be13bSJason Wang 	copied = copy_page_from_iter(alloc_frag->page,
7520a0be13bSJason Wang 				     alloc_frag->offset + pad,
7530a0be13bSJason Wang 				     len, from);
7540a0be13bSJason Wang 	if (copied != len)
7550a0be13bSJason Wang 		return -EFAULT;
7560a0be13bSJason Wang 
757224bf7dbSMatteo Croce 	xdp_init_buff(xdp, buflen, NULL);
758224bf7dbSMatteo Croce 	xdp_prepare_buff(xdp, buf, pad, len, true);
7590a0be13bSJason Wang 	hdr->buflen = buflen;
7600a0be13bSJason Wang 
761e4dab1e6SJason Wang 	--net->refcnt_bias;
7620a0be13bSJason Wang 	alloc_frag->offset += buflen;
7630a0be13bSJason Wang 
7640a0be13bSJason Wang 	++nvq->batched_xdp;
7650a0be13bSJason Wang 
7660a0be13bSJason Wang 	return 0;
7670a0be13bSJason Wang }
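/* vhost_net_build_xdp() copies one TX descriptor into a page fragment laid
 * out as a tun_xdp_hdr (which embeds the virtio-net header), aligned
 * headroom, the packet data, and tailroom for struct skb_shared_info. It
 * also fixes up the GSO hdr_len so it at least covers the checksum fields,
 * rejecting the packet if that exceeds the payload. -ENOSPC means the frame
 * will not fit in a page, and the caller falls back to the non-batched path.
 */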
7680a0be13bSJason Wang 
7690d20bdf3SJason Wang static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
7703a4d5c94SMichael S. Tsirkin {
7712839400fSAsias He 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
77281f95a55SMichael S. Tsirkin 	struct vhost_virtqueue *vq = &nvq->vq;
77398a527aaSAl Viro 	unsigned out, in;
774d5675bd2SMichael S. Tsirkin 	int head;
7753a4d5c94SMichael S. Tsirkin 	struct msghdr msg = {
7763a4d5c94SMichael S. Tsirkin 		.msg_name = NULL,
7773a4d5c94SMichael S. Tsirkin 		.msg_namelen = 0,
7783a4d5c94SMichael S. Tsirkin 		.msg_control = NULL,
7793a4d5c94SMichael S. Tsirkin 		.msg_controllen = 0,
7803a4d5c94SMichael S. Tsirkin 		.msg_flags = MSG_DONTWAIT,
7813a4d5c94SMichael S. Tsirkin 	};
7823a4d5c94SMichael S. Tsirkin 	size_t len, total_len = 0;
78370181d51SJason Wang 	int err;
784a2ac9990Shaibinzhang(张海斌) 	int sent_pkts = 0;
7850a0be13bSJason Wang 	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);
78628457ee6SArnd Bergmann 
787e2412c07SJason Wang 	do {
7880d20bdf3SJason Wang 		bool busyloop_intr = false;
7892e26af79SAsias He 
7900a0be13bSJason Wang 		if (nvq->done_idx == VHOST_NET_BATCH)
7910a0be13bSJason Wang 			vhost_tx_batch(net, nvq, sock, &msg);
7920a0be13bSJason Wang 
7930d20bdf3SJason Wang 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
7940d20bdf3SJason Wang 				   &busyloop_intr);
7950d20bdf3SJason Wang 		/* On error, stop handling until the next kick. */
7960d20bdf3SJason Wang 		if (unlikely(head < 0))
7970d20bdf3SJason Wang 			break;
7980d20bdf3SJason Wang 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
7990d20bdf3SJason Wang 		if (head == vq->num) {
8000d20bdf3SJason Wang 			if (unlikely(busyloop_intr)) {
8010d20bdf3SJason Wang 				vhost_poll_queue(&vq->poll);
8020d20bdf3SJason Wang 			} else if (unlikely(vhost_enable_notify(&net->dev,
8030d20bdf3SJason Wang 								vq))) {
8048ea8cf89SMichael S. Tsirkin 				vhost_disable_notify(&net->dev, vq);
8050d20bdf3SJason Wang 				continue;
8060d20bdf3SJason Wang 			}
8070d20bdf3SJason Wang 			break;
8080d20bdf3SJason Wang 		}
8093a4d5c94SMichael S. Tsirkin 
8100d20bdf3SJason Wang 		total_len += len;
8110a0be13bSJason Wang 
8120a0be13bSJason Wang 		/* For simplicity, TX batching is only enabled if
8130a0be13bSJason Wang 		 * sndbuf is unlimited.
8140a0be13bSJason Wang 		 */
8150a0be13bSJason Wang 		if (sock_can_batch) {
8160a0be13bSJason Wang 			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
8170a0be13bSJason Wang 			if (!err) {
8180a0be13bSJason Wang 				goto done;
8190a0be13bSJason Wang 			} else if (unlikely(err != -ENOSPC)) {
8200a0be13bSJason Wang 				vhost_tx_batch(net, nvq, sock, &msg);
8210a0be13bSJason Wang 				vhost_discard_vq_desc(vq, 1);
8220a0be13bSJason Wang 				vhost_net_enable_vq(net, vq);
8230a0be13bSJason Wang 				break;
8240a0be13bSJason Wang 			}
8250a0be13bSJason Wang 
8260a0be13bSJason Wang 			/* We can't build an XDP buff, so go for the
8270a0be13bSJason Wang 			 * single packet path, but flush batched
8280a0be13bSJason Wang 			 * packets first.
8290a0be13bSJason Wang 			 */
8300a0be13bSJason Wang 			vhost_tx_batch(net, nvq, sock, &msg);
8310a0be13bSJason Wang 			msg.msg_control = NULL;
8320a0be13bSJason Wang 		} else {
8330d20bdf3SJason Wang 			if (tx_can_batch(vq, total_len))
8340d20bdf3SJason Wang 				msg.msg_flags |= MSG_MORE;
8350d20bdf3SJason Wang 			else
8360d20bdf3SJason Wang 				msg.msg_flags &= ~MSG_MORE;
8370a0be13bSJason Wang 		}
8380d20bdf3SJason Wang 
8390d20bdf3SJason Wang 		err = sock->ops->sendmsg(sock, &msg, len);
8400d20bdf3SJason Wang 		if (unlikely(err < 0)) {
841dc9c9e72SYunjian Wang 			if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
8420d20bdf3SJason Wang 				vhost_discard_vq_desc(vq, 1);
8430d20bdf3SJason Wang 				vhost_net_enable_vq(net, vq);
8440d20bdf3SJason Wang 				break;
8450d20bdf3SJason Wang 			}
846dc9c9e72SYunjian Wang 			pr_debug("Fail to send packet: err %d", err);
847dc9c9e72SYunjian Wang 		} else if (unlikely(err != len))
8480d20bdf3SJason Wang 			pr_debug("Truncated TX packet: len %d != %zd\n",
8490d20bdf3SJason Wang 				 err, len);
8500a0be13bSJason Wang done:
8510a0be13bSJason Wang 		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
8520a0be13bSJason Wang 		vq->heads[nvq->done_idx].len = 0;
8530a0be13bSJason Wang 		++nvq->done_idx;
854e2412c07SJason Wang 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
8554afb52c2SJason Wang 
8560a0be13bSJason Wang 	vhost_tx_batch(net, nvq, sock, &msg);
8570d20bdf3SJason Wang }
8580d20bdf3SJason Wang 
8590d20bdf3SJason Wang static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
8600d20bdf3SJason Wang {
8610d20bdf3SJason Wang 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
8620d20bdf3SJason Wang 	struct vhost_virtqueue *vq = &nvq->vq;
8630d20bdf3SJason Wang 	unsigned out, in;
8640d20bdf3SJason Wang 	int head;
8650d20bdf3SJason Wang 	struct msghdr msg = {
8660d20bdf3SJason Wang 		.msg_name = NULL,
8670d20bdf3SJason Wang 		.msg_namelen = 0,
8680d20bdf3SJason Wang 		.msg_control = NULL,
8690d20bdf3SJason Wang 		.msg_controllen = 0,
8700d20bdf3SJason Wang 		.msg_flags = MSG_DONTWAIT,
8710d20bdf3SJason Wang 	};
872fe8dd45bSJason Wang 	struct tun_msg_ctl ctl;
8730d20bdf3SJason Wang 	size_t len, total_len = 0;
8740d20bdf3SJason Wang 	int err;
8753f649ab7SKees Cook 	struct vhost_net_ubuf_ref *ubufs;
876dfff202bSPavel Begunkov 	struct ubuf_info_msgzc *ubuf;
8770d20bdf3SJason Wang 	bool zcopy_used;
8780d20bdf3SJason Wang 	int sent_pkts = 0;
8793a4d5c94SMichael S. Tsirkin 
880e2412c07SJason Wang 	do {
881027b1760SToshiaki Makita 		bool busyloop_intr;
882027b1760SToshiaki Makita 
883bab632d6SMichael S. Tsirkin 		/* Release DMAs done buffers first */
884eaae8132SMichael S. Tsirkin 		vhost_zerocopy_signal_used(net, vq);
885bab632d6SMichael S. Tsirkin 
886027b1760SToshiaki Makita 		busyloop_intr = false;
887a2a91a13SJason Wang 		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
888652e4f3eSJason Wang 				   &busyloop_intr);
889d5675bd2SMichael S. Tsirkin 		/* On error, stop handling until the next kick. */
8907b3384fcSMichael S. Tsirkin 		if (unlikely(head < 0))
891d5675bd2SMichael S. Tsirkin 			break;
8923a4d5c94SMichael S. Tsirkin 		/* Nothing new?  Wait for eventfd to tell us they refilled. */
8933a4d5c94SMichael S. Tsirkin 		if (head == vq->num) {
894027b1760SToshiaki Makita 			if (unlikely(busyloop_intr)) {
895027b1760SToshiaki Makita 				vhost_poll_queue(&vq->poll);
896027b1760SToshiaki Makita 			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
8978ea8cf89SMichael S. Tsirkin 				vhost_disable_notify(&net->dev, vq);
8983a4d5c94SMichael S. Tsirkin 				continue;
8993a4d5c94SMichael S. Tsirkin 			}
9003a4d5c94SMichael S. Tsirkin 			break;
9013a4d5c94SMichael S. Tsirkin 		}
902ce21a029SJason Wang 
9030d20bdf3SJason Wang 		zcopy_used = len >= VHOST_GOODCOPY_LEN
9041e6f7453SWillem de Bruijn 			     && !vhost_exceeds_maxpend(net)
905ce21a029SJason Wang 			     && vhost_net_tx_select_zcopy(net);
906cedb9bdcSMichael S. Tsirkin 
907bab632d6SMichael S. Tsirkin 		/* use msg_control to pass vhost zerocopy ubuf info to skb */
908cedb9bdcSMichael S. Tsirkin 		if (zcopy_used) {
9092839400fSAsias He 			ubuf = nvq->ubuf_info + nvq->upend_idx;
9108b38694aSMichael S. Tsirkin 			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
911ce21a029SJason Wang 			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
9122839400fSAsias He 			ubuf->ctx = nvq->ubufs;
9132839400fSAsias He 			ubuf->desc = nvq->upend_idx;
914dfff202bSPavel Begunkov 			ubuf->ubuf.callback = vhost_zerocopy_callback;
915dfff202bSPavel Begunkov 			ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG;
916dfff202bSPavel Begunkov 			refcount_set(&ubuf->ubuf.refcnt, 1);
917fe8dd45bSJason Wang 			msg.msg_control = &ctl;
918fe8dd45bSJason Wang 			ctl.type = TUN_MSG_UBUF;
919dfff202bSPavel Begunkov 			ctl.ptr = &ubuf->ubuf;
920fe8dd45bSJason Wang 			msg.msg_controllen = sizeof(ctl);
9212839400fSAsias He 			ubufs = nvq->ubufs;
9220ad8b480SMichael S. Tsirkin 			atomic_inc(&ubufs->refcount);
9232839400fSAsias He 			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
924ce21a029SJason Wang 		} else {
9254364d5f9SJason Wang 			msg.msg_control = NULL;
926ce21a029SJason Wang 			ubufs = NULL;
927ce21a029SJason Wang 		}
9280ed005ceSJason Wang 		total_len += len;
929c92a8a8cSJason Wang 		if (tx_can_batch(vq, total_len) &&
9300ed005ceSJason Wang 		    likely(!vhost_exceeds_maxpend(net))) {
9310ed005ceSJason Wang 			msg.msg_flags |= MSG_MORE;
9320ed005ceSJason Wang 		} else {
9330ed005ceSJason Wang 			msg.msg_flags &= ~MSG_MORE;
9340ed005ceSJason Wang 		}
9350ed005ceSJason Wang 
9361b784140SYing Xue 		err = sock->ops->sendmsg(sock, &msg, len);
9373a4d5c94SMichael S. Tsirkin 		if (unlikely(err < 0)) {
938*1f5d2e3bSAndrey Smetanin 			bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS;
939*1f5d2e3bSAndrey Smetanin 
940cedb9bdcSMichael S. Tsirkin 			if (zcopy_used) {
94101e31beaSYunjian Wang 				if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS)
942fe729a57SAsias He 					vhost_net_ubuf_put(ubufs);
943*1f5d2e3bSAndrey Smetanin 				if (retry)
9442839400fSAsias He 					nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
9452839400fSAsias He 						% UIO_MAXIOV;
946*1f5d2e3bSAndrey Smetanin 				else
947*1f5d2e3bSAndrey Smetanin 					vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
948bab632d6SMichael S. Tsirkin 			}
949*1f5d2e3bSAndrey Smetanin 			if (retry) {
9508dd014adSDavid Stevens 				vhost_discard_vq_desc(vq, 1);
951feb8892cSJason Wang 				vhost_net_enable_vq(net, vq);
9523a4d5c94SMichael S. Tsirkin 				break;
9533a4d5c94SMichael S. Tsirkin 			}
954dc9c9e72SYunjian Wang 			pr_debug("Fail to send packet: err %d", err);
955dc9c9e72SYunjian Wang 		} else if (unlikely(err != len))
95695c0ec6aSMichael S. Tsirkin 			pr_debug("Truncated TX packet: "
9573a4d5c94SMichael S. Tsirkin 				 " len %d != %zd\n", err, len);
958cedb9bdcSMichael S. Tsirkin 		if (!zcopy_used)
9593a4d5c94SMichael S. Tsirkin 			vhost_add_used_and_signal(&net->dev, vq, head, 0);
960c8fb217aSJason Wang 		else
961eaae8132SMichael S. Tsirkin 			vhost_zerocopy_signal_used(net, vq);
962eaae8132SMichael S. Tsirkin 		vhost_net_tx_packet(net);
963e2412c07SJason Wang 	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
9643a4d5c94SMichael S. Tsirkin }
9650d20bdf3SJason Wang 
9660d20bdf3SJason Wang /* Expects to be always run from workqueue - which acts as
9670d20bdf3SJason Wang  * read-side critical section for our kind of RCU. */
9680d20bdf3SJason Wang static void handle_tx(struct vhost_net *net)
9690d20bdf3SJason Wang {
9700d20bdf3SJason Wang 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
9710d20bdf3SJason Wang 	struct vhost_virtqueue *vq = &nvq->vq;
9720d20bdf3SJason Wang 	struct socket *sock;
9730d20bdf3SJason Wang 
974a6a67a2fSTonghao Zhang 	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
975247643f8SEugenio Pérez 	sock = vhost_vq_get_backend(vq);
9760d20bdf3SJason Wang 	if (!sock)
9770d20bdf3SJason Wang 		goto out;
9780d20bdf3SJason Wang 
9799b5e830bSJason Wang 	if (!vq_meta_prefetch(vq))
9800d20bdf3SJason Wang 		goto out;
9810d20bdf3SJason Wang 
9820d20bdf3SJason Wang 	vhost_disable_notify(&net->dev, vq);
9830d20bdf3SJason Wang 	vhost_net_disable_vq(net, vq);
9840d20bdf3SJason Wang 
9850d20bdf3SJason Wang 	if (vhost_sock_zcopy(sock))
9860d20bdf3SJason Wang 		handle_tx_zerocopy(net, sock);
9870d20bdf3SJason Wang 	else
9880d20bdf3SJason Wang 		handle_tx_copy(net, sock);
9890d20bdf3SJason Wang 
9902e26af79SAsias He out:
9913a4d5c94SMichael S. Tsirkin 	mutex_unlock(&vq->mutex);
9923a4d5c94SMichael S. Tsirkin }
9933a4d5c94SMichael S. Tsirkin 
994c67df11fSJason Wang static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
9958dd014adSDavid Stevens {
9968dd014adSDavid Stevens 	struct sk_buff *head;
9978dd014adSDavid Stevens 	int len = 0;
998783e3988SJason Wang 	unsigned long flags;
9998dd014adSDavid Stevens 
10005990a305SJason Wang 	if (rvq->rx_ring)
1001c67df11fSJason Wang 		return vhost_net_buf_peek(rvq);
10021576d986SJason Wang 
1003783e3988SJason Wang 	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
10048dd014adSDavid Stevens 	head = skb_peek(&sk->sk_receive_queue);
1005c53cff5eSBasil Gor 	if (likely(head)) {
10068dd014adSDavid Stevens 		len = head->len;
1007df8a39deSJiri Pirko 		if (skb_vlan_tag_present(head))
1008c53cff5eSBasil Gor 			len += VLAN_HLEN;
1009c53cff5eSBasil Gor 	}
1010c53cff5eSBasil Gor 
1011783e3988SJason Wang 	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
10128dd014adSDavid Stevens 	return len;
10138dd014adSDavid Stevens }
10148dd014adSDavid Stevens 
1015be294a51SToshiaki Makita static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
1016be294a51SToshiaki Makita 				      bool *busyloop_intr)
101703088137SJason Wang {
101828b9b33bSToshiaki Makita 	struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
101928b9b33bSToshiaki Makita 	struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
10206369fec5SToshiaki Makita 	struct vhost_virtqueue *rvq = &rnvq->vq;
102128b9b33bSToshiaki Makita 	struct vhost_virtqueue *tvq = &tnvq->vq;
102228b9b33bSToshiaki Makita 	int len = peek_head_len(rnvq, sk);
102303088137SJason Wang 
1024dc151282STonghao Zhang 	if (!len && rvq->busyloop_timeout) {
1025f5a4941aSJason Wang 		/* Flush batched heads first */
102609c32489SJason Wang 		vhost_net_signal_used(rnvq);
102703088137SJason Wang 		/* Both tx vq and rx socket were polled here */
1028dc151282STonghao Zhang 		vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
102903088137SJason Wang 
103028b9b33bSToshiaki Makita 		len = peek_head_len(rnvq, sk);
103103088137SJason Wang 	}
103203088137SJason Wang 
103303088137SJason Wang 	return len;
103403088137SJason Wang }
103503088137SJason Wang 
10368dd014adSDavid Stevens /* This is a multi-buffer version of vhost_get_desc, that works if
10378dd014adSDavid Stevens  *	vq has read descriptors only.
10388dd014adSDavid Stevens  * @vq		- the relevant virtqueue
10398dd014adSDavid Stevens  * @datalen	- data length we'll be reading
10408dd014adSDavid Stevens  * @iovcount	- returned count of io vectors we fill
10418dd014adSDavid Stevens  * @log		- vhost log
10428dd014adSDavid Stevens  * @log_num	- log offset
104394249369SJason Wang  * @quota       - headcount quota, 1 for big buffer
10448dd014adSDavid Stevens  *	returns number of buffer heads allocated, negative on error
10458dd014adSDavid Stevens  */
10468dd014adSDavid Stevens static int get_rx_bufs(struct vhost_virtqueue *vq,
10478dd014adSDavid Stevens 		       struct vring_used_elem *heads,
10488dd014adSDavid Stevens 		       int datalen,
10498dd014adSDavid Stevens 		       unsigned *iovcount,
10508dd014adSDavid Stevens 		       struct vhost_log *log,
105194249369SJason Wang 		       unsigned *log_num,
105294249369SJason Wang 		       unsigned int quota)
10538dd014adSDavid Stevens {
10548dd014adSDavid Stevens 	unsigned int out, in;
10558dd014adSDavid Stevens 	int seg = 0;
10568dd014adSDavid Stevens 	int headcount = 0;
10578dd014adSDavid Stevens 	unsigned d;
10588dd014adSDavid Stevens 	int r, nlogs = 0;
10598b38694aSMichael S. Tsirkin 	/* len is always initialized before use since we are always called with
10608b38694aSMichael S. Tsirkin 	 * datalen > 0.
10618b38694aSMichael S. Tsirkin 	 */
10623f649ab7SKees Cook 	u32 len;
10638dd014adSDavid Stevens 
106494249369SJason Wang 	while (datalen > 0 && headcount < quota) {
1065e0e9b406SJason Wang 		if (unlikely(seg >= UIO_MAXIOV)) {
10668dd014adSDavid Stevens 			r = -ENOBUFS;
10678dd014adSDavid Stevens 			goto err;
10688dd014adSDavid Stevens 		}
106947283befSMichael S. Tsirkin 		r = vhost_get_vq_desc(vq, vq->iov + seg,
10708dd014adSDavid Stevens 				      ARRAY_SIZE(vq->iov) - seg, &out,
10718dd014adSDavid Stevens 				      &in, log, log_num);
1072a39ee449SMichael S. Tsirkin 		if (unlikely(r < 0))
1073a39ee449SMichael S. Tsirkin 			goto err;
1074a39ee449SMichael S. Tsirkin 
1075a39ee449SMichael S. Tsirkin 		d = r;
10768dd014adSDavid Stevens 		if (d == vq->num) {
10778dd014adSDavid Stevens 			r = 0;
10788dd014adSDavid Stevens 			goto err;
10798dd014adSDavid Stevens 		}
10808dd014adSDavid Stevens 		if (unlikely(out || in <= 0)) {
10818dd014adSDavid Stevens 			vq_err(vq, "unexpected descriptor format for RX: "
10828dd014adSDavid Stevens 				"out %d, in %d\n", out, in);
10838dd014adSDavid Stevens 			r = -EINVAL;
10848dd014adSDavid Stevens 			goto err;
10858dd014adSDavid Stevens 		}
10868dd014adSDavid Stevens 		if (unlikely(log)) {
10878dd014adSDavid Stevens 			nlogs += *log_num;
10888dd014adSDavid Stevens 			log += *log_num;
10898dd014adSDavid Stevens 		}
10908b38694aSMichael S. Tsirkin 		heads[headcount].id = cpu_to_vhost32(vq, d);
10918b38694aSMichael S. Tsirkin 		len = iov_length(vq->iov + seg, in);
10928b38694aSMichael S. Tsirkin 		heads[headcount].len = cpu_to_vhost32(vq, len);
10938b38694aSMichael S. Tsirkin 		datalen -= len;
10948dd014adSDavid Stevens 		++headcount;
10958dd014adSDavid Stevens 		seg += in;
10968dd014adSDavid Stevens 	}
109799975cc6SMichael S. Tsirkin 	heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
10988dd014adSDavid Stevens 	*iovcount = seg;
10998dd014adSDavid Stevens 	if (unlikely(log))
11008dd014adSDavid Stevens 		*log_num = nlogs;
1101d8316f39SMichael S. Tsirkin 
1102d8316f39SMichael S. Tsirkin 	/* Detect overrun */
1103d8316f39SMichael S. Tsirkin 	if (unlikely(datalen > 0)) {
1104d8316f39SMichael S. Tsirkin 		r = UIO_MAXIOV + 1;
1105d8316f39SMichael S. Tsirkin 		goto err;
1106d8316f39SMichael S. Tsirkin 	}
11078dd014adSDavid Stevens 	return headcount;
11088dd014adSDavid Stevens err:
11098dd014adSDavid Stevens 	vhost_discard_vq_desc(vq, headcount);
11108dd014adSDavid Stevens 	return r;
11118dd014adSDavid Stevens }
11128dd014adSDavid Stevens 
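/* A worked example of the trimming arithmetic in get_rx_bufs() (illustrative
 * numbers only, not taken from this file). With datalen = 4000 and descriptor
 * chains of 1500, 1500 and 4096 bytes:
 *
 *   iteration 1: len = 1500, datalen = 4000 - 1500 = 2500, headcount = 1
 *   iteration 2: len = 1500, datalen = 2500 - 1500 = 1000, headcount = 2
 *   iteration 3: len = 4096, datalen = 1000 - 4096 = -3096, headcount = 3
 *
 * The function then rewrites heads[2].len to len + datalen = 4096 - 3096 =
 * 1000, so the last head only claims the bytes the packet actually needs. If
 * the quota runs out while datalen is still positive, the overrun is reported
 * as UIO_MAXIOV + 1, which handle_rx() below turns into a MSG_TRUNC receive
 * and a discard.
 */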
11133a4d5c94SMichael S. Tsirkin /* Expects to be always run from workqueue - which acts as
11143a4d5c94SMichael S. Tsirkin  * read-size critical section for our kind of RCU. */
111594249369SJason Wang static void handle_rx(struct vhost_net *net)
11163a4d5c94SMichael S. Tsirkin {
111781f95a55SMichael S. Tsirkin 	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
111881f95a55SMichael S. Tsirkin 	struct vhost_virtqueue *vq = &nvq->vq;
11193f649ab7SKees Cook 	unsigned in, log;
11208dd014adSDavid Stevens 	struct vhost_log *vq_log;
11218dd014adSDavid Stevens 	struct msghdr msg = {
11228dd014adSDavid Stevens 		.msg_name = NULL,
11238dd014adSDavid Stevens 		.msg_namelen = 0,
11248dd014adSDavid Stevens 		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
11258dd014adSDavid Stevens 		.msg_controllen = 0,
11268dd014adSDavid Stevens 		.msg_flags = MSG_DONTWAIT,
11278dd014adSDavid Stevens 	};
11280960b641SJason Wang 	struct virtio_net_hdr hdr = {
11290960b641SJason Wang 		.flags = 0,
11300960b641SJason Wang 		.gso_type = VIRTIO_NET_HDR_GSO_NONE
11318dd014adSDavid Stevens 	};
11328dd014adSDavid Stevens 	size_t total_len = 0;
1133910a578fSMichael S. Tsirkin 	int err, mergeable;
1134f5a4941aSJason Wang 	s16 headcount;
11358dd014adSDavid Stevens 	size_t vhost_hlen, sock_hlen;
11368dd014adSDavid Stevens 	size_t vhost_len, sock_len;
1137be294a51SToshiaki Makita 	bool busyloop_intr = false;
11382e26af79SAsias He 	struct socket *sock;
1139ba7438aeSAl Viro 	struct iov_iter fixup;
11400960b641SJason Wang 	__virtio16 num_buffers;
1141db688c24SPaolo Abeni 	int recv_pkts = 0;
11428dd014adSDavid Stevens 
1143a6a67a2fSTonghao Zhang 	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
1144247643f8SEugenio Pérez 	sock = vhost_vq_get_backend(vq);
11452e26af79SAsias He 	if (!sock)
11462e26af79SAsias He 		goto out;
11476b1e6cc7SJason Wang 
11489b5e830bSJason Wang 	if (!vq_meta_prefetch(vq))
11496b1e6cc7SJason Wang 		goto out;
11506b1e6cc7SJason Wang 
11518ea8cf89SMichael S. Tsirkin 	vhost_disable_notify(&net->dev, vq);
11528241a1e4SJason Wang 	vhost_net_disable_vq(net, vq);
11532e26af79SAsias He 
115481f95a55SMichael S. Tsirkin 	vhost_hlen = nvq->vhost_hlen;
115581f95a55SMichael S. Tsirkin 	sock_hlen = nvq->sock_hlen;
11568dd014adSDavid Stevens 
1157ea16c514SMichael S. Tsirkin 	vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
11588dd014adSDavid Stevens 		vq->log : NULL;
1159ea16c514SMichael S. Tsirkin 	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
11608dd014adSDavid Stevens 
1161e2412c07SJason Wang 	do {
1162e2412c07SJason Wang 		sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
1163e2412c07SJason Wang 						      &busyloop_intr);
1164e2412c07SJason Wang 		if (!sock_len)
1165e2412c07SJason Wang 			break;
11668dd014adSDavid Stevens 		sock_len += sock_hlen;
11678dd014adSDavid Stevens 		vhost_len = sock_len + vhost_hlen;
1168f5a4941aSJason Wang 		headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
1169f5a4941aSJason Wang 					vhost_len, &in, vq_log, &log,
117094249369SJason Wang 					likely(mergeable) ? UIO_MAXIOV : 1);
11718dd014adSDavid Stevens 		/* On error, stop handling until the next kick. */
11728dd014adSDavid Stevens 		if (unlikely(headcount < 0))
11738241a1e4SJason Wang 			goto out;
11748dd014adSDavid Stevens 		/* OK, now we need to know about added descriptors. */
11758dd014adSDavid Stevens 		if (!headcount) {
11766369fec5SToshiaki Makita 			if (unlikely(busyloop_intr)) {
11776369fec5SToshiaki Makita 				vhost_poll_queue(&vq->poll);
11786369fec5SToshiaki Makita 			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
11798dd014adSDavid Stevens 				/* They have slipped one in as we were
11808dd014adSDavid Stevens 				 * doing that: check again. */
11818ea8cf89SMichael S. Tsirkin 				vhost_disable_notify(&net->dev, vq);
11828dd014adSDavid Stevens 				continue;
11838dd014adSDavid Stevens 			}
11848dd014adSDavid Stevens 			/* Nothing new?  Wait for eventfd to tell us
11858dd014adSDavid Stevens 			 * they refilled. */
11868241a1e4SJason Wang 			goto out;
11878dd014adSDavid Stevens 		}
11886369fec5SToshiaki Makita 		busyloop_intr = false;
11895990a305SJason Wang 		if (nvq->rx_ring)
11906e474083SWei Xu 			msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
11916e474083SWei Xu 		/* On overrun, truncate and discard */
11926e474083SWei Xu 		if (unlikely(headcount > UIO_MAXIOV)) {
1193de4eda9dSAl Viro 			iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1);
11946e474083SWei Xu 			err = sock->ops->recvmsg(sock, &msg,
11956e474083SWei Xu 						 1, MSG_DONTWAIT | MSG_TRUNC);
11966e474083SWei Xu 			pr_debug("Discarded rx packet: len %zd\n", sock_len);
11976e474083SWei Xu 			continue;
11986e474083SWei Xu 		}
11998dd014adSDavid Stevens 		/* We don't need to be notified again. */
1200de4eda9dSAl Viro 		iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len);
1201ba7438aeSAl Viro 		fixup = msg.msg_iter;
1202ba7438aeSAl Viro 		if (unlikely((vhost_hlen))) {
1203ba7438aeSAl Viro 			/* We will supply the header ourselves
1204ba7438aeSAl Viro 			 * TODO: support TSO.
1205ba7438aeSAl Viro 			 */
1206ba7438aeSAl Viro 			iov_iter_advance(&msg.msg_iter, vhost_hlen);
1207ba7438aeSAl Viro 		}
12081b784140SYing Xue 		err = sock->ops->recvmsg(sock, &msg,
12098dd014adSDavid Stevens 					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
12108dd014adSDavid Stevens 		/* Userspace might have consumed the packet meanwhile:
12118dd014adSDavid Stevens 		 * it's not supposed to do this usually, but might be hard
12128dd014adSDavid Stevens 		 * to prevent. Discard data we got (if any) and keep going. */
12138dd014adSDavid Stevens 		if (unlikely(err != sock_len)) {
12148dd014adSDavid Stevens 			pr_debug("Discarded rx packet: "
12158dd014adSDavid Stevens 				 " len %d, expected %zd\n", err, sock_len);
12168dd014adSDavid Stevens 			vhost_discard_vq_desc(vq, headcount);
12178dd014adSDavid Stevens 			continue;
12188dd014adSDavid Stevens 		}
1219ba7438aeSAl Viro 		/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
12204c5a8442SMichael S. Tsirkin 		if (unlikely(vhost_hlen)) {
12214c5a8442SMichael S. Tsirkin 			if (copy_to_iter(&hdr, sizeof(hdr),
12224c5a8442SMichael S. Tsirkin 					 &fixup) != sizeof(hdr)) {
12234c5a8442SMichael S. Tsirkin 				vq_err(vq, "Unable to write vnet_hdr "
12244c5a8442SMichael S. Tsirkin 				       "at addr %p\n", vq->iov->iov_base);
12258241a1e4SJason Wang 				goto out;
12268dd014adSDavid Stevens 			}
12274c5a8442SMichael S. Tsirkin 		} else {
12284c5a8442SMichael S. Tsirkin 			/* Header came from socket; we'll need to patch
12294c5a8442SMichael S. Tsirkin 			 * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
12304c5a8442SMichael S. Tsirkin 			 */
12314c5a8442SMichael S. Tsirkin 			iov_iter_advance(&fixup, sizeof(hdr));
12324c5a8442SMichael S. Tsirkin 		}
12338dd014adSDavid Stevens 		/* TODO: Should check and handle checksum. */
12345201aa49SMichael S. Tsirkin 
12350960b641SJason Wang 		num_buffers = cpu_to_vhost16(vq, headcount);
1236cfbdab95SJason Wang 		if (likely(mergeable) &&
12370d79a493SMichael S. Tsirkin 		    copy_to_iter(&num_buffers, sizeof num_buffers,
12380d79a493SMichael S. Tsirkin 				 &fixup) != sizeof num_buffers) {
12398dd014adSDavid Stevens 			vq_err(vq, "Failed num_buffers write");
12408dd014adSDavid Stevens 			vhost_discard_vq_desc(vq, headcount);
12418241a1e4SJason Wang 			goto out;
12428dd014adSDavid Stevens 		}
1243f5a4941aSJason Wang 		nvq->done_idx += headcount;
1244d0d86971SJason Wang 		if (nvq->done_idx > VHOST_NET_BATCH)
124509c32489SJason Wang 			vhost_net_signal_used(nvq);
12468dd014adSDavid Stevens 		if (unlikely(vq_log))
1247cc5e7107SJason Wang 			vhost_log_write(vq, vq_log, log, vhost_len,
1248cc5e7107SJason Wang 					vq->iov, in);
12498dd014adSDavid Stevens 		total_len += vhost_len;
1250e2412c07SJason Wang 	} while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
1251e2412c07SJason Wang 
1252be294a51SToshiaki Makita 	if (unlikely(busyloop_intr))
1253be294a51SToshiaki Makita 		vhost_poll_queue(&vq->poll);
1254e2412c07SJason Wang 	else if (!sock_len)
12558241a1e4SJason Wang 		vhost_net_enable_vq(net, vq);
12562e26af79SAsias He out:
125709c32489SJason Wang 	vhost_net_signal_used(nvq);
12588dd014adSDavid Stevens 	mutex_unlock(&vq->mutex);
12598dd014adSDavid Stevens }
12608dd014adSDavid Stevens 
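/* For reference (a sketch, not code from this driver): with
 * VIRTIO_NET_F_MRG_RXBUF the header the guest reads has the layout from
 * include/uapi/linux/virtio_net.h:
 *
 *   struct virtio_net_hdr_mrg_rxbuf {
 *           struct virtio_net_hdr hdr;
 *           __virtio16 num_buffers;
 *   };
 *
 * handle_rx() keeps a second iterator ("fixup") over the same guest buffers so
 * that, after recvmsg() has filled them, it can supply the header itself when
 * VHOST_NET_F_VIRTIO_NET_HDR is negotiated and patch num_buffers to the
 * headcount returned by get_rx_bufs(), all without disturbing msg.msg_iter.
 */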
1261c23f3445STejun Heo static void handle_tx_kick(struct vhost_work *work)
12623a4d5c94SMichael S. Tsirkin {
1263c23f3445STejun Heo 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
1264c23f3445STejun Heo 						  poll.work);
1265c23f3445STejun Heo 	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
1266c23f3445STejun Heo 
12673a4d5c94SMichael S. Tsirkin 	handle_tx(net);
12683a4d5c94SMichael S. Tsirkin }
12693a4d5c94SMichael S. Tsirkin 
1270c23f3445STejun Heo static void handle_rx_kick(struct vhost_work *work)
12713a4d5c94SMichael S. Tsirkin {
1272c23f3445STejun Heo 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
1273c23f3445STejun Heo 						  poll.work);
1274c23f3445STejun Heo 	struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
1275c23f3445STejun Heo 
12763a4d5c94SMichael S. Tsirkin 	handle_rx(net);
12773a4d5c94SMichael S. Tsirkin }
12783a4d5c94SMichael S. Tsirkin 
1279c23f3445STejun Heo static void handle_tx_net(struct vhost_work *work)
12803a4d5c94SMichael S. Tsirkin {
1281c23f3445STejun Heo 	struct vhost_net *net = container_of(work, struct vhost_net,
1282c23f3445STejun Heo 					     poll[VHOST_NET_VQ_TX].work);
12833a4d5c94SMichael S. Tsirkin 	handle_tx(net);
12843a4d5c94SMichael S. Tsirkin }
12853a4d5c94SMichael S. Tsirkin 
1286c23f3445STejun Heo static void handle_rx_net(struct vhost_work *work)
12873a4d5c94SMichael S. Tsirkin {
1288c23f3445STejun Heo 	struct vhost_net *net = container_of(work, struct vhost_net,
1289c23f3445STejun Heo 					     poll[VHOST_NET_VQ_RX].work);
12903a4d5c94SMichael S. Tsirkin 	handle_rx(net);
12913a4d5c94SMichael S. Tsirkin }
12923a4d5c94SMichael S. Tsirkin 
12933a4d5c94SMichael S. Tsirkin static int vhost_net_open(struct inode *inode, struct file *f)
12943a4d5c94SMichael S. Tsirkin {
129523cc5a99SMichael S. Tsirkin 	struct vhost_net *n;
1296c23f3445STejun Heo 	struct vhost_dev *dev;
12973ab2e420SAsias He 	struct vhost_virtqueue **vqs;
12985990a305SJason Wang 	void **queue;
12990a0be13bSJason Wang 	struct xdp_buff *xdp;
130059566b6eSZhi Yong Wu 	int i;
1301c23f3445STejun Heo 
1302dcda9b04SMichal Hocko 	n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
13033a4d5c94SMichael S. Tsirkin 	if (!n)
13043a4d5c94SMichael S. Tsirkin 		return -ENOMEM;
13056da2ec56SKees Cook 	vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
13063ab2e420SAsias He 	if (!vqs) {
1307d04257b0SRomain Francoise 		kvfree(n);
13083ab2e420SAsias He 		return -ENOMEM;
13093ab2e420SAsias He 	}
1310c23f3445STejun Heo 
1311d0d86971SJason Wang 	queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
1312c67df11fSJason Wang 			      GFP_KERNEL);
1313c67df11fSJason Wang 	if (!queue) {
1314c67df11fSJason Wang 		kfree(vqs);
1315c67df11fSJason Wang 		kvfree(n);
1316c67df11fSJason Wang 		return -ENOMEM;
1317c67df11fSJason Wang 	}
1318c67df11fSJason Wang 	n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
1319c67df11fSJason Wang 
13200a0be13bSJason Wang 	xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
13210a0be13bSJason Wang 	if (!xdp) {
13220a0be13bSJason Wang 		kfree(vqs);
13230a0be13bSJason Wang 		kvfree(n);
13240a0be13bSJason Wang 		kfree(queue);
13258a1aff14SDan Carpenter 		return -ENOMEM;
13260a0be13bSJason Wang 	}
13270a0be13bSJason Wang 	n->vqs[VHOST_NET_VQ_TX].xdp = xdp;
13280a0be13bSJason Wang 
1329c23f3445STejun Heo 	dev = &n->dev;
13303ab2e420SAsias He 	vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
13313ab2e420SAsias He 	vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
13323ab2e420SAsias He 	n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
13333ab2e420SAsias He 	n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
13342839400fSAsias He 	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
13352839400fSAsias He 		n->vqs[i].ubufs = NULL;
13362839400fSAsias He 		n->vqs[i].ubuf_info = NULL;
13372839400fSAsias He 		n->vqs[i].upend_idx = 0;
13382839400fSAsias He 		n->vqs[i].done_idx = 0;
13390a0be13bSJason Wang 		n->vqs[i].batched_xdp = 0;
134081f95a55SMichael S. Tsirkin 		n->vqs[i].vhost_hlen = 0;
134181f95a55SMichael S. Tsirkin 		n->vqs[i].sock_hlen = 0;
1342ab7e34b3SAlexander Potapenko 		n->vqs[i].rx_ring = NULL;
1343c67df11fSJason Wang 		vhost_net_buf_init(&n->vqs[i].rxq);
13442839400fSAsias He 	}
1345b46a0bf7SJason Wang 	vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
1346e82b9b07SJason Wang 		       UIO_MAXIOV + VHOST_NET_BATCH,
134701fcb1cbSJason Wang 		       VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
1348792a4f2eSJason Wang 		       NULL);
13493a4d5c94SMichael S. Tsirkin 
1350a9a08845SLinus Torvalds 	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
1351a9a08845SLinus Torvalds 			vqs[VHOST_NET_VQ_TX]);
13523a4d5c94SMichael S. Tsirkin 	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
13533a4d5c94SMichael S. Tsirkin 			vqs[VHOST_NET_VQ_RX]);
1354e4dab1e6SJason Wang 
1355e4dab1e6SJason Wang 	f->private_data = n;
13563a4d5c94SMichael S. Tsirkin 	n->page_frag.page = NULL;
13573a4d5c94SMichael S. Tsirkin 	n->refcnt_bias = 0;
13583a4d5c94SMichael S. Tsirkin 
13593a4d5c94SMichael S. Tsirkin 	return 0;
13603a4d5c94SMichael S. Tsirkin }
13613a4d5c94SMichael S. Tsirkin 
13623a4d5c94SMichael S. Tsirkin static struct socket *vhost_net_stop_vq(struct vhost_net *n,
13633a4d5c94SMichael S. Tsirkin 					struct vhost_virtqueue *vq)
1364c67df11fSJason Wang {
1365c67df11fSJason Wang 	struct socket *sock;
13663a4d5c94SMichael S. Tsirkin 	struct vhost_net_virtqueue *nvq =
13673a4d5c94SMichael S. Tsirkin 		container_of(vq, struct vhost_net_virtqueue, vq);
1368247643f8SEugenio Pérez 
13693a4d5c94SMichael S. Tsirkin 	mutex_lock(&vq->mutex);
1370247643f8SEugenio Pérez 	sock = vhost_vq_get_backend(vq);
1371c67df11fSJason Wang 	vhost_net_disable_vq(n, vq);
1372303fd71bSJason Wang 	vhost_vq_set_backend(vq, NULL);
13733a4d5c94SMichael S. Tsirkin 	vhost_net_buf_unproduce(nvq);
13743a4d5c94SMichael S. Tsirkin 	nvq->rx_ring = NULL;
13753a4d5c94SMichael S. Tsirkin 	mutex_unlock(&vq->mutex);
13763a4d5c94SMichael S. Tsirkin 	return sock;
13773a4d5c94SMichael S. Tsirkin }
13783a4d5c94SMichael S. Tsirkin 
13793a4d5c94SMichael S. Tsirkin static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
13803ab2e420SAsias He 			   struct socket **rx_sock)
13813ab2e420SAsias He {
13823a4d5c94SMichael S. Tsirkin 	*tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
13833a4d5c94SMichael S. Tsirkin 	*rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
13843a4d5c94SMichael S. Tsirkin }
13853a4d5c94SMichael S. Tsirkin 
1386b2ffa407SMike Christie static void vhost_net_flush(struct vhost_net *n)
13872839400fSAsias He {
13883ab2e420SAsias He 	vhost_dev_flush(&n->dev);
13891280c27fSMichael S. Tsirkin 	if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
13903ab2e420SAsias He 		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13911280c27fSMichael S. Tsirkin 		n->tx_flush = true;
1392fe729a57SAsias He 		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13933ab2e420SAsias He 		/* Wait for all lower device DMAs done. */
13941280c27fSMichael S. Tsirkin 		vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
13950ad8b480SMichael S. Tsirkin 		mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13963ab2e420SAsias He 		n->tx_flush = false;
13971280c27fSMichael S. Tsirkin 		atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
13983a4d5c94SMichael S. Tsirkin 		mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13993a4d5c94SMichael S. Tsirkin 	}
14003a4d5c94SMichael S. Tsirkin }
14013a4d5c94SMichael S. Tsirkin 
14023a4d5c94SMichael S. Tsirkin static int vhost_net_release(struct inode *inode, struct file *f)
14033a4d5c94SMichael S. Tsirkin {
14043a4d5c94SMichael S. Tsirkin 	struct vhost_net *n = f->private_data;
14053a4d5c94SMichael S. Tsirkin 	struct socket *tx_sock;
14063a4d5c94SMichael S. Tsirkin 	struct socket *rx_sock;
14073a4d5c94SMichael S. Tsirkin 
1408b211616dSMichael S. Tsirkin 	vhost_net_stop(n, &tx_sock, &rx_sock);
1409f6f93f75S夷则(Caspar) 	vhost_net_flush(n);
141081f95a55SMichael S. Tsirkin 	vhost_dev_stop(&n->dev);
14113a4d5c94SMichael S. Tsirkin 	vhost_dev_cleanup(&n->dev);
141209aaacf0SAl Viro 	vhost_net_vq_reset(n);
14133a4d5c94SMichael S. Tsirkin 	if (tx_sock)
141409aaacf0SAl Viro 		sockfd_put(tx_sock);
1415b0c057caSMichael S. Tsirkin 	if (rx_sock)
1416d05faa5fSPaul E. McKenney 		sockfd_put(rx_sock);
14173a4d5c94SMichael S. Tsirkin 	/* Make sure no callbacks are outstanding */
14183a4d5c94SMichael S. Tsirkin 	synchronize_rcu();
14193a4d5c94SMichael S. Tsirkin 	/* We do an extra flush before freeing memory,
1420c67df11fSJason Wang 	 * since jobs can re-queue themselves. */
14210a0be13bSJason Wang 	vhost_net_flush(n);
14223ab2e420SAsias He 	kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
1423e4dab1e6SJason Wang 	kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
1424e4dab1e6SJason Wang 	kfree(n->dev.vqs);
1425d04257b0SRomain Francoise 	if (n->page_frag.page)
14263a4d5c94SMichael S. Tsirkin 		__page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
14273a4d5c94SMichael S. Tsirkin 	kvfree(n);
14283a4d5c94SMichael S. Tsirkin 	return 0;
14293a4d5c94SMichael S. Tsirkin }
14303a4d5c94SMichael S. Tsirkin 
14319b2c45d4SDenys Vlasenko static struct socket *get_raw_socket(int fd)
14323a4d5c94SMichael S. Tsirkin {
1433d47effe1SKrishna Kumar 	int r;
14343a4d5c94SMichael S. Tsirkin 	struct socket *sock = sockfd_lookup(fd, &r);
14353a4d5c94SMichael S. Tsirkin 
14363a4d5c94SMichael S. Tsirkin 	if (!sock)
14373a4d5c94SMichael S. Tsirkin 		return ERR_PTR(-ENOTSOCK);
14383a4d5c94SMichael S. Tsirkin 
14393a4d5c94SMichael S. Tsirkin 	/* Parameter checking */
14403a4d5c94SMichael S. Tsirkin 	if (sock->sk->sk_type != SOCK_RAW) {
14413a4d5c94SMichael S. Tsirkin 		r = -ESOCKTNOSUPPORT;
14423a4d5c94SMichael S. Tsirkin 		goto err;
144342d84c84SEugenio Pérez 	}
14443a4d5c94SMichael S. Tsirkin 
14453a4d5c94SMichael S. Tsirkin 	if (sock->sk->sk_family != AF_PACKET) {
14463a4d5c94SMichael S. Tsirkin 		r = -EPFNOSUPPORT;
14473a4d5c94SMichael S. Tsirkin 		goto err;
14483a4d5c94SMichael S. Tsirkin 	}
144909aaacf0SAl Viro 	return sock;
14503a4d5c94SMichael S. Tsirkin err:
14513a4d5c94SMichael S. Tsirkin 	sockfd_put(sock);
14523a4d5c94SMichael S. Tsirkin 	return ERR_PTR(r);
1453fb4554c2SAl Viro }
1454c67df11fSJason Wang 
14555990a305SJason Wang static struct ptr_ring *get_tap_ptr_ring(struct file *file)
14565990a305SJason Wang {
14575990a305SJason Wang 	struct ptr_ring *ring;
1458c67df11fSJason Wang 	ring = tun_get_tx_ring(file);
14595990a305SJason Wang 	if (!IS_ERR(ring))
14605990a305SJason Wang 		goto out;
1461c67df11fSJason Wang 	ring = tap_get_ptr_ring(file);
14625990a305SJason Wang 	if (!IS_ERR(ring))
1463c67df11fSJason Wang 		goto out;
14645990a305SJason Wang 	ring = NULL;
1465c67df11fSJason Wang out:
1466c67df11fSJason Wang 	return ring;
1467501c774cSArnd Bergmann }
14683a4d5c94SMichael S. Tsirkin 
14693a4d5c94SMichael S. Tsirkin static struct socket *get_tap_socket(int fd)
14703a4d5c94SMichael S. Tsirkin {
1471d47effe1SKrishna Kumar 	struct file *file = fget(fd);
14723a4d5c94SMichael S. Tsirkin 	struct socket *sock;
14733a4d5c94SMichael S. Tsirkin 
14743a4d5c94SMichael S. Tsirkin 	if (!file)
1475501c774cSArnd Bergmann 		return ERR_PTR(-EBADF);
1476501c774cSArnd Bergmann 	sock = tun_get_socket(file);
1477635b8c8eSSainath Grandhi 	if (!IS_ERR(sock))
14783a4d5c94SMichael S. Tsirkin 		return sock;
14793a4d5c94SMichael S. Tsirkin 	sock = tap_get_socket(file);
14803a4d5c94SMichael S. Tsirkin 	if (IS_ERR(sock))
14813a4d5c94SMichael S. Tsirkin 		fput(file);
14823a4d5c94SMichael S. Tsirkin 	return sock;
14833a4d5c94SMichael S. Tsirkin }
14843a4d5c94SMichael S. Tsirkin 
14853a4d5c94SMichael S. Tsirkin static struct socket *get_socket(int fd)
1486d47effe1SKrishna Kumar {
14873a4d5c94SMichael S. Tsirkin 	struct socket *sock;
14883a4d5c94SMichael S. Tsirkin 
14893a4d5c94SMichael S. Tsirkin 	/* special case to disable backend */
14903a4d5c94SMichael S. Tsirkin 	if (fd == -1)
14913a4d5c94SMichael S. Tsirkin 		return NULL;
14923a4d5c94SMichael S. Tsirkin 	sock = get_raw_socket(fd);
1493501c774cSArnd Bergmann 	if (!IS_ERR(sock))
14943a4d5c94SMichael S. Tsirkin 		return sock;
14953a4d5c94SMichael S. Tsirkin 	sock = get_tap_socket(fd);
14963a4d5c94SMichael S. Tsirkin 	if (!IS_ERR(sock))
14973a4d5c94SMichael S. Tsirkin 		return sock;
14983a4d5c94SMichael S. Tsirkin 	return ERR_PTR(-ENOTSOCK);
14993a4d5c94SMichael S. Tsirkin }
15003a4d5c94SMichael S. Tsirkin 
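/* Illustrative userspace sketch (assumed setup, not part of this file): a
 * backend fd that get_tap_socket() accepts is usually a tap device opened
 * along these lines ("tap0" and the vnet header size are placeholders):
 *
 *   int tap_fd = open("/dev/net/tun", O_RDWR);
 *   struct ifreq ifr = {0};
 *   ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
 *   strncpy(ifr.ifr_name, "tap0", IFNAMSIZ - 1);
 *   ioctl(tap_fd, TUNSETIFF, &ifr);
 *   int hdr_sz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 *   ioctl(tap_fd, TUNSETVNETHDRSZ, &hdr_sz);
 *
 * A macvtap character device fd (handled by tap_get_socket()) or an AF_PACKET
 * SOCK_RAW socket (handled by get_raw_socket() above) is accepted as well.
 */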
15013a4d5c94SMichael S. Tsirkin static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
15023a4d5c94SMichael S. Tsirkin {
15032839400fSAsias He 	struct socket *sock, *oldsock;
1504fe729a57SAsias He 	struct vhost_virtqueue *vq;
15053a4d5c94SMichael S. Tsirkin 	struct vhost_net_virtqueue *nvq;
15063a4d5c94SMichael S. Tsirkin 	struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
15073a4d5c94SMichael S. Tsirkin 	int r;
15083a4d5c94SMichael S. Tsirkin 
15093a4d5c94SMichael S. Tsirkin 	mutex_lock(&n->dev.mutex);
15103a4d5c94SMichael S. Tsirkin 	r = vhost_dev_check_owner(&n->dev);
15113a4d5c94SMichael S. Tsirkin 	if (r)
15123a4d5c94SMichael S. Tsirkin 		goto err;
15133a4d5c94SMichael S. Tsirkin 
15143a4d5c94SMichael S. Tsirkin 	if (index >= VHOST_NET_VQ_MAX) {
15153a4d5c94SMichael S. Tsirkin 		r = -ENOBUFS;
15163ab2e420SAsias He 		goto err;
15172839400fSAsias He 	}
15183a4d5c94SMichael S. Tsirkin 	vq = &n->vqs[index].vq;
15193a4d5c94SMichael S. Tsirkin 	nvq = &n->vqs[index];
15209526f9a2SEric Auger 	mutex_lock(&vq->mutex);
15219526f9a2SEric Auger 
15229526f9a2SEric Auger 	if (fd == -1)
15233a4d5c94SMichael S. Tsirkin 		vhost_clear_msg(&n->dev);
15243a4d5c94SMichael S. Tsirkin 
15253a4d5c94SMichael S. Tsirkin 	/* Verify that ring has been setup correctly. */
15261dace8c8SJeff Dike 	if (!vhost_vq_access_ok(vq)) {
15273a4d5c94SMichael S. Tsirkin 		r = -EFAULT;
15283a4d5c94SMichael S. Tsirkin 		goto err_vq;
15293a4d5c94SMichael S. Tsirkin 	}
15303a4d5c94SMichael S. Tsirkin 	sock = get_socket(fd);
15311dace8c8SJeff Dike 	if (IS_ERR(sock)) {
15323a4d5c94SMichael S. Tsirkin 		r = PTR_ERR(sock);
15333a4d5c94SMichael S. Tsirkin 		goto err_vq;
15343a4d5c94SMichael S. Tsirkin 	}
1535247643f8SEugenio Pérez 
1536dd1f4078SJeff Dike 	/* start polling new socket */
1537fe729a57SAsias He 	oldsock = vhost_vq_get_backend(vq);
1538fe729a57SAsias He 	if (sock != oldsock) {
1539bab632d6SMichael S. Tsirkin 		ubufs = vhost_net_ubuf_alloc(vq,
1540bab632d6SMichael S. Tsirkin 					     sock && vhost_sock_zcopy(sock));
1541bab632d6SMichael S. Tsirkin 		if (IS_ERR(ubufs)) {
1542bab632d6SMichael S. Tsirkin 			r = PTR_ERR(ubufs);
1543692a998bSJason Wang 			goto err_ubufs;
15443a4d5c94SMichael S. Tsirkin 		}
1545247643f8SEugenio Pérez 
1546c67df11fSJason Wang 		vhost_net_disable_vq(n, vq);
154780f7d030SGreg Kurz 		vhost_vq_set_backend(vq, sock);
1548f59281daSJason Wang 		vhost_net_buf_unproduce(nvq);
1549692a998bSJason Wang 		r = vhost_vq_init_access(vq);
15502b8b328bSJason Wang 		if (r)
15512b8b328bSJason Wang 			goto err_used;
15522b8b328bSJason Wang 		r = vhost_net_enable_vq(n, vq);
1553fb4554c2SAl Viro 		if (r)
1554fb4554c2SAl Viro 			goto err_used;
1555fb4554c2SAl Viro 		if (index == VHOST_NET_VQ_RX) {
1556fb4554c2SAl Viro 			if (sock)
1557fb4554c2SAl Viro 				nvq->rx_ring = get_tap_ptr_ring(sock->file);
1558fb4554c2SAl Viro 			else
1559692a998bSJason Wang 				nvq->rx_ring = NULL;
15602839400fSAsias He 		}
15612839400fSAsias He 
156264e9a9b8SMichael S. Tsirkin 		oldubufs = nvq->ubufs;
156364e9a9b8SMichael S. Tsirkin 		nvq->ubufs = ubufs;
156464e9a9b8SMichael S. Tsirkin 
15651280c27fSMichael S. Tsirkin 		n->tx_packets = 0;
1566dd1f4078SJeff Dike 		n->tx_zcopy_err = 0;
1567dd1f4078SJeff Dike 		n->tx_flush = false;
15681680e906SMichael S. Tsirkin 	}
15691680e906SMichael S. Tsirkin 
1570c047e5f3SMichael S. Tsirkin 	mutex_unlock(&vq->mutex);
1571c38e39c3SMichael S. Tsirkin 
1572c047e5f3SMichael S. Tsirkin 	if (oldubufs) {
1573eaae8132SMichael S. Tsirkin 		vhost_net_ubuf_put_wait_and_free(oldubufs);
1574c047e5f3SMichael S. Tsirkin 		mutex_lock(&vq->mutex);
1575c047e5f3SMichael S. Tsirkin 		vhost_zerocopy_signal_used(n, vq);
1576bab632d6SMichael S. Tsirkin 		mutex_unlock(&vq->mutex);
15773a4d5c94SMichael S. Tsirkin 	}
1578b2ffa407SMike Christie 
157909aaacf0SAl Viro 	if (oldsock) {
15803a4d5c94SMichael S. Tsirkin 		vhost_dev_flush(&n->dev);
15811dace8c8SJeff Dike 		sockfd_put(oldsock);
15821680e906SMichael S. Tsirkin 	}
15831680e906SMichael S. Tsirkin 
15841680e906SMichael S. Tsirkin 	mutex_unlock(&n->dev.mutex);
1585692a998bSJason Wang 	return 0;
1586247643f8SEugenio Pérez 
1587692a998bSJason Wang err_used:
1588692a998bSJason Wang 	vhost_vq_set_backend(vq, oldsock);
1589c38e39c3SMichael S. Tsirkin 	vhost_net_enable_vq(n, vq);
1590bab632d6SMichael S. Tsirkin 	if (ubufs)
1591b8f1f658SJason Wang 		vhost_net_ubuf_put_wait_and_free(ubufs);
159209aaacf0SAl Viro err_ubufs:
15931dace8c8SJeff Dike 	if (sock)
15941dace8c8SJeff Dike 		sockfd_put(sock);
15953a4d5c94SMichael S. Tsirkin err_vq:
15963a4d5c94SMichael S. Tsirkin 	mutex_unlock(&vq->mutex);
15973a4d5c94SMichael S. Tsirkin err:
15983a4d5c94SMichael S. Tsirkin 	mutex_unlock(&n->dev.mutex);
15993a4d5c94SMichael S. Tsirkin 	return r;
16003a4d5c94SMichael S. Tsirkin }
16013a4d5c94SMichael S. Tsirkin 
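/* Illustrative userspace sketch (assumed call sequence, not part of this
 * file): vhost_net_set_backend() is reached through the VHOST_NET_SET_BACKEND
 * ioctl, issued once per virtqueue after the device is otherwise configured:
 *
 *   int vhost_fd = open("/dev/vhost-net", O_RDWR);
 *   ioctl(vhost_fd, VHOST_SET_OWNER, NULL);
 *   (VHOST_SET_FEATURES, VHOST_SET_MEM_TABLE and the VHOST_SET_VRING_* calls
 *    are omitted here)
 *   struct vhost_vring_file backend = { .index = 0, .fd = tap_fd };  (0 is RX)
 *   ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend);
 *   backend.index = 1;                                               (1 is TX)
 *   ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend);
 *
 * Passing fd == -1 detaches the backend, which is the "special case to
 * disable backend" handled in get_socket().
 */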
16023a4d5c94SMichael S. Tsirkin static long vhost_net_reset_owner(struct vhost_net *n)
16033a4d5c94SMichael S. Tsirkin {
16043a4d5c94SMichael S. Tsirkin 	struct socket *tx_sock = NULL;
16050bbe3066SJason Wang 	struct socket *rx_sock = NULL;
1606d47effe1SKrishna Kumar 	long err;
16073a4d5c94SMichael S. Tsirkin 	struct vhost_iotlb *umem;
16083a4d5c94SMichael S. Tsirkin 
16093a4d5c94SMichael S. Tsirkin 	mutex_lock(&n->dev.mutex);
16103a4d5c94SMichael S. Tsirkin 	err = vhost_dev_check_owner(&n->dev);
1611a9709d68SJason Wang 	if (err)
1612a9709d68SJason Wang 		goto done;
1613150b9e51SMichael S. Tsirkin 	umem = vhost_dev_reset_owner_prepare();
1614150b9e51SMichael S. Tsirkin 	if (!umem) {
1615150b9e51SMichael S. Tsirkin 		err = -ENOMEM;
16163a4d5c94SMichael S. Tsirkin 		goto done;
16173a4d5c94SMichael S. Tsirkin 	}
16184cd87951SJason Wang 	vhost_net_stop(n, &tx_sock, &rx_sock);
1619a9709d68SJason Wang 	vhost_net_flush(n);
162081f95a55SMichael S. Tsirkin 	vhost_dev_stop(&n->dev);
16213a4d5c94SMichael S. Tsirkin 	vhost_dev_reset_owner(&n->dev, umem);
16223a4d5c94SMichael S. Tsirkin 	vhost_net_vq_reset(n);
16233a4d5c94SMichael S. Tsirkin done:
162409aaacf0SAl Viro 	mutex_unlock(&n->dev.mutex);
16253a4d5c94SMichael S. Tsirkin 	if (tx_sock)
162609aaacf0SAl Viro 		sockfd_put(tx_sock);
16273a4d5c94SMichael S. Tsirkin 	if (rx_sock)
16283a4d5c94SMichael S. Tsirkin 		sockfd_put(rx_sock);
16293a4d5c94SMichael S. Tsirkin 	return err;
16303a4d5c94SMichael S. Tsirkin }
16313a4d5c94SMichael S. Tsirkin 
16328dd014adSDavid Stevens static int vhost_net_set_features(struct vhost_net *n, u64 features)
16333a4d5c94SMichael S. Tsirkin {
16348dd014adSDavid Stevens 	size_t vhost_hlen, sock_hlen, hdr_len;
1635e4fca7d6SMichael S. Tsirkin 	int i;
1636e4fca7d6SMichael S. Tsirkin 
16378dd014adSDavid Stevens 	hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
16388dd014adSDavid Stevens 			       (1ULL << VIRTIO_F_VERSION_1))) ?
16398dd014adSDavid Stevens 			sizeof(struct virtio_net_hdr_mrg_rxbuf) :
16408dd014adSDavid Stevens 			sizeof(struct virtio_net_hdr);
16418dd014adSDavid Stevens 	if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
16428dd014adSDavid Stevens 		/* vhost provides vnet_hdr */
16438dd014adSDavid Stevens 		vhost_hlen = hdr_len;
16448dd014adSDavid Stevens 		sock_hlen = 0;
16458dd014adSDavid Stevens 	} else {
16468dd014adSDavid Stevens 		/* socket provides vnet_hdr */
16478dd014adSDavid Stevens 		vhost_hlen = 0;
16483a4d5c94SMichael S. Tsirkin 		sock_hlen = hdr_len;
16493a4d5c94SMichael S. Tsirkin 	}
16506b1e6cc7SJason Wang 	mutex_lock(&n->dev.mutex);
16516b1e6cc7SJason Wang 	if ((features & (1 << VHOST_F_LOG_ALL)) &&
16526b1e6cc7SJason Wang 	    !vhost_log_access_ok(&n->dev))
1653321bd212SMichael S. Tsirkin 		goto out_unlock;
1654759aba1eSLiming Wu 
16556b1e6cc7SJason Wang 	if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
16563a4d5c94SMichael S. Tsirkin 		if (vhost_init_device_iotlb(&n->dev))
16576b1e6cc7SJason Wang 			goto out_unlock;
16583a4d5c94SMichael S. Tsirkin 	}
16593ab2e420SAsias He 
1660ea16c514SMichael S. Tsirkin 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
166181f95a55SMichael S. Tsirkin 		mutex_lock(&n->vqs[i].vq.mutex);
166281f95a55SMichael S. Tsirkin 		n->vqs[i].vq.acked_features = features;
16633ab2e420SAsias He 		n->vqs[i].vhost_hlen = vhost_hlen;
16643a4d5c94SMichael S. Tsirkin 		n->vqs[i].sock_hlen = sock_hlen;
16653a4d5c94SMichael S. Tsirkin 		mutex_unlock(&n->vqs[i].vq.mutex);
16663a4d5c94SMichael S. Tsirkin 	}
16676b1e6cc7SJason Wang 	mutex_unlock(&n->dev.mutex);
16686b1e6cc7SJason Wang 	return 0;
16696b1e6cc7SJason Wang 
16706b1e6cc7SJason Wang out_unlock:
16713a4d5c94SMichael S. Tsirkin 	mutex_unlock(&n->dev.mutex);
16723a4d5c94SMichael S. Tsirkin 	return -EFAULT;
1673b1ad8496SAsias He }
1674b1ad8496SAsias He 
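/* Summary of the header bookkeeping above (no new behaviour): the negotiated
 * feature bits select both the header size and who provides it:
 *
 *   VIRTIO_NET_F_MRG_RXBUF or VIRTIO_F_VERSION_1 set:
 *       hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf)  (12 bytes)
 *   neither set:
 *       hdr_len = sizeof(struct virtio_net_hdr)            (10 bytes)
 *
 *   VHOST_NET_F_VIRTIO_NET_HDR set:   vhost_hlen = hdr_len, sock_hlen = 0
 *                                     (vhost writes the header itself)
 *   VHOST_NET_F_VIRTIO_NET_HDR clear: vhost_hlen = 0, sock_hlen = hdr_len
 *                                     (the socket already carries a header)
 */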
1675b1ad8496SAsias He static long vhost_net_set_owner(struct vhost_net *n)
1676b1ad8496SAsias He {
1677b1ad8496SAsias He 	int r;
167805c05351SMichael S. Tsirkin 
167905c05351SMichael S. Tsirkin 	mutex_lock(&n->dev.mutex);
168005c05351SMichael S. Tsirkin 	if (vhost_dev_has_owner(&n->dev)) {
168105c05351SMichael S. Tsirkin 		r = -EBUSY;
1682b1ad8496SAsias He 		goto out;
1683b1ad8496SAsias He 	}
1684b1ad8496SAsias He 	r = vhost_net_set_ubuf_info(n);
1685b1ad8496SAsias He 	if (r)
1686b1ad8496SAsias He 		goto out;
1687b1ad8496SAsias He 	r = vhost_dev_set_owner(&n->dev);
1688b1ad8496SAsias He 	if (r)
1689b1ad8496SAsias He 		vhost_net_clear_ubuf_info(n);
1690b1ad8496SAsias He 	vhost_net_flush(n);
1691b1ad8496SAsias He out:
1692b1ad8496SAsias He 	mutex_unlock(&n->dev.mutex);
1693b1ad8496SAsias He 	return r;
16943a4d5c94SMichael S. Tsirkin }
16953a4d5c94SMichael S. Tsirkin 
16963a4d5c94SMichael S. Tsirkin static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
16973a4d5c94SMichael S. Tsirkin 			    unsigned long arg)
16983a4d5c94SMichael S. Tsirkin {
16993a4d5c94SMichael S. Tsirkin 	struct vhost_net *n = f->private_data;
17003a4d5c94SMichael S. Tsirkin 	void __user *argp = (void __user *)arg;
17013a4d5c94SMichael S. Tsirkin 	u64 __user *featurep = argp;
17023a4d5c94SMichael S. Tsirkin 	struct vhost_vring_file backend;
1703d47effe1SKrishna Kumar 	u64 features;
17043a4d5c94SMichael S. Tsirkin 	int r;
17053a4d5c94SMichael S. Tsirkin 
1706d3553a52STakuya Yoshikawa 	switch (ioctl) {
1707d3553a52STakuya Yoshikawa 	case VHOST_NET_SET_BACKEND:
17083a4d5c94SMichael S. Tsirkin 		if (copy_from_user(&backend, argp, sizeof backend))
17093a4d5c94SMichael S. Tsirkin 			return -EFAULT;
17100dd05a3bSStefan Hajnoczi 		return vhost_net_set_backend(n, backend.index, backend.fd);
1711d3553a52STakuya Yoshikawa 	case VHOST_GET_FEATURES:
1712d3553a52STakuya Yoshikawa 		features = VHOST_NET_FEATURES;
1713d3553a52STakuya Yoshikawa 		if (copy_to_user(featurep, &features, sizeof features))
17143a4d5c94SMichael S. Tsirkin 			return -EFAULT;
1715d3553a52STakuya Yoshikawa 		return 0;
1716d3553a52STakuya Yoshikawa 	case VHOST_SET_FEATURES:
17170dd05a3bSStefan Hajnoczi 		if (copy_from_user(&features, featurep, sizeof features))
17183a4d5c94SMichael S. Tsirkin 			return -EFAULT;
17193a4d5c94SMichael S. Tsirkin 		if (features & ~VHOST_NET_FEATURES)
1720429711aeSJason Wang 			return -EOPNOTSUPP;
1721429711aeSJason Wang 		return vhost_net_set_features(n, features);
1722429711aeSJason Wang 	case VHOST_GET_BACKEND_FEATURES:
1723429711aeSJason Wang 		features = VHOST_NET_BACKEND_FEATURES;
1724429711aeSJason Wang 		if (copy_to_user(featurep, &features, sizeof(features)))
1725429711aeSJason Wang 			return -EFAULT;
1726429711aeSJason Wang 		return 0;
1727429711aeSJason Wang 	case VHOST_SET_BACKEND_FEATURES:
1728429711aeSJason Wang 		if (copy_from_user(&features, featurep, sizeof(features)))
1729429711aeSJason Wang 			return -EFAULT;
1730460f7ce1SJason Wang 		if (features & ~VHOST_NET_BACKEND_FEATURES)
1731460f7ce1SJason Wang 			return -EOPNOTSUPP;
17323a4d5c94SMichael S. Tsirkin 		vhost_set_backend_features(&n->dev, features);
17333a4d5c94SMichael S. Tsirkin 		return 0;
1734b1ad8496SAsias He 	case VHOST_RESET_OWNER:
1735b1ad8496SAsias He 		return vhost_net_reset_owner(n);
17363a4d5c94SMichael S. Tsirkin 	case VHOST_SET_OWNER:
17373a4d5c94SMichael S. Tsirkin 		return vhost_net_set_owner(n);
1738935cdee7SMichael S. Tsirkin 	default:
1739935cdee7SMichael S. Tsirkin 		mutex_lock(&n->dev.mutex);
1740935cdee7SMichael S. Tsirkin 		r = vhost_dev_ioctl(&n->dev, ioctl, argp);
1741935cdee7SMichael S. Tsirkin 		if (r == -ENOIOCTLCMD)
17423a4d5c94SMichael S. Tsirkin 			r = vhost_vring_ioctl(&n->dev, ioctl, argp);
17433a4d5c94SMichael S. Tsirkin 		else
17443a4d5c94SMichael S. Tsirkin 			vhost_net_flush(n);
17453a4d5c94SMichael S. Tsirkin 		mutex_unlock(&n->dev.mutex);
17463a4d5c94SMichael S. Tsirkin 		return r;
17473a4d5c94SMichael S. Tsirkin 	}
17486b1e6cc7SJason Wang }
17496b1e6cc7SJason Wang 
17506b1e6cc7SJason Wang static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
17516b1e6cc7SJason Wang {
17526b1e6cc7SJason Wang 	struct file *file = iocb->ki_filp;
17536b1e6cc7SJason Wang 	struct vhost_net *n = file->private_data;
17546b1e6cc7SJason Wang 	struct vhost_dev *dev = &n->dev;
17556b1e6cc7SJason Wang 	int noblock = file->f_flags & O_NONBLOCK;
17566b1e6cc7SJason Wang 
17576b1e6cc7SJason Wang 	return vhost_chr_read_iter(dev, to, noblock);
17586b1e6cc7SJason Wang }
17596b1e6cc7SJason Wang 
17606b1e6cc7SJason Wang static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
17616b1e6cc7SJason Wang 					struct iov_iter *from)
17626b1e6cc7SJason Wang {
17636b1e6cc7SJason Wang 	struct file *file = iocb->ki_filp;
17646b1e6cc7SJason Wang 	struct vhost_net *n = file->private_data;
17656b1e6cc7SJason Wang 	struct vhost_dev *dev = &n->dev;
17666b1e6cc7SJason Wang 
17676b1e6cc7SJason Wang 	return vhost_chr_write_iter(dev, from);
1768afc9a42bSAl Viro }
17696b1e6cc7SJason Wang 
17706b1e6cc7SJason Wang static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait)
17716b1e6cc7SJason Wang {
17726b1e6cc7SJason Wang 	struct vhost_net *n = file->private_data;
17736b1e6cc7SJason Wang 	struct vhost_dev *dev = &n->dev;
17746b1e6cc7SJason Wang 
17756b1e6cc7SJason Wang 	return vhost_chr_poll(file, dev, wait);
1776373a83a6STobias Klauser }
17773a4d5c94SMichael S. Tsirkin 
17783a4d5c94SMichael S. Tsirkin static const struct file_operations vhost_net_fops = {
17796b1e6cc7SJason Wang 	.owner          = THIS_MODULE,
17806b1e6cc7SJason Wang 	.release        = vhost_net_release,
17816b1e6cc7SJason Wang 	.read_iter      = vhost_net_chr_read_iter,
17823a4d5c94SMichael S. Tsirkin 	.write_iter     = vhost_net_chr_write_iter,
1783407e9ef7SArnd Bergmann 	.poll           = vhost_net_chr_poll,
17843a4d5c94SMichael S. Tsirkin 	.unlocked_ioctl = vhost_net_ioctl,
17856038f373SArnd Bergmann 	.compat_ioctl   = compat_ptr_ioctl,
17863a4d5c94SMichael S. Tsirkin 	.open           = vhost_net_open,
17873a4d5c94SMichael S. Tsirkin 	.llseek		= noop_llseek,
17883a4d5c94SMichael S. Tsirkin };
17897c7c7f01Sstephen hemminger 
17907c7c7f01Sstephen hemminger static struct miscdevice vhost_net_misc = {
17917c7c7f01Sstephen hemminger 	.minor = VHOST_NET_MINOR,
17923a4d5c94SMichael S. Tsirkin 	.name = "vhost-net",
17933a4d5c94SMichael S. Tsirkin 	.fops = &vhost_net_fops,
1794078adb3bSXiu Jianfeng };
17953a4d5c94SMichael S. Tsirkin 
1796bab632d6SMichael S. Tsirkin static int __init vhost_net_init(void)
1797fe729a57SAsias He {
1798c23f3445STejun Heo 	if (experimental_zcopytx)
17993a4d5c94SMichael S. Tsirkin 		vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
18003a4d5c94SMichael S. Tsirkin 	return misc_register(&vhost_net_misc);
18013a4d5c94SMichael S. Tsirkin }
1802078adb3bSXiu Jianfeng module_init(vhost_net_init);
18033a4d5c94SMichael S. Tsirkin 
18043a4d5c94SMichael S. Tsirkin static void __exit vhost_net_exit(void)
18053a4d5c94SMichael S. Tsirkin {
18063a4d5c94SMichael S. Tsirkin 	misc_deregister(&vhost_net_misc);
18073a4d5c94SMichael S. Tsirkin }
18083a4d5c94SMichael S. Tsirkin module_exit(vhost_net_exit);
18093a4d5c94SMichael S. Tsirkin 
18103a4d5c94SMichael S. Tsirkin MODULE_VERSION("0.0.1");
18113a4d5c94SMichael S. Tsirkin MODULE_LICENSE("GPL v2");
18127c7c7f01Sstephen hemminger MODULE_AUTHOR("Michael S. Tsirkin");
18137c7c7f01Sstephen hemminger MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
1814 MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
1815 MODULE_ALIAS("devname:vhost-net");
1816