// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2009 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * virtio-net server in host kernel.
 */

#include <linux/compat.h>
#include <linux/eventfd.h>
#include <linux/vhost.h>
#include <linux/virtio_net.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/sched/signal.h>
#include <linux/vmalloc.h>

#include <linux/net.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/if_tun.h>
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/skbuff.h>

#include <net/sock.h>
#include <net/xdp.h>

#include "vhost.h"

static int experimental_zcopytx = 0;
module_param(experimental_zcopytx, int, 0444);
MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
		 " 1 -Enable; 0 - Disable");

/* Max number of bytes transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others. */
#define VHOST_NET_WEIGHT 0x80000

/* Max number of packets transferred before requeueing the job.
 * Using this limit prevents one virtqueue from starving others with small
 * pkts.
 */
#define VHOST_NET_PKT_WEIGHT 256

/* MAX number of TX used buffers for outstanding zerocopy */
#define VHOST_MAX_PEND 128
#define VHOST_GOODCOPY_LEN 256

/*
 * For transmit, used buffer len is unused; we override it to track buffer
 * status internally; used for zerocopy tx only.
 */
/* Lower device DMA failed */
#define VHOST_DMA_FAILED_LEN	((__force __virtio32)3)
/* Lower device DMA done */
#define VHOST_DMA_DONE_LEN	((__force __virtio32)2)
/* Lower device DMA in progress */
#define VHOST_DMA_IN_PROGRESS	((__force __virtio32)1)
/* Buffer unused */
#define VHOST_DMA_CLEAR_LEN	((__force __virtio32)0)

#define VHOST_DMA_IS_DONE(len) ((__force u32)(len) >= (__force u32)VHOST_DMA_DONE_LEN)

enum {
	VHOST_NET_FEATURES = VHOST_FEATURES |
			     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
			     (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
			     (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
			     (1ULL << VIRTIO_F_RING_RESET)
};

enum {
	VHOST_NET_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2)
};

enum {
	VHOST_NET_VQ_RX = 0,
	VHOST_NET_VQ_TX = 1,
	VHOST_NET_VQ_MAX = 2,
};

struct vhost_net_ubuf_ref {
	/* refcount follows semantics similar to kref:
	 *  0: object is released
	 *  1: no outstanding ubufs
	 * >1: outstanding ubufs
	 */
	atomic_t refcount;
	wait_queue_head_t wait;
	struct vhost_virtqueue *vq;
};

#define VHOST_NET_BATCH 64
struct vhost_net_buf {
	void **queue;
	int tail;
	int head;
};

struct vhost_net_virtqueue {
	struct vhost_virtqueue vq;
	size_t vhost_hlen;
	size_t sock_hlen;
	/* vhost zerocopy support fields below: */
	/* last used idx for outstanding DMA zerocopy buffers */
	int upend_idx;
	/* For TX, first used idx for DMA done zerocopy buffers
	 * For RX, number of batched heads
	 */
	int done_idx;
	/* Number of XDP frames batched */
	int batched_xdp;
	/* an array of userspace buffers info */
	struct ubuf_info_msgzc *ubuf_info;
	/* Reference counting for outstanding ubufs.
	 * Protected by vq mutex. Writers must also take device mutex. */
	struct vhost_net_ubuf_ref *ubufs;
	struct ptr_ring *rx_ring;
	struct vhost_net_buf rxq;
	/* Batched XDP buffs */
	struct xdp_buff *xdp;
};

struct vhost_net {
	struct vhost_dev dev;
	struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
	struct vhost_poll poll[VHOST_NET_VQ_MAX];
	/* Number of TX recently submitted.
	 * Protected by tx vq lock. */
	unsigned tx_packets;
	/* Number of times zerocopy TX recently failed.
	 * Protected by tx vq lock. */
	unsigned tx_zcopy_err;
	/* Flush in progress. Protected by tx vq lock. */
	bool tx_flush;
	/* Private page frag */
	struct page_frag page_frag;
	/* Refcount bias of page frag */
	int refcnt_bias;
};

static unsigned vhost_net_zcopy_mask __read_mostly;

static void *vhost_net_buf_get_ptr(struct vhost_net_buf *rxq)
{
	if (rxq->tail != rxq->head)
		return rxq->queue[rxq->head];
	else
		return NULL;
}

static int vhost_net_buf_get_size(struct vhost_net_buf *rxq)
{
	return rxq->tail - rxq->head;
}

static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
{
	return rxq->tail == rxq->head;
}

static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
{
	void *ret = vhost_net_buf_get_ptr(rxq);
	++rxq->head;
	return ret;
}

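/* Refill the local RX batch by consuming up to VHOST_NET_BATCH pointers
 * from the backend ptr_ring.  Returns the number of entries obtained.
 */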
static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	rxq->head = 0;
	rxq->tail = ptr_ring_consume_batched(nvq->rx_ring, rxq->queue,
					      VHOST_NET_BATCH);
	return rxq->tail;
}

static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (nvq->rx_ring && !vhost_net_buf_is_empty(rxq)) {
		ptr_ring_unconsume(nvq->rx_ring, rxq->queue + rxq->head,
				   vhost_net_buf_get_size(rxq),
				   tun_ptr_free);
		rxq->head = rxq->tail = 0;
	}
}

static int vhost_net_buf_peek_len(void *ptr)
{
	if (tun_is_xdp_frame(ptr)) {
		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

		return xdpf->len;
	}

	return __skb_array_len_with_tag(ptr);
}

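/* Return the length of the next pending RX packet without consuming it,
 * refilling the local batch from the ptr_ring when it is empty.
 * Returns 0 if nothing is queued.
 */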
static int vhost_net_buf_peek(struct vhost_net_virtqueue *nvq)
{
	struct vhost_net_buf *rxq = &nvq->rxq;

	if (!vhost_net_buf_is_empty(rxq))
		goto out;

	if (!vhost_net_buf_produce(nvq))
		return 0;

out:
	return vhost_net_buf_peek_len(vhost_net_buf_get_ptr(rxq));
}

static void vhost_net_buf_init(struct vhost_net_buf *rxq)
{
	rxq->head = rxq->tail = 0;
}

static void vhost_net_enable_zcopy(int vq)
{
	vhost_net_zcopy_mask |= 0x1 << vq;
}

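/* Allocate the refcounted structure used to track outstanding zerocopy
 * TX buffers for a virtqueue.  Returns NULL when zerocopy is not in use.
 */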
static struct vhost_net_ubuf_ref *
vhost_net_ubuf_alloc(struct vhost_virtqueue *vq, bool zcopy)
{
	struct vhost_net_ubuf_ref *ubufs;
	/* No zero copy backend? Nothing to count. */
	if (!zcopy)
		return NULL;
	ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
	if (!ubufs)
		return ERR_PTR(-ENOMEM);
	atomic_set(&ubufs->refcount, 1);
	init_waitqueue_head(&ubufs->wait);
	ubufs->vq = vq;
	return ubufs;
}

static int vhost_net_ubuf_put(struct vhost_net_ubuf_ref *ubufs)
{
	int r = atomic_sub_return(1, &ubufs->refcount);
	if (unlikely(!r))
		wake_up(&ubufs->wait);
	return r;
}

static void vhost_net_ubuf_put_and_wait(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put(ubufs);
	wait_event(ubufs->wait, !atomic_read(&ubufs->refcount));
}

static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
{
	vhost_net_ubuf_put_and_wait(ubufs);
	kfree(ubufs);
}

static void vhost_net_clear_ubuf_info(struct vhost_net *n)
{
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		kfree(n->vqs[i].ubuf_info);
		n->vqs[i].ubuf_info = NULL;
	}
}

static int vhost_net_set_ubuf_info(struct vhost_net *n)
{
	bool zcopy;
	int i;

	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
		zcopy = vhost_net_zcopy_mask & (0x1 << i);
		if (!zcopy)
			continue;
		n->vqs[i].ubuf_info =
			kmalloc_array(UIO_MAXIOV,
				      sizeof(*n->vqs[i].ubuf_info),
				      GFP_KERNEL);
		if (!n->vqs[i].ubuf_info)
			goto err;
	}
	return 0;

err:
	vhost_net_clear_ubuf_info(n);
	return -ENOMEM;
}

static void vhost_net_vq_reset(struct vhost_net *n)
{
	int i;

	vhost_net_clear_ubuf_info(n);

	for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
		n->vqs[i].done_idx = 0;
		n->vqs[i].upend_idx = 0;
		n->vqs[i].ubufs = NULL;
		n->vqs[i].vhost_hlen = 0;
		n->vqs[i].sock_hlen = 0;
		vhost_net_buf_init(&n->vqs[i].rxq);
	}

}

static void vhost_net_tx_packet(struct vhost_net *net)
{
	++net->tx_packets;
	if (net->tx_packets < 1024)
		return;
	net->tx_packets = 0;
	net->tx_zcopy_err = 0;
}

static void vhost_net_tx_err(struct vhost_net *net)
{
	++net->tx_zcopy_err;
}

static bool vhost_net_tx_select_zcopy(struct vhost_net *net)
{
	/* TX flush waits for outstanding DMAs to be done.
	 * Don't start new DMAs.
	 */
	return !net->tx_flush &&
		net->tx_packets / 64 >= net->tx_zcopy_err;
}

static bool vhost_sock_zcopy(struct socket *sock)
{
	return unlikely(experimental_zcopytx) &&
		sock_flag(sock->sk, SOCK_ZEROCOPY);
}

static bool vhost_sock_xdp(struct socket *sock)
{
	return sock_flag(sock->sk, SOCK_XDP);
}

/* The lower device driver may complete DMAs out of order.  upend_idx
 * tracks the end of the used idx, done_idx tracks the head.  Once the
 * lower device has completed DMAs contiguously, we signal the used idx
 * to the guest.
 */
static void vhost_zerocopy_signal_used(struct vhost_net *net,
				       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	int i, add;
	int j = 0;

	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
		if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
			vhost_net_tx_err(net);
		if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
			vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
			++j;
		} else
			break;
	}
	while (j) {
		add = min(UIO_MAXIOV - nvq->done_idx, j);
		vhost_add_used_and_signal_n(vq->dev, vq,
					    &vq->heads[nvq->done_idx], add);
		nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV;
		j -= add;
	}
}

static void vhost_zerocopy_callback(struct sk_buff *skb,
				    struct ubuf_info *ubuf_base, bool success)
{
	struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base);
	struct vhost_net_ubuf_ref *ubufs = ubuf->ctx;
	struct vhost_virtqueue *vq = ubufs->vq;
	int cnt;

	rcu_read_lock_bh();

	/* Set len to record whether this descriptor's DMA completed or failed */
	vq->heads[ubuf->desc].len = success ?
		VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
	cnt = vhost_net_ubuf_put(ubufs);

	/*
	 * Trigger polling thread if guest stopped submitting new buffers:
	 * in this case, the refcount after decrement will eventually reach 1.
	 * We also trigger polling periodically after each 16 packets
	 * (the value 16 here is more or less arbitrary, it's tuned to trigger
	 * less than 10% of times).
	 */
	if (cnt <= 1 || !(cnt % 16))
		vhost_poll_queue(&vq->poll);

	rcu_read_unlock_bh();
}

static inline unsigned long busy_clock(void)
{
	return local_clock() >> 10;
}

static bool vhost_can_busy_poll(unsigned long endtime)
{
	return likely(!need_resched() && !time_after(busy_clock(), endtime) &&
		      !signal_pending(current));
}

static void vhost_net_disable_vq(struct vhost_net *n,
				 struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	if (!vhost_vq_get_backend(vq))
		return;
	vhost_poll_stop(poll);
}

static int vhost_net_enable_vq(struct vhost_net *n,
			       struct vhost_virtqueue *vq)
{
	struct vhost_net_virtqueue *nvq =
		container_of(vq, struct vhost_net_virtqueue, vq);
	struct vhost_poll *poll = n->poll + (nvq - n->vqs);
	struct socket *sock;

	sock = vhost_vq_get_backend(vq);
	if (!sock)
		return 0;

	return vhost_poll_start(poll, sock->file);
}

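/* Flush the heads batched in vq->heads[0..done_idx) to the used ring,
 * signal the guest and reset the batch counter.
 */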
static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_dev *dev = vq->dev;

	if (!nvq->done_idx)
		return;

	vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx);
	nvq->done_idx = 0;
}

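/* Submit the batched XDP frames to the backend with a single sendmsg()
 * call, then flush the corresponding used heads to the guest.  On a send
 * error the pages backing the XDP frames are released and the whole
 * batch is dropped.
 */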
static void vhost_tx_batch(struct vhost_net *net,
			   struct vhost_net_virtqueue *nvq,
			   struct socket *sock,
			   struct msghdr *msghdr)
{
	struct tun_msg_ctl ctl = {
		.type = TUN_MSG_PTR,
		.num = nvq->batched_xdp,
		.ptr = nvq->xdp,
	};
	int i, err;

	if (nvq->batched_xdp == 0)
		goto signal_used;

	msghdr->msg_control = &ctl;
	msghdr->msg_controllen = sizeof(ctl);
	err = sock->ops->sendmsg(sock, msghdr, 0);
	if (unlikely(err < 0)) {
		vq_err(&nvq->vq, "Fail to batch sending packets\n");

		/* free pages owned by XDP; since this is an unlikely error path,
		 * keep it simple and avoid more complex bulk update for the
		 * used pages
		 */
		for (i = 0; i < nvq->batched_xdp; ++i)
			put_page(virt_to_head_page(nvq->xdp[i].data));
		nvq->batched_xdp = 0;
		nvq->done_idx = 0;
		return;
	}

signal_used:
	vhost_net_signal_used(nvq);
	nvq->batched_xdp = 0;
}

static int sock_has_rx_data(struct socket *sock)
{
	if (unlikely(!sock))
		return 0;

	if (sock->ops->peek_len)
		return sock->ops->peek_len(sock);

	return skb_queue_empty(&sock->sk->sk_receive_queue);
}

static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
					  struct vhost_virtqueue *vq)
{
	if (!vhost_vq_avail_empty(&net->dev, vq)) {
		vhost_poll_queue(&vq->poll);
	} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
		vhost_disable_notify(&net->dev, vq);
		vhost_poll_queue(&vq->poll);
	}
}

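/* Busy poll the paired virtqueue (TX when called from the RX path,
 * RX when called from the TX path) for up to its busyloop timeout,
 * bailing out early when other vhost work is pending or when new
 * buffers/data show up.
 */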
static void vhost_net_busy_poll(struct vhost_net *net,
				struct vhost_virtqueue *rvq,
				struct vhost_virtqueue *tvq,
				bool *busyloop_intr,
				bool poll_rx)
{
	unsigned long busyloop_timeout;
	unsigned long endtime;
	struct socket *sock;
	struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;

	/* Try to hold the vq mutex of the paired virtqueue. We can't
	 * use mutex_lock() here since we cannot guarantee a
	 * consistent lock ordering.
	 */
	if (!mutex_trylock(&vq->mutex))
		return;

	vhost_disable_notify(&net->dev, vq);
	sock = vhost_vq_get_backend(rvq);

	busyloop_timeout = poll_rx ? rvq->busyloop_timeout :
				     tvq->busyloop_timeout;

	preempt_disable();
	endtime = busy_clock() + busyloop_timeout;

	while (vhost_can_busy_poll(endtime)) {
		if (vhost_vq_has_work(vq)) {
			*busyloop_intr = true;
			break;
		}

		if ((sock_has_rx_data(sock) &&
		     !vhost_vq_avail_empty(&net->dev, rvq)) ||
		    !vhost_vq_avail_empty(&net->dev, tvq))
			break;

		cpu_relax();
	}

	preempt_enable();

	if (poll_rx || sock_has_rx_data(sock))
		vhost_net_busy_poll_try_queue(net, vq);
	else if (!poll_rx) /* On tx here, sock has no rx data. */
		vhost_enable_notify(&net->dev, rvq);

	mutex_unlock(&vq->mutex);
}

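/* Fetch the next available TX descriptor.  When the ring looks empty and
 * a busyloop timeout is configured, flush any batched packets, busy poll
 * the paired RX queue and retry once before giving up.
 */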
vhost_net_tx_get_vq_desc(struct vhost_net * net,struct vhost_net_virtqueue * tnvq,unsigned int * out_num,unsigned int * in_num,struct msghdr * msghdr,bool * busyloop_intr)57203088137SJason Wang static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
573441abde4STonghao Zhang struct vhost_net_virtqueue *tnvq,
574027b1760SToshiaki Makita unsigned int *out_num, unsigned int *in_num,
5750a0be13bSJason Wang struct msghdr *msghdr, bool *busyloop_intr)
57603088137SJason Wang {
577441abde4STonghao Zhang struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
578441abde4STonghao Zhang struct vhost_virtqueue *rvq = &rnvq->vq;
579441abde4STonghao Zhang struct vhost_virtqueue *tvq = &tnvq->vq;
580441abde4STonghao Zhang
581441abde4STonghao Zhang int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
58203088137SJason Wang out_num, in_num, NULL, NULL);
58303088137SJason Wang
584441abde4STonghao Zhang if (r == tvq->num && tvq->busyloop_timeout) {
5850a0be13bSJason Wang /* Flush batched packets first */
586247643f8SEugenio Pérez if (!vhost_sock_zcopy(vhost_vq_get_backend(tvq)))
587247643f8SEugenio Pérez vhost_tx_batch(net, tnvq,
588247643f8SEugenio Pérez vhost_vq_get_backend(tvq),
589247643f8SEugenio Pérez msghdr);
590441abde4STonghao Zhang
591441abde4STonghao Zhang vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
592441abde4STonghao Zhang
593441abde4STonghao Zhang r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
59403088137SJason Wang out_num, in_num, NULL, NULL);
59503088137SJason Wang }
59603088137SJason Wang
59703088137SJason Wang return r;
59803088137SJason Wang }
59903088137SJason Wang
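/* Returns true when the number of outstanding zerocopy TX buffers
 * exceeds min(VHOST_MAX_PEND, vq->num / 4), so that slow completions in
 * the lower device cannot pin the whole ring.
 */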
static bool vhost_exceeds_maxpend(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;

	return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV >
	       min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2);
}

static size_t init_iov_iter(struct vhost_virtqueue *vq, struct iov_iter *iter,
			    size_t hdr_size, int out)
{
	/* Skip header. TODO: support TSO. */
	size_t len = iov_length(vq->iov, out);

	iov_iter_init(iter, ITER_SOURCE, vq->iov, out, len);
	iov_iter_advance(iter, hdr_size);

	return iov_iter_count(iter);
}

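/* Pull the next TX descriptor and point msg->msg_iter at the packet
 * payload past the vhost header.  Returns the descriptor head, vq->num
 * when the ring is empty, or a negative error for a malformed
 * descriptor.
 */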
static int get_tx_bufs(struct vhost_net *net,
		       struct vhost_net_virtqueue *nvq,
		       struct msghdr *msg,
		       unsigned int *out, unsigned int *in,
		       size_t *len, bool *busyloop_intr)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	int ret;

	ret = vhost_net_tx_get_vq_desc(net, nvq, out, in, msg, busyloop_intr);

	if (ret < 0 || ret == vq->num)
		return ret;

	if (*in) {
		vq_err(vq, "Unexpected descriptor format for TX: out %d, in %d\n",
		       *out, *in);
		return -EFAULT;
	}

	/* Sanity check */
	*len = init_iov_iter(vq, &msg->msg_iter, nvq->vhost_hlen, *out);
	if (*len == 0) {
		vq_err(vq, "Unexpected header len for TX: %zd expected %zd\n",
		       *len, nvq->vhost_hlen);
		return -EFAULT;
	}

	return ret;
}

static bool tx_can_batch(struct vhost_virtqueue *vq, size_t total_len)
{
	return total_len < VHOST_NET_WEIGHT &&
	       !vhost_vq_avail_empty(vq->dev, vq);
}

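/* Ensure the private page frag has at least @sz bytes available,
 * allocating a fresh (possibly high-order) page when it does not.  The
 * page reference count is biased up front so that individual frags can
 * be handed out without touching the refcount every time.
 */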
static bool vhost_net_page_frag_refill(struct vhost_net *net, unsigned int sz,
				       struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		__page_frag_cache_drain(pfrag->page, net->refcnt_bias);
	}

	pfrag->offset = 0;
	net->refcnt_bias = 0;
	if (SKB_FRAG_PAGE_ORDER) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			goto done;
		}
	}
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		goto done;
	}
	return false;

done:
	net->refcnt_bias = USHRT_MAX;
	page_ref_add(pfrag->page, USHRT_MAX - 1);
	return true;
}

#define VHOST_NET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

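/* Copy one packet from the guest into the page frag and set it up as an
 * XDP buff queued in nvq->xdp[].  Returns 0 on success, -ENOSPC when the
 * packet does not fit in a page (the caller then falls back to the
 * regular copy path), or another negative error on malformed input.
 */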
static int vhost_net_build_xdp(struct vhost_net_virtqueue *nvq,
			       struct iov_iter *from)
{
	struct vhost_virtqueue *vq = &nvq->vq;
	struct vhost_net *net = container_of(vq->dev, struct vhost_net,
					     dev);
	struct socket *sock = vhost_vq_get_backend(vq);
	struct page_frag *alloc_frag = &net->page_frag;
	struct virtio_net_hdr *gso;
	struct xdp_buff *xdp = &nvq->xdp[nvq->batched_xdp];
	struct tun_xdp_hdr *hdr;
	size_t len = iov_iter_count(from);
	int headroom = vhost_sock_xdp(sock) ? XDP_PACKET_HEADROOM : 0;
	int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
	int pad = SKB_DATA_ALIGN(VHOST_NET_RX_PAD + headroom + nvq->sock_hlen);
	int sock_hlen = nvq->sock_hlen;
	void *buf;
	int copied;

	if (unlikely(len < nvq->sock_hlen))
		return -EFAULT;

	if (SKB_DATA_ALIGN(len + pad) +
	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
		return -ENOSPC;

	buflen += SKB_DATA_ALIGN(len + pad);
	alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
	if (unlikely(!vhost_net_page_frag_refill(net, buflen,
						 alloc_frag, GFP_KERNEL)))
		return -ENOMEM;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset +
				     offsetof(struct tun_xdp_hdr, gso),
				     sock_hlen, from);
	if (copied != sock_hlen)
		return -EFAULT;

	hdr = buf;
	gso = &hdr->gso;

	if ((gso->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
	    vhost16_to_cpu(vq, gso->csum_start) +
	    vhost16_to_cpu(vq, gso->csum_offset) + 2 >
	    vhost16_to_cpu(vq, gso->hdr_len)) {
		gso->hdr_len = cpu_to_vhost16(vq,
			       vhost16_to_cpu(vq, gso->csum_start) +
			       vhost16_to_cpu(vq, gso->csum_offset) + 2);

		if (vhost16_to_cpu(vq, gso->hdr_len) > len)
			return -EINVAL;
	}

	len -= sock_hlen;
	copied = copy_page_from_iter(alloc_frag->page,
				     alloc_frag->offset + pad,
				     len, from);
	if (copied != len)
		return -EFAULT;

	xdp_init_buff(xdp, buflen, NULL);
	xdp_prepare_buff(xdp, buf, pad, len, true);
	hdr->buflen = buflen;

	--net->refcnt_bias;
	alloc_frag->offset += buflen;

	++nvq->batched_xdp;

	return 0;
}

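/* TX path for non-zerocopy backends.  Packets are either built into
 * batched XDP buffs (when sndbuf is unlimited) and submitted via
 * vhost_tx_batch(), or sent one at a time with sendmsg().
 */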
static void handle_tx_copy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	size_t len, total_len = 0;
	int err;
	int sent_pkts = 0;
	bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX);

	do {
		bool busyloop_intr = false;

		if (nvq->done_idx == VHOST_NET_BATCH)
			vhost_tx_batch(net, nvq, sock, &msg);

		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev,
								vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		total_len += len;

		/* For simplicity, TX batching is only enabled if
		 * sndbuf is unlimited.
		 */
		if (sock_can_batch) {
			err = vhost_net_build_xdp(nvq, &msg.msg_iter);
			if (!err) {
				goto done;
			} else if (unlikely(err != -ENOSPC)) {
				vhost_tx_batch(net, nvq, sock, &msg);
				vhost_discard_vq_desc(vq, 1);
				vhost_net_enable_vq(net, vq);
				break;
			}

			/* We can't build XDP buff, go for single
			 * packet path but let's flush batched
			 * packets.
			 */
			vhost_tx_batch(net, nvq, sock, &msg);
			msg.msg_control = NULL;
		} else {
			if (tx_can_batch(vq, total_len))
				msg.msg_flags |= MSG_MORE;
			else
				msg.msg_flags &= ~MSG_MORE;
		}

		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			if (err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS) {
				vhost_discard_vq_desc(vq, 1);
				vhost_net_enable_vq(net, vq);
				break;
			}
			pr_debug("Fail to send packet: err %d", err);
		} else if (unlikely(err != len))
			pr_debug("Truncated TX packet: len %d != %zd\n",
				 err, len);
done:
		vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head);
		vq->heads[nvq->done_idx].len = 0;
		++nvq->done_idx;
	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));

	vhost_tx_batch(net, nvq, sock, &msg);
}

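/* TX path for zerocopy-capable backends.  Sufficiently large packets are
 * handed to the lower device with a ubuf_info completion callback and
 * reaped through vhost_zerocopy_signal_used(); small packets and
 * throttled cases fall back to copying.
 */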
static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	unsigned out, in;
	int head;
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_control = NULL,
		.msg_controllen = 0,
		.msg_flags = MSG_DONTWAIT,
	};
	struct tun_msg_ctl ctl;
	size_t len, total_len = 0;
	int err;
	struct vhost_net_ubuf_ref *ubufs;
	struct ubuf_info_msgzc *ubuf;
	bool zcopy_used;
	int sent_pkts = 0;

	do {
		bool busyloop_intr;

		/* Release DMAs done buffers first */
		vhost_zerocopy_signal_used(net, vq);

		busyloop_intr = false;
		head = get_tx_bufs(net, nvq, &msg, &out, &in, &len,
				   &busyloop_intr);
		/* On error, stop handling until the next kick. */
		if (unlikely(head < 0))
			break;
		/* Nothing new? Wait for eventfd to tell us they refilled. */
		if (head == vq->num) {
			if (unlikely(busyloop_intr)) {
				vhost_poll_queue(&vq->poll);
			} else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
				vhost_disable_notify(&net->dev, vq);
				continue;
			}
			break;
		}

		zcopy_used = len >= VHOST_GOODCOPY_LEN
			     && !vhost_exceeds_maxpend(net)
			     && vhost_net_tx_select_zcopy(net);

		/* use msg_control to pass vhost zerocopy ubuf info to skb */
		if (zcopy_used) {
			ubuf = nvq->ubuf_info + nvq->upend_idx;
			vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
			vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
			ubuf->ctx = nvq->ubufs;
			ubuf->desc = nvq->upend_idx;
			ubuf->ubuf.callback = vhost_zerocopy_callback;
			ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG;
			refcount_set(&ubuf->ubuf.refcnt, 1);
			msg.msg_control = &ctl;
			ctl.type = TUN_MSG_UBUF;
			ctl.ptr = &ubuf->ubuf;
			msg.msg_controllen = sizeof(ctl);
			ubufs = nvq->ubufs;
			atomic_inc(&ubufs->refcount);
			nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
		} else {
			msg.msg_control = NULL;
			ubufs = NULL;
		}
		total_len += len;
		if (tx_can_batch(vq, total_len) &&
		    likely(!vhost_exceeds_maxpend(net))) {
			msg.msg_flags |= MSG_MORE;
		} else {
			msg.msg_flags &= ~MSG_MORE;
		}

		err = sock->ops->sendmsg(sock, &msg, len);
		if (unlikely(err < 0)) {
			bool retry = err == -EAGAIN || err == -ENOMEM || err == -ENOBUFS;

			if (zcopy_used) {
				if (vq->heads[ubuf->desc].len == VHOST_DMA_IN_PROGRESS)
					vhost_net_ubuf_put(ubufs);
				if (retry)
					nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
						% UIO_MAXIOV;
				else
					vq->heads[ubuf->desc].len = VHOST_DMA_DONE_LEN;
			}
			if (retry) {
				vhost_discard_vq_desc(vq, 1);
				vhost_net_enable_vq(net, vq);
				break;
			}
			pr_debug("Fail to send packet: err %d", err);
		} else if (unlikely(err != len))
			pr_debug("Truncated TX packet: "
				 " len %d != %zd\n", err, len);
		if (!zcopy_used)
			vhost_add_used_and_signal(&net->dev, vq, head, 0);
		else
			vhost_zerocopy_signal_used(net, vq);
		vhost_net_tx_packet(net);
	} while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len)));
}

/* Expects to be always run from workqueue - which acts as
 * read-side critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
	struct vhost_virtqueue *vq = &nvq->vq;
	struct socket *sock;

	mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
	sock = vhost_vq_get_backend(vq);
	if (!sock)
		goto out;

	if (!vq_meta_prefetch(vq))
		goto out;

	vhost_disable_notify(&net->dev, vq);
	vhost_net_disable_vq(net, vq);

	if (vhost_sock_zcopy(sock))
		handle_tx_zerocopy(net, sock);
	else
		handle_tx_copy(net, sock);

out:
	mutex_unlock(&vq->mutex);
}

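/* Return the length of the first packet waiting on the RX backend:
 * taken from the ptr_ring batch when one is attached, otherwise by
 * peeking the socket receive queue (accounting for a VLAN tag).
 */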
static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
{
	struct sk_buff *head;
	int len = 0;
	unsigned long flags;

	if (rvq->rx_ring)
		return vhost_net_buf_peek(rvq);

	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
	head = skb_peek(&sk->sk_receive_queue);
	if (likely(head)) {
		len = head->len;
		if (skb_vlan_tag_present(head))
			len += VLAN_HLEN;
	}

	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
	return len;
}

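/* Like peek_head_len(), but when nothing is pending and busy polling is
 * enabled, flush batched RX heads, busy poll the TX virtqueue and the RX
 * socket, and then peek again.
 */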
vhost_net_rx_peek_head_len(struct vhost_net * net,struct sock * sk,bool * busyloop_intr)1015be294a51SToshiaki Makita static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
1016be294a51SToshiaki Makita bool *busyloop_intr)
101703088137SJason Wang {
101828b9b33bSToshiaki Makita struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
101928b9b33bSToshiaki Makita struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
10206369fec5SToshiaki Makita struct vhost_virtqueue *rvq = &rnvq->vq;
102128b9b33bSToshiaki Makita struct vhost_virtqueue *tvq = &tnvq->vq;
102228b9b33bSToshiaki Makita int len = peek_head_len(rnvq, sk);
102303088137SJason Wang
1024dc151282STonghao Zhang if (!len && rvq->busyloop_timeout) {
1025f5a4941aSJason Wang /* Flush batched heads first */
102609c32489SJason Wang vhost_net_signal_used(rnvq);
102703088137SJason Wang /* Both tx vq and rx socket were polled here */
1028dc151282STonghao Zhang vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
102903088137SJason Wang
103028b9b33bSToshiaki Makita len = peek_head_len(rnvq, sk);
103103088137SJason Wang }
103203088137SJason Wang
103303088137SJason Wang return len;
103403088137SJason Wang }
103503088137SJason Wang
10368dd014adSDavid Stevens /* This is a multi-buffer version of vhost_get_desc, that works if
10378dd014adSDavid Stevens * vq has read descriptors only.
10388dd014adSDavid Stevens * @vq - the relevant virtqueue
10398dd014adSDavid Stevens * @datalen - data length we'll be reading
10408dd014adSDavid Stevens * @iovcount - returned count of io vectors we fill
10418dd014adSDavid Stevens * @log - vhost log
10428dd014adSDavid Stevens * @log_num - log offset
104394249369SJason Wang * @quota - headcount quota, 1 for big buffer
10448dd014adSDavid Stevens * returns number of buffer heads allocated, negative on error
10458dd014adSDavid Stevens */
get_rx_bufs(struct vhost_virtqueue * vq,struct vring_used_elem * heads,int datalen,unsigned * iovcount,struct vhost_log * log,unsigned * log_num,unsigned int quota)10468dd014adSDavid Stevens static int get_rx_bufs(struct vhost_virtqueue *vq,
10478dd014adSDavid Stevens struct vring_used_elem *heads,
10488dd014adSDavid Stevens int datalen,
10498dd014adSDavid Stevens unsigned *iovcount,
10508dd014adSDavid Stevens struct vhost_log *log,
105194249369SJason Wang unsigned *log_num,
105294249369SJason Wang unsigned int quota)
10538dd014adSDavid Stevens {
10548dd014adSDavid Stevens unsigned int out, in;
10558dd014adSDavid Stevens int seg = 0;
10568dd014adSDavid Stevens int headcount = 0;
10578dd014adSDavid Stevens unsigned d;
10588dd014adSDavid Stevens int r, nlogs = 0;
10598b38694aSMichael S. Tsirkin /* len is always initialized before use since we are always called with
10608b38694aSMichael S. Tsirkin * datalen > 0.
10618b38694aSMichael S. Tsirkin */
10623f649ab7SKees Cook u32 len;
10638dd014adSDavid Stevens
106494249369SJason Wang while (datalen > 0 && headcount < quota) {
1065e0e9b406SJason Wang if (unlikely(seg >= UIO_MAXIOV)) {
10668dd014adSDavid Stevens r = -ENOBUFS;
10678dd014adSDavid Stevens goto err;
10688dd014adSDavid Stevens }
106947283befSMichael S. Tsirkin r = vhost_get_vq_desc(vq, vq->iov + seg,
10708dd014adSDavid Stevens ARRAY_SIZE(vq->iov) - seg, &out,
10718dd014adSDavid Stevens &in, log, log_num);
1072a39ee449SMichael S. Tsirkin if (unlikely(r < 0))
1073a39ee449SMichael S. Tsirkin goto err;
1074a39ee449SMichael S. Tsirkin
1075a39ee449SMichael S. Tsirkin d = r;
10768dd014adSDavid Stevens if (d == vq->num) {
10778dd014adSDavid Stevens r = 0;
10788dd014adSDavid Stevens goto err;
10798dd014adSDavid Stevens }
10808dd014adSDavid Stevens if (unlikely(out || in <= 0)) {
10818dd014adSDavid Stevens vq_err(vq, "unexpected descriptor format for RX: "
10828dd014adSDavid Stevens "out %d, in %d\n", out, in);
10838dd014adSDavid Stevens r = -EINVAL;
10848dd014adSDavid Stevens goto err;
10858dd014adSDavid Stevens }
10868dd014adSDavid Stevens if (unlikely(log)) {
10878dd014adSDavid Stevens nlogs += *log_num;
10888dd014adSDavid Stevens log += *log_num;
10898dd014adSDavid Stevens }
10908b38694aSMichael S. Tsirkin heads[headcount].id = cpu_to_vhost32(vq, d);
10918b38694aSMichael S. Tsirkin len = iov_length(vq->iov + seg, in);
10928b38694aSMichael S. Tsirkin heads[headcount].len = cpu_to_vhost32(vq, len);
10938b38694aSMichael S. Tsirkin datalen -= len;
10948dd014adSDavid Stevens ++headcount;
10958dd014adSDavid Stevens seg += in;
10968dd014adSDavid Stevens }
109799975cc6SMichael S. Tsirkin heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
10988dd014adSDavid Stevens *iovcount = seg;
10998dd014adSDavid Stevens if (unlikely(log))
11008dd014adSDavid Stevens *log_num = nlogs;
1101d8316f39SMichael S. Tsirkin
1102d8316f39SMichael S. Tsirkin /* Detect overrun */
1103d8316f39SMichael S. Tsirkin if (unlikely(datalen > 0)) {
1104d8316f39SMichael S. Tsirkin r = UIO_MAXIOV + 1;
1105d8316f39SMichael S. Tsirkin goto err;
1106d8316f39SMichael S. Tsirkin }
11078dd014adSDavid Stevens return headcount;
11088dd014adSDavid Stevens err:
11098dd014adSDavid Stevens vhost_discard_vq_desc(vq, headcount);
11108dd014adSDavid Stevens return r;
11118dd014adSDavid Stevens }
11128dd014adSDavid Stevens
11133a4d5c94SMichael S. Tsirkin /* Expects to be always run from workqueue - which acts as
11143a4d5c94SMichael S. Tsirkin * read-side critical section for our kind of RCU. */
111594249369SJason Wang static void handle_rx(struct vhost_net *net)
11163a4d5c94SMichael S. Tsirkin {
111781f95a55SMichael S. Tsirkin struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
111881f95a55SMichael S. Tsirkin struct vhost_virtqueue *vq = &nvq->vq;
11193f649ab7SKees Cook unsigned in, log;
11208dd014adSDavid Stevens struct vhost_log *vq_log;
11218dd014adSDavid Stevens struct msghdr msg = {
11228dd014adSDavid Stevens .msg_name = NULL,
11238dd014adSDavid Stevens .msg_namelen = 0,
11248dd014adSDavid Stevens .msg_control = NULL, /* FIXME: get and handle RX aux data. */
11258dd014adSDavid Stevens .msg_controllen = 0,
11268dd014adSDavid Stevens .msg_flags = MSG_DONTWAIT,
11278dd014adSDavid Stevens };
11280960b641SJason Wang struct virtio_net_hdr hdr = {
11290960b641SJason Wang .flags = 0,
11300960b641SJason Wang .gso_type = VIRTIO_NET_HDR_GSO_NONE
11318dd014adSDavid Stevens };
11328dd014adSDavid Stevens size_t total_len = 0;
1133910a578fSMichael S. Tsirkin int err, mergeable;
1134f5a4941aSJason Wang s16 headcount;
11358dd014adSDavid Stevens size_t vhost_hlen, sock_hlen;
11368dd014adSDavid Stevens size_t vhost_len, sock_len;
1137be294a51SToshiaki Makita bool busyloop_intr = false;
11382e26af79SAsias He struct socket *sock;
1139ba7438aeSAl Viro struct iov_iter fixup;
11400960b641SJason Wang __virtio16 num_buffers;
1141db688c24SPaolo Abeni int recv_pkts = 0;
11428dd014adSDavid Stevens
1143a6a67a2fSTonghao Zhang mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
1144247643f8SEugenio Pérez sock = vhost_vq_get_backend(vq);
11452e26af79SAsias He if (!sock)
11462e26af79SAsias He goto out;
11476b1e6cc7SJason Wang
11489b5e830bSJason Wang if (!vq_meta_prefetch(vq))
11496b1e6cc7SJason Wang goto out;
11506b1e6cc7SJason Wang
11518ea8cf89SMichael S. Tsirkin vhost_disable_notify(&net->dev, vq);
11528241a1e4SJason Wang vhost_net_disable_vq(net, vq);
11532e26af79SAsias He
115481f95a55SMichael S. Tsirkin vhost_hlen = nvq->vhost_hlen;
115581f95a55SMichael S. Tsirkin sock_hlen = nvq->sock_hlen;
11568dd014adSDavid Stevens
1157ea16c514SMichael S. Tsirkin vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
11588dd014adSDavid Stevens vq->log : NULL;
1159ea16c514SMichael S. Tsirkin mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
11608dd014adSDavid Stevens
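	/* Receive loop: peek the length of the next packet queued on the
	 * socket, reserve enough guest rx descriptors to hold it (plus the
	 * vhost-supplied header, if any), recvmsg() straight into the guest
	 * buffers, then fix up the virtio_net_hdr afterwards.
	 */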
1161e2412c07SJason Wang do {
1162e2412c07SJason Wang sock_len = vhost_net_rx_peek_head_len(net, sock->sk,
1163e2412c07SJason Wang &busyloop_intr);
1164e2412c07SJason Wang if (!sock_len)
1165e2412c07SJason Wang break;
11668dd014adSDavid Stevens sock_len += sock_hlen;
11678dd014adSDavid Stevens vhost_len = sock_len + vhost_hlen;
1168f5a4941aSJason Wang headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx,
1169f5a4941aSJason Wang vhost_len, &in, vq_log, &log,
117094249369SJason Wang likely(mergeable) ? UIO_MAXIOV : 1);
11718dd014adSDavid Stevens /* On error, stop handling until the next kick. */
11728dd014adSDavid Stevens if (unlikely(headcount < 0))
11738241a1e4SJason Wang goto out;
11748dd014adSDavid Stevens /* OK, now we need to know about added descriptors. */
11758dd014adSDavid Stevens if (!headcount) {
11766369fec5SToshiaki Makita if (unlikely(busyloop_intr)) {
11776369fec5SToshiaki Makita vhost_poll_queue(&vq->poll);
11786369fec5SToshiaki Makita } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
11798dd014adSDavid Stevens /* They have slipped one in as we were
11808dd014adSDavid Stevens * doing that: check again. */
11818ea8cf89SMichael S. Tsirkin vhost_disable_notify(&net->dev, vq);
11828dd014adSDavid Stevens continue;
11838dd014adSDavid Stevens }
11848dd014adSDavid Stevens /* Nothing new? Wait for eventfd to tell us
11858dd014adSDavid Stevens * they refilled. */
11868241a1e4SJason Wang goto out;
11878dd014adSDavid Stevens }
11886369fec5SToshiaki Makita busyloop_intr = false;
11895990a305SJason Wang if (nvq->rx_ring)
11906e474083SWei Xu msg.msg_control = vhost_net_buf_consume(&nvq->rxq);
11916e474083SWei Xu /* On overrun, truncate and discard */
11926e474083SWei Xu if (unlikely(headcount > UIO_MAXIOV)) {
1193de4eda9dSAl Viro iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, 1, 1);
11946e474083SWei Xu err = sock->ops->recvmsg(sock, &msg,
11956e474083SWei Xu 1, MSG_DONTWAIT | MSG_TRUNC);
11966e474083SWei Xu pr_debug("Discarded rx packet: len %zd\n", sock_len);
11976e474083SWei Xu continue;
11986e474083SWei Xu }
11998dd014adSDavid Stevens /* We don't need to be notified again. */
1200de4eda9dSAl Viro iov_iter_init(&msg.msg_iter, ITER_DEST, vq->iov, in, vhost_len);
1201ba7438aeSAl Viro fixup = msg.msg_iter;
1202ba7438aeSAl Viro if (unlikely((vhost_hlen))) {
1203ba7438aeSAl Viro /* We will supply the header ourselves
1204ba7438aeSAl Viro * TODO: support TSO.
1205ba7438aeSAl Viro */
1206ba7438aeSAl Viro iov_iter_advance(&msg.msg_iter, vhost_hlen);
1207ba7438aeSAl Viro }
12081b784140SYing Xue err = sock->ops->recvmsg(sock, &msg,
12098dd014adSDavid Stevens sock_len, MSG_DONTWAIT | MSG_TRUNC);
12108dd014adSDavid Stevens /* Userspace might have consumed the packet meanwhile:
12118dd014adSDavid Stevens * it's not supposed to do this usually, but might be hard
12128dd014adSDavid Stevens * to prevent. Discard data we got (if any) and keep going. */
12138dd014adSDavid Stevens if (unlikely(err != sock_len)) {
12148dd014adSDavid Stevens pr_debug("Discarded rx packet: "
12158dd014adSDavid Stevens " len %d, expected %zd\n", err, sock_len);
12168dd014adSDavid Stevens vhost_discard_vq_desc(vq, headcount);
12178dd014adSDavid Stevens continue;
12188dd014adSDavid Stevens }
1219ba7438aeSAl Viro /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
12204c5a8442SMichael S. Tsirkin if (unlikely(vhost_hlen)) {
12214c5a8442SMichael S. Tsirkin if (copy_to_iter(&hdr, sizeof(hdr),
12224c5a8442SMichael S. Tsirkin &fixup) != sizeof(hdr)) {
12234c5a8442SMichael S. Tsirkin vq_err(vq, "Unable to write vnet_hdr "
12244c5a8442SMichael S. Tsirkin "at addr %p\n", vq->iov->iov_base);
12258241a1e4SJason Wang goto out;
12268dd014adSDavid Stevens }
12274c5a8442SMichael S. Tsirkin } else {
12284c5a8442SMichael S. Tsirkin /* Header came from socket; we'll need to patch
12294c5a8442SMichael S. Tsirkin * ->num_buffers over if VIRTIO_NET_F_MRG_RXBUF
12304c5a8442SMichael S. Tsirkin */
12314c5a8442SMichael S. Tsirkin iov_iter_advance(&fixup, sizeof(hdr));
12324c5a8442SMichael S. Tsirkin }
12338dd014adSDavid Stevens /* TODO: Should check and handle checksum. */
12345201aa49SMichael S. Tsirkin
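		/* With VIRTIO_NET_F_MRG_RXBUF the header's num_buffers field
		 * must report how many descriptors this packet was scattered
		 * across; write it through the 'fixup' iterator saved before
		 * recvmsg() consumed msg.msg_iter.
		 */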
12350960b641SJason Wang num_buffers = cpu_to_vhost16(vq, headcount);
1236cfbdab95SJason Wang if (likely(mergeable) &&
12370d79a493SMichael S. Tsirkin copy_to_iter(&num_buffers, sizeof num_buffers,
12380d79a493SMichael S. Tsirkin &fixup) != sizeof num_buffers) {
12398dd014adSDavid Stevens vq_err(vq, "Failed num_buffers write");
12408dd014adSDavid Stevens vhost_discard_vq_desc(vq, headcount);
12418241a1e4SJason Wang goto out;
12428dd014adSDavid Stevens }
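		/* Used-ring updates are staged in vq->heads and flushed in
		 * batches of VHOST_NET_BATCH to amortize guest notifications;
		 * anything still pending is flushed at the 'out' label below.
		 */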
1243f5a4941aSJason Wang nvq->done_idx += headcount;
1244d0d86971SJason Wang if (nvq->done_idx > VHOST_NET_BATCH)
124509c32489SJason Wang vhost_net_signal_used(nvq);
12468dd014adSDavid Stevens if (unlikely(vq_log))
1247cc5e7107SJason Wang vhost_log_write(vq, vq_log, log, vhost_len,
1248cc5e7107SJason Wang vq->iov, in);
12498dd014adSDavid Stevens total_len += vhost_len;
1250e2412c07SJason Wang } while (likely(!vhost_exceeds_weight(vq, ++recv_pkts, total_len)));
1251e2412c07SJason Wang
1252be294a51SToshiaki Makita if (unlikely(busyloop_intr))
1253be294a51SToshiaki Makita vhost_poll_queue(&vq->poll);
1254e2412c07SJason Wang else if (!sock_len)
12558241a1e4SJason Wang vhost_net_enable_vq(net, vq);
12562e26af79SAsias He out:
125709c32489SJason Wang vhost_net_signal_used(nvq);
12588dd014adSDavid Stevens mutex_unlock(&vq->mutex);
12598dd014adSDavid Stevens }
12608dd014adSDavid Stevens
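/* Two flavours of work feed the same paths: the *_kick handlers run when the
 * guest kicks a virtqueue (its eventfd fires), while the *_net handlers run
 * when the backing socket's poll wakes us up.
 */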
1261c23f3445STejun Heo static void handle_tx_kick(struct vhost_work *work)
12623a4d5c94SMichael S. Tsirkin {
1263c23f3445STejun Heo struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
1264c23f3445STejun Heo poll.work);
1265c23f3445STejun Heo struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
1266c23f3445STejun Heo
12673a4d5c94SMichael S. Tsirkin handle_tx(net);
12683a4d5c94SMichael S. Tsirkin }
12693a4d5c94SMichael S. Tsirkin
1270c23f3445STejun Heo static void handle_rx_kick(struct vhost_work *work)
12713a4d5c94SMichael S. Tsirkin {
1272c23f3445STejun Heo struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
1273c23f3445STejun Heo poll.work);
1274c23f3445STejun Heo struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);
1275c23f3445STejun Heo
12763a4d5c94SMichael S. Tsirkin handle_rx(net);
12773a4d5c94SMichael S. Tsirkin }
12783a4d5c94SMichael S. Tsirkin
1279c23f3445STejun Heo static void handle_tx_net(struct vhost_work *work)
12803a4d5c94SMichael S. Tsirkin {
1281c23f3445STejun Heo struct vhost_net *net = container_of(work, struct vhost_net,
1282c23f3445STejun Heo poll[VHOST_NET_VQ_TX].work);
12833a4d5c94SMichael S. Tsirkin handle_tx(net);
12843a4d5c94SMichael S. Tsirkin }
12853a4d5c94SMichael S. Tsirkin
1286c23f3445STejun Heo static void handle_rx_net(struct vhost_work *work)
12873a4d5c94SMichael S. Tsirkin {
1288c23f3445STejun Heo struct vhost_net *net = container_of(work, struct vhost_net,
1289c23f3445STejun Heo poll[VHOST_NET_VQ_RX].work);
12903a4d5c94SMichael S. Tsirkin handle_rx(net);
12913a4d5c94SMichael S. Tsirkin }
12923a4d5c94SMichael S. Tsirkin
12933a4d5c94SMichael S. Tsirkin static int vhost_net_open(struct inode *inode, struct file *f)
12943a4d5c94SMichael S. Tsirkin {
129523cc5a99SMichael S. Tsirkin struct vhost_net *n;
1296c23f3445STejun Heo struct vhost_dev *dev;
12973ab2e420SAsias He struct vhost_virtqueue **vqs;
12985990a305SJason Wang void **queue;
12990a0be13bSJason Wang struct xdp_buff *xdp;
130059566b6eSZhi Yong Wu int i;
1301c23f3445STejun Heo
1302dcda9b04SMichal Hocko n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
13033a4d5c94SMichael S. Tsirkin if (!n)
13043a4d5c94SMichael S. Tsirkin return -ENOMEM;
13056da2ec56SKees Cook vqs = kmalloc_array(VHOST_NET_VQ_MAX, sizeof(*vqs), GFP_KERNEL);
13063ab2e420SAsias He if (!vqs) {
1307d04257b0SRomain Francoise kvfree(n);
13083ab2e420SAsias He return -ENOMEM;
13093ab2e420SAsias He }
1310c23f3445STejun Heo
1311d0d86971SJason Wang queue = kmalloc_array(VHOST_NET_BATCH, sizeof(void *),
1312c67df11fSJason Wang GFP_KERNEL);
1313c67df11fSJason Wang if (!queue) {
1314c67df11fSJason Wang kfree(vqs);
1315c67df11fSJason Wang kvfree(n);
1316c67df11fSJason Wang return -ENOMEM;
1317c67df11fSJason Wang }
1318c67df11fSJason Wang n->vqs[VHOST_NET_VQ_RX].rxq.queue = queue;
1319c67df11fSJason Wang
13200a0be13bSJason Wang xdp = kmalloc_array(VHOST_NET_BATCH, sizeof(*xdp), GFP_KERNEL);
13210a0be13bSJason Wang if (!xdp) {
13220a0be13bSJason Wang kfree(vqs);
13230a0be13bSJason Wang kvfree(n);
13240a0be13bSJason Wang kfree(queue);
13258a1aff14SDan Carpenter return -ENOMEM;
13260a0be13bSJason Wang }
13270a0be13bSJason Wang n->vqs[VHOST_NET_VQ_TX].xdp = xdp;
13280a0be13bSJason Wang
1329c23f3445STejun Heo dev = &n->dev;
13303ab2e420SAsias He vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
13313ab2e420SAsias He vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
13323ab2e420SAsias He n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
13333ab2e420SAsias He n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
13342839400fSAsias He for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
13352839400fSAsias He n->vqs[i].ubufs = NULL;
13362839400fSAsias He n->vqs[i].ubuf_info = NULL;
13372839400fSAsias He n->vqs[i].upend_idx = 0;
13382839400fSAsias He n->vqs[i].done_idx = 0;
13390a0be13bSJason Wang n->vqs[i].batched_xdp = 0;
134081f95a55SMichael S. Tsirkin n->vqs[i].vhost_hlen = 0;
134181f95a55SMichael S. Tsirkin n->vqs[i].sock_hlen = 0;
1342ab7e34b3SAlexander Potapenko n->vqs[i].rx_ring = NULL;
1343c67df11fSJason Wang vhost_net_buf_init(&n->vqs[i].rxq);
13442839400fSAsias He }
1345b46a0bf7SJason Wang vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX,
1346e82b9b07SJason Wang UIO_MAXIOV + VHOST_NET_BATCH,
134701fcb1cbSJason Wang VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
1348792a4f2eSJason Wang NULL);
13493a4d5c94SMichael S. Tsirkin
1350a9a08845SLinus Torvalds vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
1351a9a08845SLinus Torvalds vqs[VHOST_NET_VQ_TX]);
13523a4d5c94SMichael S. Tsirkin vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
13533a4d5c94SMichael S. Tsirkin vqs[VHOST_NET_VQ_RX]);
1354e4dab1e6SJason Wang
1355e4dab1e6SJason Wang f->private_data = n;
13563a4d5c94SMichael S. Tsirkin n->page_frag.page = NULL;
13573a4d5c94SMichael S. Tsirkin n->refcnt_bias = 0;
13583a4d5c94SMichael S. Tsirkin
13593a4d5c94SMichael S. Tsirkin return 0;
13603a4d5c94SMichael S. Tsirkin }
13613a4d5c94SMichael S. Tsirkin
13623a4d5c94SMichael S. Tsirkin static struct socket *vhost_net_stop_vq(struct vhost_net *n,
13633a4d5c94SMichael S. Tsirkin struct vhost_virtqueue *vq)
1364c67df11fSJason Wang {
1365c67df11fSJason Wang struct socket *sock;
13663a4d5c94SMichael S. Tsirkin struct vhost_net_virtqueue *nvq =
13673a4d5c94SMichael S. Tsirkin container_of(vq, struct vhost_net_virtqueue, vq);
1368247643f8SEugenio Pérez
13693a4d5c94SMichael S. Tsirkin mutex_lock(&vq->mutex);
1370247643f8SEugenio Pérez sock = vhost_vq_get_backend(vq);
1371c67df11fSJason Wang vhost_net_disable_vq(n, vq);
1372303fd71bSJason Wang vhost_vq_set_backend(vq, NULL);
13733a4d5c94SMichael S. Tsirkin vhost_net_buf_unproduce(nvq);
13743a4d5c94SMichael S. Tsirkin nvq->rx_ring = NULL;
13753a4d5c94SMichael S. Tsirkin mutex_unlock(&vq->mutex);
13763a4d5c94SMichael S. Tsirkin return sock;
13773a4d5c94SMichael S. Tsirkin }
13783a4d5c94SMichael S. Tsirkin
13793a4d5c94SMichael S. Tsirkin static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
13803ab2e420SAsias He struct socket **rx_sock)
13813ab2e420SAsias He {
13823a4d5c94SMichael S. Tsirkin *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
13833a4d5c94SMichael S. Tsirkin *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
13843a4d5c94SMichael S. Tsirkin }
13853a4d5c94SMichael S. Tsirkin
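/* Flush all queued vhost work; for zerocopy TX, additionally wait for every
 * outstanding lower-device DMA. While tx_flush is set the TX path is expected
 * to avoid starting new zerocopy transmissions.
 */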
1386b2ffa407SMike Christie static void vhost_net_flush(struct vhost_net *n)
13872839400fSAsias He {
13883ab2e420SAsias He vhost_dev_flush(&n->dev);
13891280c27fSMichael S. Tsirkin if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
13903ab2e420SAsias He mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13911280c27fSMichael S. Tsirkin n->tx_flush = true;
1392fe729a57SAsias He mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13933ab2e420SAsias He /* Wait for all lower device DMAs done. */
13941280c27fSMichael S. Tsirkin vhost_net_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
13950ad8b480SMichael S. Tsirkin mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13963ab2e420SAsias He n->tx_flush = false;
13971280c27fSMichael S. Tsirkin atomic_set(&n->vqs[VHOST_NET_VQ_TX].ubufs->refcount, 1);
13983a4d5c94SMichael S. Tsirkin mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
13993a4d5c94SMichael S. Tsirkin }
14003a4d5c94SMichael S. Tsirkin }
14013a4d5c94SMichael S. Tsirkin
14023a4d5c94SMichael S. Tsirkin static int vhost_net_release(struct inode *inode, struct file *f)
14033a4d5c94SMichael S. Tsirkin {
14043a4d5c94SMichael S. Tsirkin struct vhost_net *n = f->private_data;
14053a4d5c94SMichael S. Tsirkin struct socket *tx_sock;
14063a4d5c94SMichael S. Tsirkin struct socket *rx_sock;
14073a4d5c94SMichael S. Tsirkin
1408b211616dSMichael S. Tsirkin vhost_net_stop(n, &tx_sock, &rx_sock);
1409f6f93f75S夷则(Caspar) vhost_net_flush(n);
141081f95a55SMichael S. Tsirkin vhost_dev_stop(&n->dev);
14113a4d5c94SMichael S. Tsirkin vhost_dev_cleanup(&n->dev);
141209aaacf0SAl Viro vhost_net_vq_reset(n);
14133a4d5c94SMichael S. Tsirkin if (tx_sock)
141409aaacf0SAl Viro sockfd_put(tx_sock);
1415b0c057caSMichael S. Tsirkin if (rx_sock)
1416d05faa5fSPaul E. McKenney sockfd_put(rx_sock);
14173a4d5c94SMichael S. Tsirkin /* Make sure no callbacks are outstanding */
14183a4d5c94SMichael S. Tsirkin synchronize_rcu();
14193a4d5c94SMichael S. Tsirkin /* We do an extra flush before freeing memory,
1420c67df11fSJason Wang * since jobs can re-queue themselves. */
14210a0be13bSJason Wang vhost_net_flush(n);
14223ab2e420SAsias He kfree(n->vqs[VHOST_NET_VQ_RX].rxq.queue);
1423e4dab1e6SJason Wang kfree(n->vqs[VHOST_NET_VQ_TX].xdp);
1424e4dab1e6SJason Wang kfree(n->dev.vqs);
1425d04257b0SRomain Francoise if (n->page_frag.page)
14263a4d5c94SMichael S. Tsirkin __page_frag_cache_drain(n->page_frag.page, n->refcnt_bias);
14273a4d5c94SMichael S. Tsirkin kvfree(n);
14283a4d5c94SMichael S. Tsirkin return 0;
14293a4d5c94SMichael S. Tsirkin }
14303a4d5c94SMichael S. Tsirkin
14319b2c45d4SDenys Vlasenko static struct socket *get_raw_socket(int fd)
14323a4d5c94SMichael S. Tsirkin {
1433d47effe1SKrishna Kumar int r;
14343a4d5c94SMichael S. Tsirkin struct socket *sock = sockfd_lookup(fd, &r);
14353a4d5c94SMichael S. Tsirkin
14363a4d5c94SMichael S. Tsirkin if (!sock)
14373a4d5c94SMichael S. Tsirkin return ERR_PTR(-ENOTSOCK);
14383a4d5c94SMichael S. Tsirkin
14393a4d5c94SMichael S. Tsirkin /* Parameter checking */
14403a4d5c94SMichael S. Tsirkin if (sock->sk->sk_type != SOCK_RAW) {
14413a4d5c94SMichael S. Tsirkin r = -ESOCKTNOSUPPORT;
14423a4d5c94SMichael S. Tsirkin goto err;
144342d84c84SEugenio Pérez }
14443a4d5c94SMichael S. Tsirkin
14453a4d5c94SMichael S. Tsirkin if (sock->sk->sk_family != AF_PACKET) {
14463a4d5c94SMichael S. Tsirkin r = -EPFNOSUPPORT;
14473a4d5c94SMichael S. Tsirkin goto err;
14483a4d5c94SMichael S. Tsirkin }
144909aaacf0SAl Viro return sock;
14503a4d5c94SMichael S. Tsirkin err:
14513a4d5c94SMichael S. Tsirkin sockfd_put(sock);
14523a4d5c94SMichael S. Tsirkin return ERR_PTR(r);
1453fb4554c2SAl Viro }
1454c67df11fSJason Wang
14555990a305SJason Wang static struct ptr_ring *get_tap_ptr_ring(struct file *file)
14565990a305SJason Wang {
14575990a305SJason Wang struct ptr_ring *ring;
1458c67df11fSJason Wang ring = tun_get_tx_ring(file);
14595990a305SJason Wang if (!IS_ERR(ring))
14605990a305SJason Wang goto out;
1461c67df11fSJason Wang ring = tap_get_ptr_ring(file);
14625990a305SJason Wang if (!IS_ERR(ring))
1463c67df11fSJason Wang goto out;
14645990a305SJason Wang ring = NULL;
1465c67df11fSJason Wang out:
1466c67df11fSJason Wang return ring;
1467501c774cSArnd Bergmann }
14683a4d5c94SMichael S. Tsirkin
14693a4d5c94SMichael S. Tsirkin static struct socket *get_tap_socket(int fd)
14703a4d5c94SMichael S. Tsirkin {
1471d47effe1SKrishna Kumar struct file *file = fget(fd);
14723a4d5c94SMichael S. Tsirkin struct socket *sock;
14733a4d5c94SMichael S. Tsirkin
14743a4d5c94SMichael S. Tsirkin if (!file)
1475501c774cSArnd Bergmann return ERR_PTR(-EBADF);
1476501c774cSArnd Bergmann sock = tun_get_socket(file);
1477635b8c8eSSainath Grandhi if (!IS_ERR(sock))
14783a4d5c94SMichael S. Tsirkin return sock;
14793a4d5c94SMichael S. Tsirkin sock = tap_get_socket(file);
14803a4d5c94SMichael S. Tsirkin if (IS_ERR(sock))
14813a4d5c94SMichael S. Tsirkin fput(file);
14823a4d5c94SMichael S. Tsirkin return sock;
14833a4d5c94SMichael S. Tsirkin }
14843a4d5c94SMichael S. Tsirkin
14853a4d5c94SMichael S. Tsirkin static struct socket *get_socket(int fd)
1486d47effe1SKrishna Kumar {
14873a4d5c94SMichael S. Tsirkin struct socket *sock;
14883a4d5c94SMichael S. Tsirkin
14893a4d5c94SMichael S. Tsirkin /* special case to disable backend */
14903a4d5c94SMichael S. Tsirkin if (fd == -1)
14913a4d5c94SMichael S. Tsirkin return NULL;
14923a4d5c94SMichael S. Tsirkin sock = get_raw_socket(fd);
1493501c774cSArnd Bergmann if (!IS_ERR(sock))
14943a4d5c94SMichael S. Tsirkin return sock;
14953a4d5c94SMichael S. Tsirkin sock = get_tap_socket(fd);
14963a4d5c94SMichael S. Tsirkin if (!IS_ERR(sock))
14973a4d5c94SMichael S. Tsirkin return sock;
14983a4d5c94SMichael S. Tsirkin return ERR_PTR(-ENOTSOCK);
14993a4d5c94SMichael S. Tsirkin }
15003a4d5c94SMichael S. Tsirkin
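/* The backend fd passed to VHOST_NET_SET_BACKEND may be a raw AF_PACKET
 * socket, a tun/tap fd or a macvtap fd; fd == -1 detaches the current
 * backend.
 */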
15013a4d5c94SMichael S. Tsirkin static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
15023a4d5c94SMichael S. Tsirkin {
15032839400fSAsias He struct socket *sock, *oldsock;
1504fe729a57SAsias He struct vhost_virtqueue *vq;
15053a4d5c94SMichael S. Tsirkin struct vhost_net_virtqueue *nvq;
15063a4d5c94SMichael S. Tsirkin struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
15073a4d5c94SMichael S. Tsirkin int r;
15083a4d5c94SMichael S. Tsirkin
15093a4d5c94SMichael S. Tsirkin mutex_lock(&n->dev.mutex);
15103a4d5c94SMichael S. Tsirkin r = vhost_dev_check_owner(&n->dev);
15113a4d5c94SMichael S. Tsirkin if (r)
15123a4d5c94SMichael S. Tsirkin goto err;
15133a4d5c94SMichael S. Tsirkin
15143a4d5c94SMichael S. Tsirkin if (index >= VHOST_NET_VQ_MAX) {
15153a4d5c94SMichael S. Tsirkin r = -ENOBUFS;
15163ab2e420SAsias He goto err;
15172839400fSAsias He }
15183a4d5c94SMichael S. Tsirkin vq = &n->vqs[index].vq;
15193a4d5c94SMichael S. Tsirkin nvq = &n->vqs[index];
15209526f9a2SEric Auger mutex_lock(&vq->mutex);
15219526f9a2SEric Auger
15229526f9a2SEric Auger if (fd == -1)
15233a4d5c94SMichael S. Tsirkin vhost_clear_msg(&n->dev);
15243a4d5c94SMichael S. Tsirkin
15253a4d5c94SMichael S. Tsirkin /* Verify that ring has been setup correctly. */
15261dace8c8SJeff Dike if (!vhost_vq_access_ok(vq)) {
15273a4d5c94SMichael S. Tsirkin r = -EFAULT;
15283a4d5c94SMichael S. Tsirkin goto err_vq;
15293a4d5c94SMichael S. Tsirkin }
15303a4d5c94SMichael S. Tsirkin sock = get_socket(fd);
15311dace8c8SJeff Dike if (IS_ERR(sock)) {
15323a4d5c94SMichael S. Tsirkin r = PTR_ERR(sock);
15333a4d5c94SMichael S. Tsirkin goto err_vq;
15343a4d5c94SMichael S. Tsirkin }
1535247643f8SEugenio Pérez
1536dd1f4078SJeff Dike /* start polling new socket */
1537fe729a57SAsias He oldsock = vhost_vq_get_backend(vq);
1538fe729a57SAsias He if (sock != oldsock) {
1539bab632d6SMichael S. Tsirkin ubufs = vhost_net_ubuf_alloc(vq,
1540bab632d6SMichael S. Tsirkin sock && vhost_sock_zcopy(sock));
1541bab632d6SMichael S. Tsirkin if (IS_ERR(ubufs)) {
1542bab632d6SMichael S. Tsirkin r = PTR_ERR(ubufs);
1543692a998bSJason Wang goto err_ubufs;
15443a4d5c94SMichael S. Tsirkin }
1545247643f8SEugenio Pérez
1546c67df11fSJason Wang vhost_net_disable_vq(n, vq);
154780f7d030SGreg Kurz vhost_vq_set_backend(vq, sock);
1548f59281daSJason Wang vhost_net_buf_unproduce(nvq);
1549692a998bSJason Wang r = vhost_vq_init_access(vq);
15502b8b328bSJason Wang if (r)
15512b8b328bSJason Wang goto err_used;
15522b8b328bSJason Wang r = vhost_net_enable_vq(n, vq);
1553fb4554c2SAl Viro if (r)
1554fb4554c2SAl Viro goto err_used;
1555fb4554c2SAl Viro if (index == VHOST_NET_VQ_RX) {
1556fb4554c2SAl Viro if (sock)
1557fb4554c2SAl Viro nvq->rx_ring = get_tap_ptr_ring(sock->file);
1558fb4554c2SAl Viro else
1559692a998bSJason Wang nvq->rx_ring = NULL;
15602839400fSAsias He }
15612839400fSAsias He
156264e9a9b8SMichael S. Tsirkin oldubufs = nvq->ubufs;
156364e9a9b8SMichael S. Tsirkin nvq->ubufs = ubufs;
156464e9a9b8SMichael S. Tsirkin
15651280c27fSMichael S. Tsirkin n->tx_packets = 0;
1566dd1f4078SJeff Dike n->tx_zcopy_err = 0;
1567dd1f4078SJeff Dike n->tx_flush = false;
15681680e906SMichael S. Tsirkin }
15691680e906SMichael S. Tsirkin
1570c047e5f3SMichael S. Tsirkin mutex_unlock(&vq->mutex);
1571c38e39c3SMichael S. Tsirkin
1572c047e5f3SMichael S. Tsirkin if (oldubufs) {
1573eaae8132SMichael S. Tsirkin vhost_net_ubuf_put_wait_and_free(oldubufs);
1574c047e5f3SMichael S. Tsirkin mutex_lock(&vq->mutex);
1575c047e5f3SMichael S. Tsirkin vhost_zerocopy_signal_used(n, vq);
1576bab632d6SMichael S. Tsirkin mutex_unlock(&vq->mutex);
15773a4d5c94SMichael S. Tsirkin }
1578b2ffa407SMike Christie
157909aaacf0SAl Viro if (oldsock) {
15803a4d5c94SMichael S. Tsirkin vhost_dev_flush(&n->dev);
15811dace8c8SJeff Dike sockfd_put(oldsock);
15821680e906SMichael S. Tsirkin }
15831680e906SMichael S. Tsirkin
15841680e906SMichael S. Tsirkin mutex_unlock(&n->dev.mutex);
1585692a998bSJason Wang return 0;
1586247643f8SEugenio Pérez
1587692a998bSJason Wang err_used:
1588692a998bSJason Wang vhost_vq_set_backend(vq, oldsock);
1589c38e39c3SMichael S. Tsirkin vhost_net_enable_vq(n, vq);
1590bab632d6SMichael S. Tsirkin if (ubufs)
1591b8f1f658SJason Wang vhost_net_ubuf_put_wait_and_free(ubufs);
159209aaacf0SAl Viro err_ubufs:
15931dace8c8SJeff Dike if (sock)
15941dace8c8SJeff Dike sockfd_put(sock);
15953a4d5c94SMichael S. Tsirkin err_vq:
15963a4d5c94SMichael S. Tsirkin mutex_unlock(&vq->mutex);
15973a4d5c94SMichael S. Tsirkin err:
15983a4d5c94SMichael S. Tsirkin mutex_unlock(&n->dev.mutex);
15993a4d5c94SMichael S. Tsirkin return r;
16003a4d5c94SMichael S. Tsirkin }
16013a4d5c94SMichael S. Tsirkin
16023a4d5c94SMichael S. Tsirkin static long vhost_net_reset_owner(struct vhost_net *n)
16033a4d5c94SMichael S. Tsirkin {
16043a4d5c94SMichael S. Tsirkin struct socket *tx_sock = NULL;
16050bbe3066SJason Wang struct socket *rx_sock = NULL;
1606d47effe1SKrishna Kumar long err;
16073a4d5c94SMichael S. Tsirkin struct vhost_iotlb *umem;
16083a4d5c94SMichael S. Tsirkin
16093a4d5c94SMichael S. Tsirkin mutex_lock(&n->dev.mutex);
16103a4d5c94SMichael S. Tsirkin err = vhost_dev_check_owner(&n->dev);
1611a9709d68SJason Wang if (err)
1612a9709d68SJason Wang goto done;
1613150b9e51SMichael S. Tsirkin umem = vhost_dev_reset_owner_prepare();
1614150b9e51SMichael S. Tsirkin if (!umem) {
1615150b9e51SMichael S. Tsirkin err = -ENOMEM;
16163a4d5c94SMichael S. Tsirkin goto done;
16173a4d5c94SMichael S. Tsirkin }
16184cd87951SJason Wang vhost_net_stop(n, &tx_sock, &rx_sock);
1619a9709d68SJason Wang vhost_net_flush(n);
162081f95a55SMichael S. Tsirkin vhost_dev_stop(&n->dev);
16213a4d5c94SMichael S. Tsirkin vhost_dev_reset_owner(&n->dev, umem);
16223a4d5c94SMichael S. Tsirkin vhost_net_vq_reset(n);
16233a4d5c94SMichael S. Tsirkin done:
162409aaacf0SAl Viro mutex_unlock(&n->dev.mutex);
16253a4d5c94SMichael S. Tsirkin if (tx_sock)
162609aaacf0SAl Viro sockfd_put(tx_sock);
16273a4d5c94SMichael S. Tsirkin if (rx_sock)
16283a4d5c94SMichael S. Tsirkin sockfd_put(rx_sock);
16293a4d5c94SMichael S. Tsirkin return err;
16303a4d5c94SMichael S. Tsirkin }
16313a4d5c94SMichael S. Tsirkin
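/* Feature bits decide who supplies the virtio_net_hdr: with
 * VHOST_NET_F_VIRTIO_NET_HDR vhost itself prepends/strips the header
 * (vhost_hlen != 0) and the socket carries bare packets; otherwise the
 * socket provides the header (sock_hlen != 0). Mergeable rx buffers or
 * VIRTIO_F_VERSION_1 imply the larger virtio_net_hdr_mrg_rxbuf layout.
 */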
16328dd014adSDavid Stevens static int vhost_net_set_features(struct vhost_net *n, u64 features)
16333a4d5c94SMichael S. Tsirkin {
16348dd014adSDavid Stevens size_t vhost_hlen, sock_hlen, hdr_len;
1635e4fca7d6SMichael S. Tsirkin int i;
1636e4fca7d6SMichael S. Tsirkin
16378dd014adSDavid Stevens hdr_len = (features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
16388dd014adSDavid Stevens (1ULL << VIRTIO_F_VERSION_1))) ?
16398dd014adSDavid Stevens sizeof(struct virtio_net_hdr_mrg_rxbuf) :
16408dd014adSDavid Stevens sizeof(struct virtio_net_hdr);
16418dd014adSDavid Stevens if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
16428dd014adSDavid Stevens /* vhost provides vnet_hdr */
16438dd014adSDavid Stevens vhost_hlen = hdr_len;
16448dd014adSDavid Stevens sock_hlen = 0;
16458dd014adSDavid Stevens } else {
16468dd014adSDavid Stevens /* socket provides vnet_hdr */
16478dd014adSDavid Stevens vhost_hlen = 0;
16483a4d5c94SMichael S. Tsirkin sock_hlen = hdr_len;
16493a4d5c94SMichael S. Tsirkin }
16506b1e6cc7SJason Wang mutex_lock(&n->dev.mutex);
16516b1e6cc7SJason Wang if ((features & (1 << VHOST_F_LOG_ALL)) &&
16526b1e6cc7SJason Wang !vhost_log_access_ok(&n->dev))
1653321bd212SMichael S. Tsirkin goto out_unlock;
1654759aba1eSLiming Wu
16556b1e6cc7SJason Wang if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) {
16563a4d5c94SMichael S. Tsirkin if (vhost_init_device_iotlb(&n->dev))
16576b1e6cc7SJason Wang goto out_unlock;
16583a4d5c94SMichael S. Tsirkin }
16593ab2e420SAsias He
1660ea16c514SMichael S. Tsirkin for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
166181f95a55SMichael S. Tsirkin mutex_lock(&n->vqs[i].vq.mutex);
166281f95a55SMichael S. Tsirkin n->vqs[i].vq.acked_features = features;
16633ab2e420SAsias He n->vqs[i].vhost_hlen = vhost_hlen;
16643a4d5c94SMichael S. Tsirkin n->vqs[i].sock_hlen = sock_hlen;
16653a4d5c94SMichael S. Tsirkin mutex_unlock(&n->vqs[i].vq.mutex);
16663a4d5c94SMichael S. Tsirkin }
16676b1e6cc7SJason Wang mutex_unlock(&n->dev.mutex);
16686b1e6cc7SJason Wang return 0;
16696b1e6cc7SJason Wang
16706b1e6cc7SJason Wang out_unlock:
16713a4d5c94SMichael S. Tsirkin mutex_unlock(&n->dev.mutex);
16723a4d5c94SMichael S. Tsirkin return -EFAULT;
1673b1ad8496SAsias He }
1674b1ad8496SAsias He
1675b1ad8496SAsias He static long vhost_net_set_owner(struct vhost_net *n)
1676b1ad8496SAsias He {
1677b1ad8496SAsias He int r;
167805c05351SMichael S. Tsirkin
167905c05351SMichael S. Tsirkin mutex_lock(&n->dev.mutex);
168005c05351SMichael S. Tsirkin if (vhost_dev_has_owner(&n->dev)) {
168105c05351SMichael S. Tsirkin r = -EBUSY;
1682b1ad8496SAsias He goto out;
1683b1ad8496SAsias He }
1684b1ad8496SAsias He r = vhost_net_set_ubuf_info(n);
1685b1ad8496SAsias He if (r)
1686b1ad8496SAsias He goto out;
1687b1ad8496SAsias He r = vhost_dev_set_owner(&n->dev);
1688b1ad8496SAsias He if (r)
1689b1ad8496SAsias He vhost_net_clear_ubuf_info(n);
1690b1ad8496SAsias He vhost_net_flush(n);
1691b1ad8496SAsias He out:
1692b1ad8496SAsias He mutex_unlock(&n->dev.mutex);
1693b1ad8496SAsias He return r;
16943a4d5c94SMichael S. Tsirkin }
16953a4d5c94SMichael S. Tsirkin
16963a4d5c94SMichael S. Tsirkin static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
16973a4d5c94SMichael S. Tsirkin unsigned long arg)
16983a4d5c94SMichael S. Tsirkin {
16993a4d5c94SMichael S. Tsirkin struct vhost_net *n = f->private_data;
17003a4d5c94SMichael S. Tsirkin void __user *argp = (void __user *)arg;
17013a4d5c94SMichael S. Tsirkin u64 __user *featurep = argp;
17023a4d5c94SMichael S. Tsirkin struct vhost_vring_file backend;
1703d47effe1SKrishna Kumar u64 features;
17043a4d5c94SMichael S. Tsirkin int r;
17053a4d5c94SMichael S. Tsirkin
1706d3553a52STakuya Yoshikawa switch (ioctl) {
1707d3553a52STakuya Yoshikawa case VHOST_NET_SET_BACKEND:
17083a4d5c94SMichael S. Tsirkin if (copy_from_user(&backend, argp, sizeof backend))
17093a4d5c94SMichael S. Tsirkin return -EFAULT;
17100dd05a3bSStefan Hajnoczi return vhost_net_set_backend(n, backend.index, backend.fd);
1711d3553a52STakuya Yoshikawa case VHOST_GET_FEATURES:
1712d3553a52STakuya Yoshikawa features = VHOST_NET_FEATURES;
1713d3553a52STakuya Yoshikawa if (copy_to_user(featurep, &features, sizeof features))
17143a4d5c94SMichael S. Tsirkin return -EFAULT;
1715d3553a52STakuya Yoshikawa return 0;
1716d3553a52STakuya Yoshikawa case VHOST_SET_FEATURES:
17170dd05a3bSStefan Hajnoczi if (copy_from_user(&features, featurep, sizeof features))
17183a4d5c94SMichael S. Tsirkin return -EFAULT;
17193a4d5c94SMichael S. Tsirkin if (features & ~VHOST_NET_FEATURES)
1720429711aeSJason Wang return -EOPNOTSUPP;
1721429711aeSJason Wang return vhost_net_set_features(n, features);
1722429711aeSJason Wang case VHOST_GET_BACKEND_FEATURES:
1723429711aeSJason Wang features = VHOST_NET_BACKEND_FEATURES;
1724429711aeSJason Wang if (copy_to_user(featurep, &features, sizeof(features)))
1725429711aeSJason Wang return -EFAULT;
1726429711aeSJason Wang return 0;
1727429711aeSJason Wang case VHOST_SET_BACKEND_FEATURES:
1728429711aeSJason Wang if (copy_from_user(&features, featurep, sizeof(features)))
1729429711aeSJason Wang return -EFAULT;
1730460f7ce1SJason Wang if (features & ~VHOST_NET_BACKEND_FEATURES)
1731460f7ce1SJason Wang return -EOPNOTSUPP;
17323a4d5c94SMichael S. Tsirkin vhost_set_backend_features(&n->dev, features);
17333a4d5c94SMichael S. Tsirkin return 0;
1734b1ad8496SAsias He case VHOST_RESET_OWNER:
1735b1ad8496SAsias He return vhost_net_reset_owner(n);
17363a4d5c94SMichael S. Tsirkin case VHOST_SET_OWNER:
17373a4d5c94SMichael S. Tsirkin return vhost_net_set_owner(n);
1738935cdee7SMichael S. Tsirkin default:
1739935cdee7SMichael S. Tsirkin mutex_lock(&n->dev.mutex);
1740935cdee7SMichael S. Tsirkin r = vhost_dev_ioctl(&n->dev, ioctl, argp);
1741935cdee7SMichael S. Tsirkin if (r == -ENOIOCTLCMD)
17423a4d5c94SMichael S. Tsirkin r = vhost_vring_ioctl(&n->dev, ioctl, argp);
17433a4d5c94SMichael S. Tsirkin else
17443a4d5c94SMichael S. Tsirkin vhost_net_flush(n);
17453a4d5c94SMichael S. Tsirkin mutex_unlock(&n->dev.mutex);
17463a4d5c94SMichael S. Tsirkin return r;
17473a4d5c94SMichael S. Tsirkin }
17486b1e6cc7SJason Wang }
17496b1e6cc7SJason Wang
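/* A minimal, illustrative userspace sequence for this ioctl interface (not
 * taken from any particular VMM; 'tap_fd' stands for an already-configured
 * tun/tap descriptor, and vring/memory setup via VHOST_SET_VRING_* and
 * VHOST_SET_MEM_TABLE plus all error handling are omitted):
 *
 *	int vhost = open("/dev/vhost-net", O_RDWR);
 *	__u64 features;
 *
 *	ioctl(vhost, VHOST_SET_OWNER, 0);
 *	ioctl(vhost, VHOST_GET_FEATURES, &features);
 *	ioctl(vhost, VHOST_SET_FEATURES, &features);
 *
 *	// ... program the vrings, then attach the backend per queue:
 *	struct vhost_vring_file backend = { .index = 0, .fd = tap_fd };
 *	ioctl(vhost, VHOST_NET_SET_BACKEND, &backend);	// RX (index 0)
 *	backend.index = 1;
 *	ioctl(vhost, VHOST_NET_SET_BACKEND, &backend);	// TX (index 1)
 */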
17506b1e6cc7SJason Wang static ssize_t vhost_net_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
17516b1e6cc7SJason Wang {
17526b1e6cc7SJason Wang struct file *file = iocb->ki_filp;
17536b1e6cc7SJason Wang struct vhost_net *n = file->private_data;
17546b1e6cc7SJason Wang struct vhost_dev *dev = &n->dev;
17556b1e6cc7SJason Wang int noblock = file->f_flags & O_NONBLOCK;
17566b1e6cc7SJason Wang
17576b1e6cc7SJason Wang return vhost_chr_read_iter(dev, to, noblock);
17586b1e6cc7SJason Wang }
17596b1e6cc7SJason Wang
17606b1e6cc7SJason Wang static ssize_t vhost_net_chr_write_iter(struct kiocb *iocb,
17616b1e6cc7SJason Wang struct iov_iter *from)
17626b1e6cc7SJason Wang {
17636b1e6cc7SJason Wang struct file *file = iocb->ki_filp;
17646b1e6cc7SJason Wang struct vhost_net *n = file->private_data;
17656b1e6cc7SJason Wang struct vhost_dev *dev = &n->dev;
17666b1e6cc7SJason Wang
17676b1e6cc7SJason Wang return vhost_chr_write_iter(dev, from);
1768afc9a42bSAl Viro }
17696b1e6cc7SJason Wang
17706b1e6cc7SJason Wang static __poll_t vhost_net_chr_poll(struct file *file, poll_table *wait)
17716b1e6cc7SJason Wang {
17726b1e6cc7SJason Wang struct vhost_net *n = file->private_data;
17736b1e6cc7SJason Wang struct vhost_dev *dev = &n->dev;
17746b1e6cc7SJason Wang
17756b1e6cc7SJason Wang return vhost_chr_poll(file, dev, wait);
1776373a83a6STobias Klauser }
17773a4d5c94SMichael S. Tsirkin
17783a4d5c94SMichael S. Tsirkin static const struct file_operations vhost_net_fops = {
17796b1e6cc7SJason Wang .owner = THIS_MODULE,
17806b1e6cc7SJason Wang .release = vhost_net_release,
17816b1e6cc7SJason Wang .read_iter = vhost_net_chr_read_iter,
17823a4d5c94SMichael S. Tsirkin .write_iter = vhost_net_chr_write_iter,
1783407e9ef7SArnd Bergmann .poll = vhost_net_chr_poll,
17843a4d5c94SMichael S. Tsirkin .unlocked_ioctl = vhost_net_ioctl,
17856038f373SArnd Bergmann .compat_ioctl = compat_ptr_ioctl,
17863a4d5c94SMichael S. Tsirkin .open = vhost_net_open,
17873a4d5c94SMichael S. Tsirkin .llseek = noop_llseek,
17883a4d5c94SMichael S. Tsirkin };
17897c7c7f01Sstephen hemminger
17907c7c7f01Sstephen hemminger static struct miscdevice vhost_net_misc = {
17917c7c7f01Sstephen hemminger .minor = VHOST_NET_MINOR,
17923a4d5c94SMichael S. Tsirkin .name = "vhost-net",
17933a4d5c94SMichael S. Tsirkin .fops = &vhost_net_fops,
1794078adb3bSXiu Jianfeng };
17953a4d5c94SMichael S. Tsirkin
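/* Exposed as a misc character device: each open() of /dev/vhost-net creates
 * an independent vhost_net instance with its own TX/RX virtqueues.
 */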
1796bab632d6SMichael S. Tsirkin static int __init vhost_net_init(void)
1797fe729a57SAsias He {
1798c23f3445STejun Heo if (experimental_zcopytx)
17993a4d5c94SMichael S. Tsirkin vhost_net_enable_zcopy(VHOST_NET_VQ_TX);
18003a4d5c94SMichael S. Tsirkin return misc_register(&vhost_net_misc);
18013a4d5c94SMichael S. Tsirkin }
1802078adb3bSXiu Jianfeng module_init(vhost_net_init);
18033a4d5c94SMichael S. Tsirkin
18043a4d5c94SMichael S. Tsirkin static void __exit vhost_net_exit(void)
18053a4d5c94SMichael S. Tsirkin {
18063a4d5c94SMichael S. Tsirkin misc_deregister(&vhost_net_misc);
18073a4d5c94SMichael S. Tsirkin }
18083a4d5c94SMichael S. Tsirkin module_exit(vhost_net_exit);
18093a4d5c94SMichael S. Tsirkin
18103a4d5c94SMichael S. Tsirkin MODULE_VERSION("0.0.1");
18113a4d5c94SMichael S. Tsirkin MODULE_LICENSE("GPL v2");
18127c7c7f01Sstephen hemminger MODULE_AUTHOR("Michael S. Tsirkin");
18137c7c7f01Sstephen hemminger MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
1814 MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
1815 MODULE_ALIAS("devname:vhost-net");
1816