/*
 * VDUSE (vDPA Device in Userspace) library
 *
 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
 *   Portions of code and concepts borrowed from libvhost-user.c, so:
 *     Copyright IBM, Corp. 2007
 *     Copyright (c) 2016 Red Hat, Inc.
 *
 * Authors:
 *   Xie Yongji <xieyongji@bytedance.com>
 *   Anthony Liguori <aliguori@us.ibm.com>
 *   Marc-André Lureau <mlureau@redhat.com>
 *   Victor Kaplansky <victork@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
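
/*
 * Minimal usage sketch (illustrative only; error handling and the exact
 * event-loop wiring are assumptions about the caller, not part of this
 * library):
 *
 *     static void enable_cb(VduseDev *dev, VduseVirtq *vq)
 *     {
 *         // start polling vduse_queue_get_fd(vq); on each kick, drain
 *         // the eventfd and process requests with vduse_queue_pop()
 *     }
 *
 *     static void disable_cb(VduseDev *dev, VduseVirtq *vq)
 *     {
 *         // stop polling vduse_queue_get_fd(vq)
 *     }
 *
 *     static const VduseOps ops = {
 *         .enable_queue = enable_cb,
 *         .disable_queue = disable_cb,
 *     };
 *
 *     VduseDev *dev = vduse_dev_create("my-dev", device_id, vendor_id,
 *                                      vduse_get_virtio_features() |
 *                                      device_features, num_queues,
 *                                      config_size, config, &ops, NULL);
 *     for (int i = 0; i < num_queues; i++) {
 *         vduse_dev_setup_queue(dev, i, queue_size);
 *     }
 *     // poll vduse_dev_get_fd(dev) and call vduse_dev_handler(dev)
 *     // whenever it becomes readable; tear down with vduse_dev_destroy().
 */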

#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <endian.h>
#include <unistd.h>
#include <limits.h>
#include <fcntl.h>
#include <inttypes.h>

#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <sys/mman.h>

#include "include/atomic.h"
#include "linux-headers/linux/virtio_ring.h"
#include "linux-headers/linux/virtio_config.h"
#include "linux-headers/linux/vduse.h"
#include "libvduse.h"

#define VDUSE_VQ_ALIGN 4096
#define MAX_IOVA_REGIONS 256

/* Round number down to multiple */
#define ALIGN_DOWN(n, m) ((n) / (m) * (m))

/* Round number up to multiple */
#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
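
/*
 * Worked example: ALIGN_DOWN(4097, 4096) == 4096 and
 * ALIGN_UP(4097, 4096) == 8192.  Both assume m > 0 and, for ALIGN_UP,
 * that n + m - 1 does not overflow the operand type.
 */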

#ifndef unlikely
#define unlikely(x)   __builtin_expect(!!(x), 0)
#endif

typedef struct VduseRing {
    unsigned int num;
    uint64_t desc_addr;
    uint64_t avail_addr;
    uint64_t used_addr;
    struct vring_desc *desc;
    struct vring_avail *avail;
    struct vring_used *used;
} VduseRing;

struct VduseVirtq {
    VduseRing vring;
    uint16_t last_avail_idx;
    uint16_t shadow_avail_idx;
    uint16_t used_idx;
    uint16_t signalled_used;
    bool signalled_used_valid;
    int index;
    int inuse;
    bool ready;
    int fd;
    VduseDev *dev;
};

typedef struct VduseIovaRegion {
    uint64_t iova;
    uint64_t size;
    uint64_t mmap_offset;
    uint64_t mmap_addr;
} VduseIovaRegion;

struct VduseDev {
    VduseVirtq *vqs;
    VduseIovaRegion regions[MAX_IOVA_REGIONS];
    int num_regions;
    char *name;
    uint32_t device_id;
    uint32_t vendor_id;
    uint16_t num_queues;
    uint16_t queue_size;
    uint64_t features;
    const VduseOps *ops;
    int fd;
    int ctrl_fd;
    void *priv;
};

static inline bool has_feature(uint64_t features, unsigned int fbit)
{
    assert(fbit < 64);
    return !!(features & (1ULL << fbit));
}

static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
{
    return has_feature(dev->features, fbit);
}

uint64_t vduse_get_virtio_features(void)
{
    return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
           (1ULL << VIRTIO_F_VERSION_1) |
           (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
           (1ULL << VIRTIO_RING_F_EVENT_IDX) |
           (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
}

VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
{
    return vq->dev;
}

int vduse_queue_get_fd(VduseVirtq *vq)
{
    return vq->fd;
}

void *vduse_dev_get_priv(VduseDev *dev)
{
    return dev->priv;
}

VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
{
    return &dev->vqs[index];
}

int vduse_dev_get_fd(VduseDev *dev)
{
    return dev->fd;
}

static int vduse_inject_irq(VduseDev *dev, int index)
{
    return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
}

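/*
 * Unmap every cached region that is fully contained in [start, last].
 * Regions that merely overlap the range are left in place; stale mappings
 * are refreshed lazily by iova_to_va() on the next lookup miss.
 */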
static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
                                     uint64_t last)
{
    int i;

    if (last == start) {
        return;
    }

    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
        if (!dev->regions[i].mmap_addr) {
            continue;
        }

        if (start <= dev->regions[i].iova &&
            last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
            munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
                   dev->regions[i].mmap_offset + dev->regions[i].size);
            dev->regions[i].mmap_addr = 0;
            dev->num_regions--;
        }
    }
}

static int vduse_iova_add_region(VduseDev *dev, int fd,
                                 uint64_t offset, uint64_t start,
                                 uint64_t last, int prot)
{
    int i;
    uint64_t size = last - start + 1;
    void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);

    if (mmap_addr == MAP_FAILED) {
        close(fd);
        return -EINVAL;
    }

    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
        if (!dev->regions[i].mmap_addr) {
            dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
            dev->regions[i].mmap_offset = offset;
            dev->regions[i].iova = start;
            dev->regions[i].size = size;
            dev->num_regions++;
            break;
        }
    }
    assert(i < MAX_IOVA_REGIONS);
    close(fd);

    return 0;
}

static int perm_to_prot(uint8_t perm)
{
    int prot = 0;

    switch (perm) {
    case VDUSE_ACCESS_WO:
        prot |= PROT_WRITE;
        break;
    case VDUSE_ACCESS_RO:
        prot |= PROT_READ;
        break;
    case VDUSE_ACCESS_RW:
        prot |= PROT_READ | PROT_WRITE;
        break;
    default:
        break;
    }

    return prot;
}

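/*
 * Translate a device IOVA to a local virtual address.  On a hit, *plen is
 * clamped so the returned buffer never crosses the end of the region.  On
 * a miss, VDUSE_IOTLB_GET_FD asks the kernel for a file descriptor backing
 * the IOTLB entry containing the address; the entry is mmap()ed, cached in
 * dev->regions[] by vduse_iova_add_region(), and the lookup is retried.
 */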
static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
{
    int i, ret;
    struct vduse_iotlb_entry entry;

    for (i = 0; i < MAX_IOVA_REGIONS; i++) {
        VduseIovaRegion *r = &dev->regions[i];

        if (!r->mmap_addr) {
            continue;
        }

        if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
            if ((iova + *plen) > (r->iova + r->size)) {
                *plen = r->iova + r->size - iova;
            }
            return (void *)(uintptr_t)(iova - r->iova +
                   r->mmap_addr + r->mmap_offset);
        }
    }

    entry.start = iova;
    entry.last = iova + 1;
    ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
    if (ret < 0) {
        return NULL;
    }

    if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
                               entry.last, perm_to_prot(entry.perm))) {
        return iova_to_va(dev, plen, iova);
    }

    return NULL;
}

static inline uint16_t vring_avail_flags(VduseVirtq *vq)
{
    return le16toh(vq->vring.avail->flags);
}

static inline uint16_t vring_avail_idx(VduseVirtq *vq)
{
    vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);

    return vq->shadow_avail_idx;
}

static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
{
    return le16toh(vq->vring.avail->ring[i]);
}

static inline uint16_t vring_get_used_event(VduseVirtq *vq)
{
    return vring_avail_ring(vq, vq->vring.num);
}

static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
                                 unsigned int *head)
{
    /*
     * Grab the next descriptor number they're advertising, and increment
     * the index we've seen.
     */
    *head = vring_avail_ring(vq, idx % vq->vring.num);

    /* If their number is silly, that's a fatal mistake. */
    if (*head >= vq->vring.num) {
        fprintf(stderr, "Guest says index %u is available\n", *head);
        return false;
    }

    return true;
}

static int
vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
                               uint64_t addr, size_t len)
{
    struct vring_desc *ori_desc;
    uint64_t read_len;

    if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
        return -1;
    }

    if (len == 0) {
        return -1;
    }

    while (len) {
        read_len = len;
        ori_desc = iova_to_va(dev, &read_len, addr);
        if (!ori_desc) {
            return -1;
        }

        memcpy(desc, ori_desc, read_len);
        len -= read_len;
        addr += read_len;
        /* read_len is a byte count, so advance desc by bytes, not elements */
        desc = (struct vring_desc *)((char *)desc + read_len);
    }

    return 0;
}

enum {
    VIRTQUEUE_READ_DESC_ERROR = -1,
    VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
    VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
};

static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
                                      unsigned int max, unsigned int *next)
{
    /* If this descriptor says it doesn't chain, we're done. */
    if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
        return VIRTQUEUE_READ_DESC_DONE;
    }

    /* Check they're not leading us off end of descriptors. */
    *next = le16toh(desc[i].next);
    /* Make sure compiler knows to grab that: we don't want it changing! */
    smp_wmb();

    if (*next >= max) {
        fprintf(stderr, "Desc next is %u\n", *next);
        return VIRTQUEUE_READ_DESC_ERROR;
    }

    return VIRTQUEUE_READ_DESC_MORE;
}

/*
 * Fetch avail_idx from VQ memory only when we really need to know if
 * guest has added some buffers.
 */
static bool vduse_queue_empty(VduseVirtq *vq)
{
    if (unlikely(!vq->vring.avail)) {
        return true;
    }

    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return false;
    }

    return vring_avail_idx(vq) == vq->last_avail_idx;
}

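/*
 * With VIRTIO_RING_F_EVENT_IDX, the driver publishes a "used event" index
 * and only wants an interrupt once the device's used index moves past it.
 * Example: if used_event is 5, old is 4 and new is 6, the event index was
 * crossed and vring_need_event() returns true; if new is only 5, the
 * notification is suppressed.
 */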
static bool vduse_queue_should_notify(VduseVirtq *vq)
{
    VduseDev *dev = vq->dev;
    uint16_t old, new;
    bool v;

    /* We need to expose used array entries before checking used event. */
    smp_mb();

    /* Always notify when queue is empty (if the feature is negotiated) */
    if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
        !vq->inuse && vduse_queue_empty(vq)) {
        return true;
    }

    if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
    }

    v = vq->signalled_used_valid;
    vq->signalled_used_valid = true;
    old = vq->signalled_used;
    new = vq->signalled_used = vq->used_idx;
    return !v || vring_need_event(vring_get_used_event(vq), new, old);
}

void vduse_queue_notify(VduseVirtq *vq)
{
    VduseDev *dev = vq->dev;

    if (unlikely(!vq->vring.avail)) {
        return;
    }

    if (!vduse_queue_should_notify(vq)) {
        return;
    }

    if (vduse_inject_irq(dev, vq->index) < 0) {
        fprintf(stderr, "Failed to inject irq for vq %d: %s\n",
                vq->index, strerror(errno));
    }
}

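/*
 * With VIRTIO_RING_F_EVENT_IDX, the split ring reserves one extra uint16_t
 * right after the last used ring entry (used->ring[num]) for the "avail
 * event": the next available index the device wants to be kicked for.
 */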
static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
{
    *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
}

static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
                                   struct iovec *iov, unsigned int max_num_sg,
                                   bool is_write, uint64_t pa, size_t sz)
{
    unsigned num_sg = *p_num_sg;
    VduseDev *dev = vq->dev;

    assert(num_sg <= max_num_sg);

    if (!sz) {
        fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
        return false;
    }

    while (sz) {
        uint64_t len = sz;

        if (num_sg == max_num_sg) {
            fprintf(stderr,
                    "virtio: too many descriptors in indirect table\n");
            return false;
        }

        iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
        if (iov[num_sg].iov_base == NULL) {
            fprintf(stderr, "virtio: invalid address for buffers\n");
            return false;
        }
        iov[num_sg++].iov_len = len;
        sz -= len;
        pa += len;
    }

    *p_num_sg = num_sg;
    return true;
}

static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
                                       unsigned in_num)
{
    VduseVirtqElement *elem;
    size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
    size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
    size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);

    assert(sz >= sizeof(VduseVirtqElement));
    elem = malloc(out_sg_end);
    if (!elem) {
        return NULL;
    }
    elem->out_num = out_num;
    elem->in_num = in_num;
    elem->in_sg = (void *)elem + in_sg_ofs;
    elem->out_sg = (void *)elem + out_sg_ofs;
    return elem;
}

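/*
 * Walk one descriptor chain starting at idx and build a VduseVirtqElement.
 * VRING_DESC_F_INDIRECT redirects the walk into a driver-provided table of
 * descriptors; a contiguous zero-copy mapping is tried first and, if the
 * table spans multiple IOVA regions, it is bounce-copied into desc_buf.
 */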
static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
{
    struct vring_desc *desc = vq->vring.desc;
    VduseDev *dev = vq->dev;
    uint64_t desc_addr, read_len;
    unsigned int desc_len;
    unsigned int max = vq->vring.num;
    unsigned int i = idx;
    VduseVirtqElement *elem;
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
    unsigned int out_num = 0, in_num = 0;
    int rc;

    if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
        if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
            fprintf(stderr, "Invalid size for indirect buffer table\n");
            return NULL;
        }

        /* loop over the indirect descriptor table */
        desc_addr = le64toh(desc[i].addr);
        desc_len = le32toh(desc[i].len);
        max = desc_len / sizeof(struct vring_desc);
        read_len = desc_len;
        desc = iova_to_va(dev, &read_len, desc_addr);
        if (unlikely(desc && read_len != desc_len)) {
            /* Failed to use zero copy */
            desc = NULL;
            if (!vduse_queue_read_indirect_desc(dev, desc_buf,
                                                desc_addr,
                                                desc_len)) {
                desc = desc_buf;
            }
        }
        if (!desc) {
            fprintf(stderr, "Invalid indirect buffer table\n");
            return NULL;
        }
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
            if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
                                             VIRTQUEUE_MAX_SIZE - out_num,
                                             true, le64toh(desc[i].addr),
                                             le32toh(desc[i].len))) {
                return NULL;
            }
        } else {
            if (in_num) {
                fprintf(stderr, "Incorrect order for descriptors\n");
                return NULL;
            }
            if (!vduse_queue_map_single_desc(vq, &out_num, iov,
                                             VIRTQUEUE_MAX_SIZE, false,
                                             le64toh(desc[i].addr),
                                             le32toh(desc[i].len))) {
                return NULL;
            }
        }

        /* If we've got too many, that implies a descriptor loop. */
        if ((in_num + out_num) > max) {
            fprintf(stderr, "Looped descriptor\n");
            return NULL;
        }
        rc = vduse_queue_read_next_desc(desc, i, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        fprintf(stderr, "read descriptor error\n");
        return NULL;
    }

    /* Now copy what we have collected and mapped */
    elem = vduse_queue_alloc_element(sz, out_num, in_num);
    if (!elem) {
        fprintf(stderr, "Failed to allocate virtqueue element\n");
        return NULL;
    }
    elem->index = idx;
    for (i = 0; i < out_num; i++) {
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_sg[i] = iov[out_num + i];
    }

    return elem;
}

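/*
 * Typical consumption loop in a kick handler (illustrative sketch; the
 * element is heap-allocated by vduse_queue_alloc_element() and owned by
 * the caller, so it must be free()d after use; response_len is whatever
 * the device wrote into the in_sg buffers):
 *
 *     VduseVirtqElement *elem;
 *
 *     while ((elem = vduse_queue_pop(vq, sizeof(*elem)))) {
 *         // read the request from elem->out_sg[0..out_num) and write
 *         // the response into elem->in_sg[0..in_num)
 *         vduse_queue_push(vq, elem, response_len);
 *         free(elem);
 *     }
 *     vduse_queue_notify(vq);
 */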
void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
{
    unsigned int head;
    VduseVirtqElement *elem;
    VduseDev *dev = vq->dev;

    if (unlikely(!vq->vring.avail)) {
        return NULL;
    }

    if (vduse_queue_empty(vq)) {
        return NULL;
    }
    /* Needed after vduse_queue_empty() */
    smp_rmb();

    if (vq->inuse >= vq->vring.num) {
        fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
        return NULL;
    }

    if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
        return NULL;
    }

    if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx);
    }

    elem = vduse_queue_map_desc(vq, head, sz);

    if (!elem) {
        return NULL;
    }

    vq->inuse++;

    return elem;
}

static inline void vring_used_write(VduseVirtq *vq,
                                    struct vring_used_elem *uelem, int i)
{
    struct vring_used *used = vq->vring.used;

    used->ring[i] = *uelem;
}

static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
                             unsigned int len, unsigned int idx)
{
    struct vring_used_elem uelem;

    if (unlikely(!vq->vring.used)) {
        return;
    }

    idx = (idx + vq->used_idx) % vq->vring.num;

    uelem.id = htole32(elem->index);
    uelem.len = htole32(len);
    vring_used_write(vq, &uelem, idx);
}

static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
{
    vq->vring.used->idx = htole16(val);
    vq->used_idx = val;
}

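/*
 * Publish "count" filled entries by advancing used->idx.  The signed/
 * unsigned comparison below is 16-bit modular arithmetic: it detects
 * whether the previously signalled index was passed by the window
 * (old, new] just published, in which case the cached signalling state
 * is stale and must be invalidated so the next
 * vduse_queue_should_notify() raises an interrupt.
 */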
static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
{
    uint16_t old, new;

    if (unlikely(!vq->vring.used)) {
        return;
    }

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    old = vq->used_idx;
    new = old + count;
    vring_used_idx_set(vq, new);
    vq->inuse -= count;
    if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
        vq->signalled_used_valid = false;
    }
}

void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
                      unsigned int len)
{
    vduse_queue_fill(vq, elem, len, 0);
    vduse_queue_flush(vq, 1);
}

static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
                                    uint64_t avail_addr, uint64_t used_addr)
{
    struct VduseDev *dev = vq->dev;
    uint64_t len;

    len = sizeof(struct vring_desc);
    vq->vring.desc = iova_to_va(dev, &len, desc_addr);
    if (len != sizeof(struct vring_desc)) {
        return -EINVAL;
    }

    len = sizeof(struct vring_avail);
    vq->vring.avail = iova_to_va(dev, &len, avail_addr);
    if (len != sizeof(struct vring_avail)) {
        return -EINVAL;
    }

    len = sizeof(struct vring_used);
    vq->vring.used = iova_to_va(dev, &len, used_addr);
    if (len != sizeof(struct vring_used)) {
        return -EINVAL;
    }

    if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
        fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
        return -EINVAL;
    }

    return 0;
}

static void vduse_queue_enable(VduseVirtq *vq)
{
    struct VduseDev *dev = vq->dev;
    struct vduse_vq_info vq_info;
    struct vduse_vq_eventfd vq_eventfd;
    int fd;

    vq_info.index = vq->index;
    if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
        fprintf(stderr, "Failed to get vq[%d] info: %s\n",
                vq->index, strerror(errno));
        return;
    }

    if (!vq_info.ready) {
        return;
    }

    vq->vring.num = vq_info.num;
    vq->vring.desc_addr = vq_info.desc_addr;
    vq->vring.avail_addr = vq_info.driver_addr;
    vq->vring.used_addr = vq_info.device_addr;

    if (vduse_queue_update_vring(vq, vq_info.desc_addr,
                                 vq_info.driver_addr, vq_info.device_addr)) {
        fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
        return;
    }

    fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
    if (fd < 0) {
        fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
        return;
    }

    vq_eventfd.index = vq->index;
    vq_eventfd.fd = fd;
    if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
        fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
        close(fd);
        return;
    }

    vq->fd = fd;
    vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index;
    vq->inuse = 0;
    vq->used_idx = 0;
    vq->signalled_used_valid = false;
    vq->ready = true;

    dev->ops->enable_queue(dev, vq);
}

static void vduse_queue_disable(VduseVirtq *vq)
{
    struct VduseDev *dev = vq->dev;
    struct vduse_vq_eventfd eventfd;

    if (!vq->ready) {
        return;
    }

    dev->ops->disable_queue(dev, vq);

    eventfd.index = vq->index;
    eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
    ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
    close(vq->fd);

    assert(vq->inuse == 0);

    vq->vring.num = 0;
    vq->vring.desc_addr = 0;
    vq->vring.avail_addr = 0;
    vq->vring.used_addr = 0;
    vq->vring.desc = 0;
    vq->vring.avail = 0;
    vq->vring.used = 0;
    vq->ready = false;
    vq->fd = -1;
}

static void vduse_dev_start_dataplane(VduseDev *dev)
{
    int i;

    if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
        fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
        return;
    }
    assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));

    for (i = 0; i < dev->num_queues; i++) {
        vduse_queue_enable(&dev->vqs[i]);
    }
}

static void vduse_dev_stop_dataplane(VduseDev *dev)
{
    int i;

    for (i = 0; i < dev->num_queues; i++) {
        vduse_queue_disable(&dev->vqs[i]);
    }
    dev->features = 0;
    vduse_iova_remove_region(dev, 0, ULONG_MAX);
}

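/*
 * Service one control message from the VDUSE character device.  The caller
 * is expected to invoke this whenever vduse_dev_get_fd() polls readable;
 * each message is answered with a vduse_dev_response carrying the same
 * request_id.
 */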
int vduse_dev_handler(VduseDev *dev)
{
    struct vduse_dev_request req;
    struct vduse_dev_response resp = { 0 };
    VduseVirtq *vq;
    int i, ret;

    ret = read(dev->fd, &req, sizeof(req));
    if (ret != sizeof(req)) {
        fprintf(stderr, "Read request error [%d]: %s\n",
                ret, strerror(errno));
        return -errno;
    }
    resp.request_id = req.request_id;

    switch (req.type) {
    case VDUSE_GET_VQ_STATE:
        vq = &dev->vqs[req.vq_state.index];
        resp.vq_state.split.avail_index = vq->last_avail_idx;
        resp.result = VDUSE_REQ_RESULT_OK;
        break;
    case VDUSE_SET_STATUS:
        if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
            vduse_dev_start_dataplane(dev);
        } else if (req.s.status == 0) {
            vduse_dev_stop_dataplane(dev);
        }
        resp.result = VDUSE_REQ_RESULT_OK;
        break;
    case VDUSE_UPDATE_IOTLB:
        /* The iova will be updated by iova_to_va() later, so just remove it */
        vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
        for (i = 0; i < dev->num_queues; i++) {
            VduseVirtq *vq = &dev->vqs[i];
            if (vq->ready) {
                if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
                                             vq->vring.avail_addr,
                                             vq->vring.used_addr)) {
                    fprintf(stderr, "Failed to update vring for vq[%d]\n",
                            vq->index);
                }
            }
        }
        resp.result = VDUSE_REQ_RESULT_OK;
        break;
    default:
        resp.result = VDUSE_REQ_RESULT_FAILED;
        break;
    }

    ret = write(dev->fd, &resp, sizeof(resp));
    if (ret != sizeof(resp)) {
        fprintf(stderr, "Write request %d error [%d]: %s\n",
                req.type, ret, strerror(errno));
        return -errno;
    }
    return 0;
}

int vduse_dev_update_config(VduseDev *dev, uint32_t size,
                            uint32_t offset, char *buffer)
{
    int ret;
    struct vduse_config_data *data;

    data = malloc(offsetof(struct vduse_config_data, buffer) + size);
    if (!data) {
        return -ENOMEM;
    }

    data->offset = offset;
    data->length = size;
    memcpy(data->buffer, buffer, size);

    ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
    free(data);

    if (ret) {
        return -errno;
    }

    if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
        return -errno;
    }

    return 0;
}

int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
{
    VduseVirtq *vq = &dev->vqs[index];
    struct vduse_vq_config vq_config = { 0 };

    if (max_size > VIRTQUEUE_MAX_SIZE) {
        return -EINVAL;
    }

    vq_config.index = vq->index;
    vq_config.max_size = max_size;

    if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
        return -errno;
    }

    return 0;
}

static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
{
    VduseVirtq *vqs;
    int i;

    vqs = calloc(num_queues, sizeof(VduseVirtq));
    if (!vqs) {
        return -ENOMEM;
    }

    for (i = 0; i < num_queues; i++) {
        vqs[i].index = i;
        vqs[i].dev = dev;
        vqs[i].fd = -1;
    }
    dev->vqs = vqs;

    return 0;
}

static int vduse_dev_init(VduseDev *dev, const char *name,
                          uint16_t num_queues, const VduseOps *ops,
                          void *priv)
{
    char *dev_path, *dev_name;
    int ret, fd;

    dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
    if (!dev_path) {
        return -ENOMEM;
    }
    sprintf(dev_path, "/dev/vduse/%s", name);

    fd = open(dev_path, O_RDWR);
    free(dev_path);
    if (fd < 0) {
        fprintf(stderr, "Failed to open vduse dev %s: %s\n",
                name, strerror(errno));
        return -errno;
    }

    dev_name = strdup(name);
    if (!dev_name) {
        close(fd);
        return -ENOMEM;
    }

    ret = vduse_dev_init_vqs(dev, num_queues);
    if (ret) {
        free(dev_name);
        close(fd);
        return ret;
    }

    dev->name = dev_name;
    dev->num_queues = num_queues;
    dev->fd = fd;
    dev->ops = ops;
    dev->priv = priv;

    return 0;
}

static inline bool vduse_name_is_invalid(const char *name)
{
    return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
}

VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
                                 const VduseOps *ops, void *priv)
{
    VduseDev *dev;
    int ret;

    if (!ops || !ops->enable_queue || !ops->disable_queue) {
        fprintf(stderr, "Invalid parameter for vduse\n");
        return NULL;
    }

    dev = calloc(1, sizeof(VduseDev));
    if (!dev) {
        fprintf(stderr, "Failed to allocate vduse device\n");
        return NULL;
    }

    ret = vduse_dev_init_vqs(dev, num_queues);
    if (ret) {
        fprintf(stderr, "Failed to init vqs\n");
        free(dev);
        return NULL;
    }

    dev->num_queues = num_queues;
    dev->fd = fd;
    /* No control fd here: the device lifetime is managed by the caller */
    dev->ctrl_fd = -1;
    dev->ops = ops;
    dev->priv = priv;

    return dev;
}

VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
                                   const VduseOps *ops, void *priv)
{
    VduseDev *dev;
    int ret;

    if (!name || vduse_name_is_invalid(name) || !ops ||
        !ops->enable_queue || !ops->disable_queue) {
        fprintf(stderr, "Invalid parameter for vduse\n");
        return NULL;
    }

    dev = calloc(1, sizeof(VduseDev));
    if (!dev) {
        fprintf(stderr, "Failed to allocate vduse device\n");
        return NULL;
    }

    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
    if (ret < 0) {
        fprintf(stderr, "Failed to init vduse device %s: %s\n",
                name, strerror(-ret));
        free(dev);
        return NULL;
    }
    /* No control fd here: the device was created externally */
    dev->ctrl_fd = -1;

    return dev;
}

VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
                           uint32_t vendor_id, uint64_t features,
                           uint16_t num_queues, uint32_t config_size,
                           char *config, const VduseOps *ops, void *priv)
{
    VduseDev *dev;
    int ret, ctrl_fd;
    uint64_t version;
    struct vduse_dev_config *dev_config;
    size_t size = offsetof(struct vduse_dev_config, config);

    if (!name || vduse_name_is_invalid(name) ||
        !has_feature(features, VIRTIO_F_VERSION_1) || !config ||
        !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
        fprintf(stderr, "Invalid parameter for vduse\n");
        return NULL;
    }

    dev = calloc(1, sizeof(VduseDev));
    if (!dev) {
        fprintf(stderr, "Failed to allocate vduse device\n");
        return NULL;
    }

    ctrl_fd = open("/dev/vduse/control", O_RDWR);
    if (ctrl_fd < 0) {
        fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
                strerror(errno));
        goto err_ctrl;
    }

    version = VDUSE_API_VERSION;
    if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
        fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
                version, strerror(errno));
        goto err_dev;
    }

    dev_config = calloc(1, size + config_size);
    if (!dev_config) {
        fprintf(stderr, "Failed to allocate config space\n");
        goto err_dev;
    }

    strcpy(dev_config->name, name);
    dev_config->device_id = device_id;
    dev_config->vendor_id = vendor_id;
    dev_config->features = features;
    dev_config->vq_num = num_queues;
    dev_config->vq_align = VDUSE_VQ_ALIGN;
    dev_config->config_size = config_size;
    memcpy(dev_config->config, config, config_size);

    ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
    free(dev_config);
    if (ret < 0) {
        fprintf(stderr, "Failed to create vduse device %s: %s\n",
                name, strerror(errno));
        goto err_dev;
    }
    dev->ctrl_fd = ctrl_fd;

    ret = vduse_dev_init(dev, name, num_queues, ops, priv);
    if (ret < 0) {
        fprintf(stderr, "Failed to init vduse device %s: %s\n",
                name, strerror(-ret));
        goto err;
    }

    return dev;
err:
    ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
err_dev:
    close(ctrl_fd);
err_ctrl:
    free(dev);

    return NULL;
}

int vduse_dev_destroy(VduseDev *dev)
{
    int ret = 0;

    free(dev->vqs);
    if (dev->fd >= 0) {
        close(dev->fd);
        dev->fd = -1;
    }
    if (dev->ctrl_fd >= 0) {
        if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
            ret = -errno;
        }
        close(dev->ctrl_fd);
        dev->ctrl_fd = -1;
    }
    free(dev->name);
    free(dev);

    return ret;
}