1 /*
2  * VDUSE (vDPA Device in Userspace) library
3  *
4  * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
5  *   Portions of codes and concepts borrowed from libvhost-user.c, so:
6  *     Copyright IBM, Corp. 2007
7  *     Copyright (c) 2016 Red Hat, Inc.
8  *
9  * Author:
10  *   Xie Yongji <xieyongji@bytedance.com>
11  *   Anthony Liguori <aliguori@us.ibm.com>
12  *   Marc-André Lureau <mlureau@redhat.com>
13  *   Victor Kaplansky <victork@redhat.com>
14  *
15  * This work is licensed under the terms of the GNU GPL, version 2 or
16  * later.  See the COPYING file in the top-level directory.
17  */
18 
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <stdbool.h>
22 #include <stddef.h>
23 #include <errno.h>
24 #include <string.h>
25 #include <assert.h>
26 #include <endian.h>
27 #include <unistd.h>
28 #include <limits.h>
29 #include <fcntl.h>
30 #include <inttypes.h>
31 
32 #include <sys/ioctl.h>
33 #include <sys/eventfd.h>
34 #include <sys/mman.h>
35 
36 #include "include/atomic.h"
37 #include "linux-headers/linux/virtio_ring.h"
38 #include "linux-headers/linux/virtio_config.h"
39 #include "linux-headers/linux/vduse.h"
40 #include "libvduse.h"
41 
42 #define VDUSE_VQ_ALIGN 4096
43 #define MAX_IOVA_REGIONS 256
44 
45 #define LOG_ALIGNMENT 64
46 
47 /* Round number down to multiple */
48 #define ALIGN_DOWN(n, m) ((n) / (m) * (m))
49 
50 /* Round number up to multiple */
51 #define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m))
52 
53 #ifndef unlikely
54 #define unlikely(x)   __builtin_expect(!!(x), 0)
55 #endif
56 
57 typedef struct VduseDescStateSplit {
58     uint8_t inflight;
59     uint8_t padding[5];
60     uint16_t next;
61     uint64_t counter;
62 } VduseDescStateSplit;
63 
64 typedef struct VduseVirtqLogInflight {
65     uint64_t features;
66     uint16_t version;
67     uint16_t desc_num;
68     uint16_t last_batch_head;
69     uint16_t used_idx;
70     VduseDescStateSplit desc[];
71 } VduseVirtqLogInflight;
72 
73 typedef struct VduseVirtqLog {
74     VduseVirtqLogInflight inflight;
75 } VduseVirtqLog;
76 
77 typedef struct VduseVirtqInflightDesc {
78     uint16_t index;
79     uint64_t counter;
80 } VduseVirtqInflightDesc;
81 
82 typedef struct VduseRing {
83     unsigned int num;
84     uint64_t desc_addr;
85     uint64_t avail_addr;
86     uint64_t used_addr;
87     struct vring_desc *desc;
88     struct vring_avail *avail;
89     struct vring_used *used;
90 } VduseRing;
91 
92 struct VduseVirtq {
93     VduseRing vring;
94     uint16_t last_avail_idx;
95     uint16_t shadow_avail_idx;
96     uint16_t used_idx;
97     uint16_t signalled_used;
98     bool signalled_used_valid;
99     int index;
100     int inuse;
101     bool ready;
102     int fd;
103     VduseDev *dev;
104     VduseVirtqInflightDesc *resubmit_list;
105     uint16_t resubmit_num;
106     uint64_t counter;
107     VduseVirtqLog *log;
108 };
109 
110 typedef struct VduseIovaRegion {
111     uint64_t iova;
112     uint64_t size;
113     uint64_t mmap_offset;
114     uint64_t mmap_addr;
115 } VduseIovaRegion;
116 
117 struct VduseDev {
118     VduseVirtq *vqs;
119     VduseIovaRegion regions[MAX_IOVA_REGIONS];
120     int num_regions;
121     char *name;
122     uint32_t device_id;
123     uint32_t vendor_id;
124     uint16_t num_queues;
125     uint16_t queue_size;
126     uint64_t features;
127     const VduseOps *ops;
128     int fd;
129     int ctrl_fd;
130     void *priv;
131     void *log;
132 };
133 
134 static inline size_t vduse_vq_log_size(uint16_t queue_size)
135 {
136     return ALIGN_UP(sizeof(VduseDescStateSplit) * queue_size +
137                     sizeof(VduseVirtqLogInflight), LOG_ALIGNMENT);
138 }
139 
140 static void *vduse_log_get(const char *filename, size_t size)
141 {
142     void *ptr = MAP_FAILED;
143     int fd;
144 
145     fd = open(filename, O_RDWR | O_CREAT, 0600);
146     if (fd == -1) {
147         return MAP_FAILED;
148     }
149 
150     if (ftruncate(fd, size) == -1) {
151         goto out;
152     }
153 
154     ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
155 
156 out:
157     close(fd);
158     return ptr;
159 }
160 
161 static inline bool has_feature(uint64_t features, unsigned int fbit)
162 {
163     assert(fbit < 64);
164     return !!(features & (1ULL << fbit));
165 }
166 
167 static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit)
168 {
169     return has_feature(dev->features, fbit);
170 }
171 
172 uint64_t vduse_get_virtio_features(void)
173 {
174     return (1ULL << VIRTIO_F_IOMMU_PLATFORM) |
175            (1ULL << VIRTIO_F_VERSION_1) |
176            (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
177            (1ULL << VIRTIO_RING_F_EVENT_IDX) |
178            (1ULL << VIRTIO_RING_F_INDIRECT_DESC);
179 }
180 
181 VduseDev *vduse_queue_get_dev(VduseVirtq *vq)
182 {
183     return vq->dev;
184 }
185 
186 int vduse_queue_get_fd(VduseVirtq *vq)
187 {
188     return vq->fd;
189 }
190 
191 void *vduse_dev_get_priv(VduseDev *dev)
192 {
193     return dev->priv;
194 }
195 
196 VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index)
197 {
198     return &dev->vqs[index];
199 }
200 
201 int vduse_dev_get_fd(VduseDev *dev)
202 {
203     return dev->fd;
204 }
205 
206 static int vduse_inject_irq(VduseDev *dev, int index)
207 {
208     return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index);
209 }
210 
211 static int inflight_desc_compare(const void *a, const void *b)
212 {
213     VduseVirtqInflightDesc *desc0 = (VduseVirtqInflightDesc *)a,
214                            *desc1 = (VduseVirtqInflightDesc *)b;
215 
216     if (desc1->counter > desc0->counter &&
217         (desc1->counter - desc0->counter) < VIRTQUEUE_MAX_SIZE * 2) {
218         return 1;
219     }
220 
221     return -1;
222 }
223 
224 static int vduse_queue_check_inflights(VduseVirtq *vq)
225 {
226     int i = 0;
227     VduseDev *dev = vq->dev;
228 
229     vq->used_idx = le16toh(vq->vring.used->idx);
230     vq->resubmit_num = 0;
231     vq->resubmit_list = NULL;
232     vq->counter = 0;
233 
234     if (unlikely(vq->log->inflight.used_idx != vq->used_idx)) {
235         if (vq->log->inflight.last_batch_head > VIRTQUEUE_MAX_SIZE) {
236             return -1;
237         }
238 
239         vq->log->inflight.desc[vq->log->inflight.last_batch_head].inflight = 0;
240 
241         barrier();
242 
243         vq->log->inflight.used_idx = vq->used_idx;
244     }
245 
246     for (i = 0; i < vq->log->inflight.desc_num; i++) {
247         if (vq->log->inflight.desc[i].inflight == 1) {
248             vq->inuse++;
249         }
250     }
251 
252     vq->shadow_avail_idx = vq->last_avail_idx = vq->inuse + vq->used_idx;
253 
254     if (vq->inuse) {
255         vq->resubmit_list = calloc(vq->inuse, sizeof(VduseVirtqInflightDesc));
256         if (!vq->resubmit_list) {
257             return -1;
258         }
259 
260         for (i = 0; i < vq->log->inflight.desc_num; i++) {
261             if (vq->log->inflight.desc[i].inflight) {
262                 vq->resubmit_list[vq->resubmit_num].index = i;
263                 vq->resubmit_list[vq->resubmit_num].counter =
264                                         vq->log->inflight.desc[i].counter;
265                 vq->resubmit_num++;
266             }
267         }
268 
269         if (vq->resubmit_num > 1) {
270             qsort(vq->resubmit_list, vq->resubmit_num,
271                   sizeof(VduseVirtqInflightDesc), inflight_desc_compare);
272         }
273         vq->counter = vq->resubmit_list[0].counter + 1;
274     }
275 
276     vduse_inject_irq(dev, vq->index);
277 
278     return 0;
279 }
280 
281 static int vduse_queue_inflight_get(VduseVirtq *vq, int desc_idx)
282 {
283     vq->log->inflight.desc[desc_idx].counter = vq->counter++;
284 
285     barrier();
286 
287     vq->log->inflight.desc[desc_idx].inflight = 1;
288 
289     return 0;
290 }
291 
292 static int vduse_queue_inflight_pre_put(VduseVirtq *vq, int desc_idx)
293 {
294     vq->log->inflight.last_batch_head = desc_idx;
295 
296     return 0;
297 }
298 
299 static int vduse_queue_inflight_post_put(VduseVirtq *vq, int desc_idx)
300 {
301     vq->log->inflight.desc[desc_idx].inflight = 0;
302 
303     barrier();
304 
305     vq->log->inflight.used_idx = vq->used_idx;
306 
307     return 0;
308 }
309 
310 static void vduse_iova_remove_region(VduseDev *dev, uint64_t start,
311                                      uint64_t last)
312 {
313     int i;
314 
315     if (last == start) {
316         return;
317     }
318 
319     for (i = 0; i < MAX_IOVA_REGIONS; i++) {
320         if (!dev->regions[i].mmap_addr) {
321             continue;
322         }
323 
324         if (start <= dev->regions[i].iova &&
325             last >= (dev->regions[i].iova + dev->regions[i].size - 1)) {
326             munmap((void *)(uintptr_t)dev->regions[i].mmap_addr,
327                    dev->regions[i].mmap_offset + dev->regions[i].size);
328             dev->regions[i].mmap_addr = 0;
329             dev->num_regions--;
330         }
331     }
332 }
333 
334 static int vduse_iova_add_region(VduseDev *dev, int fd,
335                                  uint64_t offset, uint64_t start,
336                                  uint64_t last, int prot)
337 {
338     int i;
339     uint64_t size = last - start + 1;
340     void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
341 
342     if (mmap_addr == MAP_FAILED) {
343         close(fd);
344         return -EINVAL;
345     }
346 
347     for (i = 0; i < MAX_IOVA_REGIONS; i++) {
348         if (!dev->regions[i].mmap_addr) {
349             dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr;
350             dev->regions[i].mmap_offset = offset;
351             dev->regions[i].iova = start;
352             dev->regions[i].size = size;
353             dev->num_regions++;
354             break;
355         }
356     }
357     assert(i < MAX_IOVA_REGIONS);
358     close(fd);
359 
360     return 0;
361 }
362 
363 static int perm_to_prot(uint8_t perm)
364 {
365     int prot = 0;
366 
367     switch (perm) {
368     case VDUSE_ACCESS_WO:
369         prot |= PROT_WRITE;
370         break;
371     case VDUSE_ACCESS_RO:
372         prot |= PROT_READ;
373         break;
374     case VDUSE_ACCESS_RW:
375         prot |= PROT_READ | PROT_WRITE;
376         break;
377     default:
378         break;
379     }
380 
381     return prot;
382 }
383 
384 static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t iova)
385 {
386     int i, ret;
387     struct vduse_iotlb_entry entry;
388 
389     for (i = 0; i < MAX_IOVA_REGIONS; i++) {
390         VduseIovaRegion *r = &dev->regions[i];
391 
392         if (!r->mmap_addr) {
393             continue;
394         }
395 
396         if ((iova >= r->iova) && (iova < (r->iova + r->size))) {
397             if ((iova + *plen) > (r->iova + r->size)) {
398                 *plen = r->iova + r->size - iova;
399             }
400             return (void *)(uintptr_t)(iova - r->iova +
401                    r->mmap_addr + r->mmap_offset);
402         }
403     }
404 
405     entry.start = iova;
406     entry.last = iova + 1;
407     ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry);
408     if (ret < 0) {
409         return NULL;
410     }
411 
412     if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start,
413                                entry.last, perm_to_prot(entry.perm))) {
414         return iova_to_va(dev, plen, iova);
415     }
416 
417     return NULL;
418 }
419 
420 static inline uint16_t vring_avail_flags(VduseVirtq *vq)
421 {
422     return le16toh(vq->vring.avail->flags);
423 }
424 
425 static inline uint16_t vring_avail_idx(VduseVirtq *vq)
426 {
427     vq->shadow_avail_idx = le16toh(vq->vring.avail->idx);
428 
429     return vq->shadow_avail_idx;
430 }
431 
432 static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i)
433 {
434     return le16toh(vq->vring.avail->ring[i]);
435 }
436 
437 static inline uint16_t vring_get_used_event(VduseVirtq *vq)
438 {
439     return vring_avail_ring(vq, vq->vring.num);
440 }
441 
442 static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx,
443                                  unsigned int *head)
444 {
445     /*
446      * Grab the next descriptor number they're advertising, and increment
447      * the index we've seen.
448      */
449     *head = vring_avail_ring(vq, idx % vq->vring.num);
450 
451     /* If their number is silly, that's a fatal mistake. */
452     if (*head >= vq->vring.num) {
453         fprintf(stderr, "Guest says index %u is available\n", *head);
454         return false;
455     }
456 
457     return true;
458 }
459 
460 static int
461 vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc,
462                                uint64_t addr, size_t len)
463 {
464     struct vring_desc *ori_desc;
465     uint64_t read_len;
466 
467     if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) {
468         return -1;
469     }
470 
471     if (len == 0) {
472         return -1;
473     }
474 
475     while (len) {
476         read_len = len;
477         ori_desc = iova_to_va(dev, &read_len, addr);
478         if (!ori_desc) {
479             return -1;
480         }
481 
482         memcpy(desc, ori_desc, read_len);
483         len -= read_len;
484         addr += read_len;
485         desc += read_len;
486     }
487 
488     return 0;
489 }
490 
491 enum {
492     VIRTQUEUE_READ_DESC_ERROR = -1,
493     VIRTQUEUE_READ_DESC_DONE = 0,   /* end of chain */
494     VIRTQUEUE_READ_DESC_MORE = 1,   /* more buffers in chain */
495 };
496 
497 static int vduse_queue_read_next_desc(struct vring_desc *desc, int i,
498                                       unsigned int max, unsigned int *next)
499 {
500     /* If this descriptor says it doesn't chain, we're done. */
501     if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) {
502         return VIRTQUEUE_READ_DESC_DONE;
503     }
504 
505     /* Check they're not leading us off end of descriptors. */
506     *next = desc[i].next;
507     /* Make sure compiler knows to grab that: we don't want it changing! */
508     smp_wmb();
509 
510     if (*next >= max) {
511         fprintf(stderr, "Desc next is %u\n", *next);
512         return VIRTQUEUE_READ_DESC_ERROR;
513     }
514 
515     return VIRTQUEUE_READ_DESC_MORE;
516 }
517 
518 /*
519  * Fetch avail_idx from VQ memory only when we really need to know if
520  * guest has added some buffers.
521  */
522 static bool vduse_queue_empty(VduseVirtq *vq)
523 {
524     if (unlikely(!vq->vring.avail)) {
525         return true;
526     }
527 
528     if (vq->shadow_avail_idx != vq->last_avail_idx) {
529         return false;
530     }
531 
532     return vring_avail_idx(vq) == vq->last_avail_idx;
533 }
534 
535 static bool vduse_queue_should_notify(VduseVirtq *vq)
536 {
537     VduseDev *dev = vq->dev;
538     uint16_t old, new;
539     bool v;
540 
541     /* We need to expose used array entries before checking used event. */
542     smp_mb();
543 
544     /* Always notify when queue is empty (when feature acknowledge) */
545     if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
546         !vq->inuse && vduse_queue_empty(vq)) {
547         return true;
548     }
549 
550     if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
551         return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
552     }
553 
554     v = vq->signalled_used_valid;
555     vq->signalled_used_valid = true;
556     old = vq->signalled_used;
557     new = vq->signalled_used = vq->used_idx;
558     return !v || vring_need_event(vring_get_used_event(vq), new, old);
559 }
560 
561 void vduse_queue_notify(VduseVirtq *vq)
562 {
563     VduseDev *dev = vq->dev;
564 
565     if (unlikely(!vq->vring.avail)) {
566         return;
567     }
568 
569     if (!vduse_queue_should_notify(vq)) {
570         return;
571     }
572 
573     if (vduse_inject_irq(dev, vq->index) < 0) {
574         fprintf(stderr, "Error inject irq for vq %d: %s\n",
575                 vq->index, strerror(errno));
576     }
577 }
578 
579 static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val)
580 {
581     *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val);
582 }
583 
584 static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int *p_num_sg,
585                                    struct iovec *iov, unsigned int max_num_sg,
586                                    bool is_write, uint64_t pa, size_t sz)
587 {
588     unsigned num_sg = *p_num_sg;
589     VduseDev *dev = vq->dev;
590 
591     assert(num_sg <= max_num_sg);
592 
593     if (!sz) {
594         fprintf(stderr, "virtio: zero sized buffers are not allowed\n");
595         return false;
596     }
597 
598     while (sz) {
599         uint64_t len = sz;
600 
601         if (num_sg == max_num_sg) {
602             fprintf(stderr,
603                     "virtio: too many descriptors in indirect table\n");
604             return false;
605         }
606 
607         iov[num_sg].iov_base = iova_to_va(dev, &len, pa);
608         if (iov[num_sg].iov_base == NULL) {
609             fprintf(stderr, "virtio: invalid address for buffers\n");
610             return false;
611         }
612         iov[num_sg++].iov_len = len;
613         sz -= len;
614         pa += len;
615     }
616 
617     *p_num_sg = num_sg;
618     return true;
619 }
620 
621 static void *vduse_queue_alloc_element(size_t sz, unsigned out_num,
622                                        unsigned in_num)
623 {
624     VduseVirtqElement *elem;
625     size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0]));
626     size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
627     size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
628 
629     assert(sz >= sizeof(VduseVirtqElement));
630     elem = malloc(out_sg_end);
631     if (!elem) {
632         return NULL;
633     }
634     elem->out_num = out_num;
635     elem->in_num = in_num;
636     elem->in_sg = (void *)elem + in_sg_ofs;
637     elem->out_sg = (void *)elem + out_sg_ofs;
638     return elem;
639 }
640 
641 static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t sz)
642 {
643     struct vring_desc *desc = vq->vring.desc;
644     VduseDev *dev = vq->dev;
645     uint64_t desc_addr, read_len;
646     unsigned int desc_len;
647     unsigned int max = vq->vring.num;
648     unsigned int i = idx;
649     VduseVirtqElement *elem;
650     struct iovec iov[VIRTQUEUE_MAX_SIZE];
651     struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE];
652     unsigned int out_num = 0, in_num = 0;
653     int rc;
654 
655     if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) {
656         if (le32toh(desc[i].len) % sizeof(struct vring_desc)) {
657             fprintf(stderr, "Invalid size for indirect buffer table\n");
658             return NULL;
659         }
660 
661         /* loop over the indirect descriptor table */
662         desc_addr = le64toh(desc[i].addr);
663         desc_len = le32toh(desc[i].len);
664         max = desc_len / sizeof(struct vring_desc);
665         read_len = desc_len;
666         desc = iova_to_va(dev, &read_len, desc_addr);
667         if (unlikely(desc && read_len != desc_len)) {
668             /* Failed to use zero copy */
669             desc = NULL;
670             if (!vduse_queue_read_indirect_desc(dev, desc_buf,
671                                                 desc_addr,
672                                                 desc_len)) {
673                 desc = desc_buf;
674             }
675         }
676         if (!desc) {
677             fprintf(stderr, "Invalid indirect buffer table\n");
678             return NULL;
679         }
680         i = 0;
681     }
682 
683     /* Collect all the descriptors */
684     do {
685         if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) {
686             if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num,
687                                              VIRTQUEUE_MAX_SIZE - out_num,
688                                              true, le64toh(desc[i].addr),
689                                              le32toh(desc[i].len))) {
690                 return NULL;
691             }
692         } else {
693             if (in_num) {
694                 fprintf(stderr, "Incorrect order for descriptors\n");
695                 return NULL;
696             }
697             if (!vduse_queue_map_single_desc(vq, &out_num, iov,
698                                              VIRTQUEUE_MAX_SIZE, false,
699                                              le64toh(desc[i].addr),
700                                              le32toh(desc[i].len))) {
701                 return NULL;
702             }
703         }
704 
705         /* If we've got too many, that implies a descriptor loop. */
706         if ((in_num + out_num) > max) {
707             fprintf(stderr, "Looped descriptor\n");
708             return NULL;
709         }
710         rc = vduse_queue_read_next_desc(desc, i, max, &i);
711     } while (rc == VIRTQUEUE_READ_DESC_MORE);
712 
713     if (rc == VIRTQUEUE_READ_DESC_ERROR) {
714         fprintf(stderr, "read descriptor error\n");
715         return NULL;
716     }
717 
718     /* Now copy what we have collected and mapped */
719     elem = vduse_queue_alloc_element(sz, out_num, in_num);
720     if (!elem) {
721         fprintf(stderr, "read descriptor error\n");
722         return NULL;
723     }
724     elem->index = idx;
725     for (i = 0; i < out_num; i++) {
726         elem->out_sg[i] = iov[i];
727     }
728     for (i = 0; i < in_num; i++) {
729         elem->in_sg[i] = iov[out_num + i];
730     }
731 
732     return elem;
733 }
734 
735 void *vduse_queue_pop(VduseVirtq *vq, size_t sz)
736 {
737     unsigned int head;
738     VduseVirtqElement *elem;
739     VduseDev *dev = vq->dev;
740     int i;
741 
742     if (unlikely(!vq->vring.avail)) {
743         return NULL;
744     }
745 
746     if (unlikely(vq->resubmit_list && vq->resubmit_num > 0)) {
747         i = (--vq->resubmit_num);
748         elem = vduse_queue_map_desc(vq, vq->resubmit_list[i].index, sz);
749 
750         if (!vq->resubmit_num) {
751             free(vq->resubmit_list);
752             vq->resubmit_list = NULL;
753         }
754 
755         return elem;
756     }
757 
758     if (vduse_queue_empty(vq)) {
759         return NULL;
760     }
761     /* Needed after virtio_queue_empty() */
762     smp_rmb();
763 
764     if (vq->inuse >= vq->vring.num) {
765         fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse);
766         return NULL;
767     }
768 
769     if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) {
770         return NULL;
771     }
772 
773     if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) {
774         vring_set_avail_event(vq, vq->last_avail_idx);
775     }
776 
777     elem = vduse_queue_map_desc(vq, head, sz);
778 
779     if (!elem) {
780         return NULL;
781     }
782 
783     vq->inuse++;
784 
785     vduse_queue_inflight_get(vq, head);
786 
787     return elem;
788 }
789 
790 static inline void vring_used_write(VduseVirtq *vq,
791                                     struct vring_used_elem *uelem, int i)
792 {
793     struct vring_used *used = vq->vring.used;
794 
795     used->ring[i] = *uelem;
796 }
797 
798 static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem,
799                              unsigned int len, unsigned int idx)
800 {
801     struct vring_used_elem uelem;
802 
803     if (unlikely(!vq->vring.used)) {
804         return;
805     }
806 
807     idx = (idx + vq->used_idx) % vq->vring.num;
808 
809     uelem.id = htole32(elem->index);
810     uelem.len = htole32(len);
811     vring_used_write(vq, &uelem, idx);
812 }
813 
814 static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val)
815 {
816     vq->vring.used->idx = htole16(val);
817     vq->used_idx = val;
818 }
819 
820 static void vduse_queue_flush(VduseVirtq *vq, unsigned int count)
821 {
822     uint16_t old, new;
823 
824     if (unlikely(!vq->vring.used)) {
825         return;
826     }
827 
828     /* Make sure buffer is written before we update index. */
829     smp_wmb();
830 
831     old = vq->used_idx;
832     new = old + count;
833     vring_used_idx_set(vq, new);
834     vq->inuse -= count;
835     if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - old))) {
836         vq->signalled_used_valid = false;
837     }
838 }
839 
840 void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem,
841                       unsigned int len)
842 {
843     vduse_queue_fill(vq, elem, len, 0);
844     vduse_queue_inflight_pre_put(vq, elem->index);
845     vduse_queue_flush(vq, 1);
846     vduse_queue_inflight_post_put(vq, elem->index);
847 }
848 
849 static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr,
850                                     uint64_t avail_addr, uint64_t used_addr)
851 {
852     struct VduseDev *dev = vq->dev;
853     uint64_t len;
854 
855     len = sizeof(struct vring_desc);
856     vq->vring.desc = iova_to_va(dev, &len, desc_addr);
857     if (len != sizeof(struct vring_desc)) {
858         return -EINVAL;
859     }
860 
861     len = sizeof(struct vring_avail);
862     vq->vring.avail = iova_to_va(dev, &len, avail_addr);
863     if (len != sizeof(struct vring_avail)) {
864         return -EINVAL;
865     }
866 
867     len = sizeof(struct vring_used);
868     vq->vring.used = iova_to_va(dev, &len, used_addr);
869     if (len != sizeof(struct vring_used)) {
870         return -EINVAL;
871     }
872 
873     if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) {
874         fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index);
875         return -EINVAL;
876     }
877 
878     return 0;
879 }
880 
881 static void vduse_queue_enable(VduseVirtq *vq)
882 {
883     struct VduseDev *dev = vq->dev;
884     struct vduse_vq_info vq_info;
885     struct vduse_vq_eventfd vq_eventfd;
886     int fd;
887 
888     vq_info.index = vq->index;
889     if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) {
890         fprintf(stderr, "Failed to get vq[%d] info: %s\n",
891                 vq->index, strerror(errno));
892         return;
893     }
894 
895     if (!vq_info.ready) {
896         return;
897     }
898 
899     vq->vring.num = vq_info.num;
900     vq->vring.desc_addr = vq_info.desc_addr;
901     vq->vring.avail_addr = vq_info.driver_addr;
902     vq->vring.used_addr = vq_info.device_addr;
903 
904     if (vduse_queue_update_vring(vq, vq_info.desc_addr,
905                                  vq_info.driver_addr, vq_info.device_addr)) {
906         fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index);
907         return;
908     }
909 
910     fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
911     if (fd < 0) {
912         fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index);
913         return;
914     }
915 
916     vq_eventfd.index = vq->index;
917     vq_eventfd.fd = fd;
918     if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) {
919         fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index);
920         close(fd);
921         return;
922     }
923 
924     vq->fd = fd;
925     vq->signalled_used_valid = false;
926     vq->ready = true;
927 
928     if (vduse_queue_check_inflights(vq)) {
929         fprintf(stderr, "Failed to check inflights for vq[%d]\n", vq->index);
930         close(fd);
931         return;
932     }
933 
934     dev->ops->enable_queue(dev, vq);
935 }
936 
937 static void vduse_queue_disable(VduseVirtq *vq)
938 {
939     struct VduseDev *dev = vq->dev;
940     struct vduse_vq_eventfd eventfd;
941 
942     if (!vq->ready) {
943         return;
944     }
945 
946     dev->ops->disable_queue(dev, vq);
947 
948     eventfd.index = vq->index;
949     eventfd.fd = VDUSE_EVENTFD_DEASSIGN;
950     ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd);
951     close(vq->fd);
952 
953     assert(vq->inuse == 0);
954 
955     vq->vring.num = 0;
956     vq->vring.desc_addr = 0;
957     vq->vring.avail_addr = 0;
958     vq->vring.used_addr = 0;
959     vq->vring.desc = 0;
960     vq->vring.avail = 0;
961     vq->vring.used = 0;
962     vq->ready = false;
963     vq->fd = -1;
964 }
965 
966 static void vduse_dev_start_dataplane(VduseDev *dev)
967 {
968     int i;
969 
970     if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
971         fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
972         return;
973     }
974     assert(vduse_dev_has_feature(dev, VIRTIO_F_VERSION_1));
975 
976     for (i = 0; i < dev->num_queues; i++) {
977         vduse_queue_enable(&dev->vqs[i]);
978     }
979 }
980 
981 static void vduse_dev_stop_dataplane(VduseDev *dev)
982 {
983     size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
984     int i;
985 
986     for (i = 0; i < dev->num_queues; i++) {
987         vduse_queue_disable(&dev->vqs[i]);
988     }
989     if (dev->log) {
990         memset(dev->log, 0, log_size);
991     }
992     dev->features = 0;
993     vduse_iova_remove_region(dev, 0, ULONG_MAX);
994 }
995 
996 int vduse_dev_handler(VduseDev *dev)
997 {
998     struct vduse_dev_request req;
999     struct vduse_dev_response resp = { 0 };
1000     VduseVirtq *vq;
1001     int i, ret;
1002 
1003     ret = read(dev->fd, &req, sizeof(req));
1004     if (ret != sizeof(req)) {
1005         fprintf(stderr, "Read request error [%d]: %s\n",
1006                 ret, strerror(errno));
1007         return -errno;
1008     }
1009     resp.request_id = req.request_id;
1010 
1011     switch (req.type) {
1012     case VDUSE_GET_VQ_STATE:
1013         vq = &dev->vqs[req.vq_state.index];
1014         resp.vq_state.split.avail_index = vq->last_avail_idx;
1015         resp.result = VDUSE_REQ_RESULT_OK;
1016         break;
1017     case VDUSE_SET_STATUS:
1018         if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) {
1019             vduse_dev_start_dataplane(dev);
1020         } else if (req.s.status == 0) {
1021             vduse_dev_stop_dataplane(dev);
1022         }
1023         resp.result = VDUSE_REQ_RESULT_OK;
1024         break;
1025     case VDUSE_UPDATE_IOTLB:
1026         /* The iova will be updated by iova_to_va() later, so just remove it */
1027         vduse_iova_remove_region(dev, req.iova.start, req.iova.last);
1028         for (i = 0; i < dev->num_queues; i++) {
1029             VduseVirtq *vq = &dev->vqs[i];
1030             if (vq->ready) {
1031                 if (vduse_queue_update_vring(vq, vq->vring.desc_addr,
1032                                              vq->vring.avail_addr,
1033                                              vq->vring.used_addr)) {
1034                     fprintf(stderr, "Failed to update vring for vq[%d]\n",
1035                             vq->index);
1036                 }
1037             }
1038         }
1039         resp.result = VDUSE_REQ_RESULT_OK;
1040         break;
1041     default:
1042         resp.result = VDUSE_REQ_RESULT_FAILED;
1043         break;
1044     }
1045 
1046     ret = write(dev->fd, &resp, sizeof(resp));
1047     if (ret != sizeof(resp)) {
1048         fprintf(stderr, "Write request %d error [%d]: %s\n",
1049                 req.type, ret, strerror(errno));
1050         return -errno;
1051     }
1052     return 0;
1053 }
1054 
1055 int vduse_dev_update_config(VduseDev *dev, uint32_t size,
1056                             uint32_t offset, char *buffer)
1057 {
1058     int ret;
1059     struct vduse_config_data *data;
1060 
1061     data = malloc(offsetof(struct vduse_config_data, buffer) + size);
1062     if (!data) {
1063         return -ENOMEM;
1064     }
1065 
1066     data->offset = offset;
1067     data->length = size;
1068     memcpy(data->buffer, buffer, size);
1069 
1070     ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data);
1071     free(data);
1072 
1073     if (ret) {
1074         return -errno;
1075     }
1076 
1077     if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) {
1078         return -errno;
1079     }
1080 
1081     return 0;
1082 }
1083 
1084 int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size)
1085 {
1086     VduseVirtq *vq = &dev->vqs[index];
1087     struct vduse_vq_config vq_config = { 0 };
1088 
1089     if (max_size > VIRTQUEUE_MAX_SIZE) {
1090         return -EINVAL;
1091     }
1092 
1093     vq_config.index = vq->index;
1094     vq_config.max_size = max_size;
1095 
1096     if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) {
1097         return -errno;
1098     }
1099 
1100     vduse_queue_enable(vq);
1101 
1102     return 0;
1103 }
1104 
1105 int vduse_set_reconnect_log_file(VduseDev *dev, const char *filename)
1106 {
1107 
1108     size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1109     void *log;
1110     int i;
1111 
1112     dev->log = log = vduse_log_get(filename, log_size);
1113     if (log == MAP_FAILED) {
1114         fprintf(stderr, "Failed to get vduse log\n");
1115         return -EINVAL;
1116     }
1117 
1118     for (i = 0; i < dev->num_queues; i++) {
1119         dev->vqs[i].log = log;
1120         dev->vqs[i].log->inflight.desc_num = VIRTQUEUE_MAX_SIZE;
1121         log = (void *)((char *)log + vduse_vq_log_size(VIRTQUEUE_MAX_SIZE));
1122     }
1123 
1124     return 0;
1125 }
1126 
1127 static int vduse_dev_init_vqs(VduseDev *dev, uint16_t num_queues)
1128 {
1129     VduseVirtq *vqs;
1130     int i;
1131 
1132     vqs = calloc(sizeof(VduseVirtq), num_queues);
1133     if (!vqs) {
1134         return -ENOMEM;
1135     }
1136 
1137     for (i = 0; i < num_queues; i++) {
1138         vqs[i].index = i;
1139         vqs[i].dev = dev;
1140         vqs[i].fd = -1;
1141     }
1142     dev->vqs = vqs;
1143 
1144     return 0;
1145 }
1146 
1147 static int vduse_dev_init(VduseDev *dev, const char *name,
1148                           uint16_t num_queues, const VduseOps *ops,
1149                           void *priv)
1150 {
1151     char *dev_path, *dev_name;
1152     int ret, fd;
1153 
1154     dev_path = malloc(strlen(name) + strlen("/dev/vduse/") + 1);
1155     if (!dev_path) {
1156         return -ENOMEM;
1157     }
1158     sprintf(dev_path, "/dev/vduse/%s", name);
1159 
1160     fd = open(dev_path, O_RDWR);
1161     free(dev_path);
1162     if (fd < 0) {
1163         fprintf(stderr, "Failed to open vduse dev %s: %s\n",
1164                 name, strerror(errno));
1165         return -errno;
1166     }
1167 
1168     if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1169         fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1170         close(fd);
1171         return -errno;
1172     }
1173 
1174     dev_name = strdup(name);
1175     if (!dev_name) {
1176         close(fd);
1177         return -ENOMEM;
1178     }
1179 
1180     ret = vduse_dev_init_vqs(dev, num_queues);
1181     if (ret) {
1182         free(dev_name);
1183         close(fd);
1184         return ret;
1185     }
1186 
1187     dev->name = dev_name;
1188     dev->num_queues = num_queues;
1189     dev->fd = fd;
1190     dev->ops = ops;
1191     dev->priv = priv;
1192 
1193     return 0;
1194 }
1195 
1196 static inline bool vduse_name_is_valid(const char *name)
1197 {
1198     return strlen(name) >= VDUSE_NAME_MAX || strstr(name, "..");
1199 }
1200 
1201 VduseDev *vduse_dev_create_by_fd(int fd, uint16_t num_queues,
1202                                  const VduseOps *ops, void *priv)
1203 {
1204     VduseDev *dev;
1205     int ret;
1206 
1207     if (!ops || !ops->enable_queue || !ops->disable_queue) {
1208         fprintf(stderr, "Invalid parameter for vduse\n");
1209         return NULL;
1210     }
1211 
1212     dev = calloc(sizeof(VduseDev), 1);
1213     if (!dev) {
1214         fprintf(stderr, "Failed to allocate vduse device\n");
1215         return NULL;
1216     }
1217 
1218     if (ioctl(fd, VDUSE_DEV_GET_FEATURES, &dev->features)) {
1219         fprintf(stderr, "Failed to get features: %s\n", strerror(errno));
1220         free(dev);
1221         return NULL;
1222     }
1223 
1224     ret = vduse_dev_init_vqs(dev, num_queues);
1225     if (ret) {
1226         fprintf(stderr, "Failed to init vqs\n");
1227         free(dev);
1228         return NULL;
1229     }
1230 
1231     dev->num_queues = num_queues;
1232     dev->fd = fd;
1233     dev->ops = ops;
1234     dev->priv = priv;
1235 
1236     return dev;
1237 }
1238 
1239 VduseDev *vduse_dev_create_by_name(const char *name, uint16_t num_queues,
1240                                    const VduseOps *ops, void *priv)
1241 {
1242     VduseDev *dev;
1243     int ret;
1244 
1245     if (!name || vduse_name_is_valid(name) || !ops ||
1246         !ops->enable_queue || !ops->disable_queue) {
1247         fprintf(stderr, "Invalid parameter for vduse\n");
1248         return NULL;
1249     }
1250 
1251     dev = calloc(sizeof(VduseDev), 1);
1252     if (!dev) {
1253         fprintf(stderr, "Failed to allocate vduse device\n");
1254         return NULL;
1255     }
1256 
1257     ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1258     if (ret < 0) {
1259         fprintf(stderr, "Failed to init vduse device %s: %s\n",
1260                 name, strerror(ret));
1261         free(dev);
1262         return NULL;
1263     }
1264 
1265     return dev;
1266 }
1267 
1268 VduseDev *vduse_dev_create(const char *name, uint32_t device_id,
1269                            uint32_t vendor_id, uint64_t features,
1270                            uint16_t num_queues, uint32_t config_size,
1271                            char *config, const VduseOps *ops, void *priv)
1272 {
1273     VduseDev *dev;
1274     int ret, ctrl_fd;
1275     uint64_t version;
1276     struct vduse_dev_config *dev_config;
1277     size_t size = offsetof(struct vduse_dev_config, config);
1278 
1279     if (!name || vduse_name_is_valid(name) ||
1280         !has_feature(features,  VIRTIO_F_VERSION_1) || !config ||
1281         !config_size || !ops || !ops->enable_queue || !ops->disable_queue) {
1282         fprintf(stderr, "Invalid parameter for vduse\n");
1283         return NULL;
1284     }
1285 
1286     dev = calloc(sizeof(VduseDev), 1);
1287     if (!dev) {
1288         fprintf(stderr, "Failed to allocate vduse device\n");
1289         return NULL;
1290     }
1291 
1292     ctrl_fd = open("/dev/vduse/control", O_RDWR);
1293     if (ctrl_fd < 0) {
1294         fprintf(stderr, "Failed to open /dev/vduse/control: %s\n",
1295                 strerror(errno));
1296         goto err_ctrl;
1297     }
1298 
1299     version = VDUSE_API_VERSION;
1300     if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) {
1301         fprintf(stderr, "Failed to set api version %" PRIu64 ": %s\n",
1302                 version, strerror(errno));
1303         goto err_dev;
1304     }
1305 
1306     dev_config = calloc(size + config_size, 1);
1307     if (!dev_config) {
1308         fprintf(stderr, "Failed to allocate config space\n");
1309         goto err_dev;
1310     }
1311 
1312     strcpy(dev_config->name, name);
1313     dev_config->device_id = device_id;
1314     dev_config->vendor_id = vendor_id;
1315     dev_config->features = features;
1316     dev_config->vq_num = num_queues;
1317     dev_config->vq_align = VDUSE_VQ_ALIGN;
1318     dev_config->config_size = config_size;
1319     memcpy(dev_config->config, config, config_size);
1320 
1321     ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config);
1322     free(dev_config);
1323     if (ret && errno != EEXIST) {
1324         fprintf(stderr, "Failed to create vduse device %s: %s\n",
1325                 name, strerror(errno));
1326         goto err_dev;
1327     }
1328     dev->ctrl_fd = ctrl_fd;
1329 
1330     ret = vduse_dev_init(dev, name, num_queues, ops, priv);
1331     if (ret < 0) {
1332         fprintf(stderr, "Failed to init vduse device %s: %s\n",
1333                 name, strerror(ret));
1334         goto err;
1335     }
1336 
1337     return dev;
1338 err:
1339     ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name);
1340 err_dev:
1341     close(ctrl_fd);
1342 err_ctrl:
1343     free(dev);
1344 
1345     return NULL;
1346 }
1347 
1348 int vduse_dev_destroy(VduseDev *dev)
1349 {
1350     size_t log_size = dev->num_queues * vduse_vq_log_size(VIRTQUEUE_MAX_SIZE);
1351     int i, ret = 0;
1352 
1353     if (dev->log) {
1354         munmap(dev->log, log_size);
1355     }
1356     for (i = 0; i < dev->num_queues; i++) {
1357         free(dev->vqs[i].resubmit_list);
1358     }
1359     free(dev->vqs);
1360     if (dev->fd >= 0) {
1361         close(dev->fd);
1362         dev->fd = -1;
1363     }
1364     if (dev->ctrl_fd >= 0) {
1365         if (ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name)) {
1366             ret = -errno;
1367         }
1368         close(dev->ctrl_fd);
1369         dev->ctrl_fd = -1;
1370     }
1371     free(dev->name);
1372     free(dev);
1373 
1374     return ret;
1375 }
1376