xref: /openbmc/qemu/hw/virtio/vhost.c (revision 8f0a3716)
1 /*
2  * vhost support
3  *
4  * Copyright Red Hat, Inc. 2010
5  *
6  * Authors:
7  *  Michael S. Tsirkin <mst@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "qapi/error.h"
18 #include "hw/virtio/vhost.h"
19 #include "hw/hw.h"
20 #include "qemu/atomic.h"
21 #include "qemu/range.h"
22 #include "qemu/error-report.h"
23 #include "qemu/memfd.h"
24 #include <linux/vhost.h>
25 #include "exec/address-spaces.h"
26 #include "hw/virtio/virtio-bus.h"
27 #include "hw/virtio/virtio-access.h"
28 #include "migration/blocker.h"
29 #include "sysemu/dma.h"
30 #include "trace.h"
31 
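/*
 * VHOST_OPS_DEBUG() reports a failed backend operation together with the
 * errno it left behind, so it is meant to be used immediately after the
 * failing vhost_ops call, while errno still refers to that failure.
 * With _VHOST_DEBUG unset the macro compiles away entirely.
 */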
32 /* enabled until disconnected backend stabilizes */
33 #define _VHOST_DEBUG 1
34 
35 #ifdef _VHOST_DEBUG
36 #define VHOST_OPS_DEBUG(fmt, ...) \
37     do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
38                       strerror(errno), errno); } while (0)
39 #else
40 #define VHOST_OPS_DEBUG(fmt, ...) \
41     do { } while (0)
42 #endif
43 
44 static struct vhost_log *vhost_log;
45 static struct vhost_log *vhost_log_shm;
46 
47 static unsigned int used_memslots;
48 static QLIST_HEAD(, vhost_dev) vhost_devices =
49     QLIST_HEAD_INITIALIZER(vhost_devices);
50 
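/*
 * True when every registered vhost device can still accept one more
 * memory slot.  The effective limit is the minimum of all per-backend
 * limits; e.g. (illustrative numbers only) if the strictest backend
 * allows 8 slots and 8 are already in use, no free slot remains.
 */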
51 bool vhost_has_free_slot(void)
52 {
53     unsigned int slots_limit = ~0U;
54     struct vhost_dev *hdev;
55 
56     QLIST_FOREACH(hdev, &vhost_devices, entry) {
57         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
58         slots_limit = MIN(slots_limit, r);
59     }
60     return slots_limit > used_memslots;
61 }
62 
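/*
 * Sync one range of the vhost dirty log back into QEMU's dirty bitmap.
 * The log is an array of vhost_log_chunk_t words, one bit per
 * VHOST_LOG_PAGE sized page.  Every chunk that overlaps both the section
 * range [mfirst, mlast] and the range of interest [rfirst, rlast] is
 * atomically fetched-and-cleared, and each set bit is translated back to
 * a MemoryRegion offset:
 *
 *   page_addr      = chunk_base + bit * VHOST_LOG_PAGE
 *   section_offset = page_addr - section->offset_within_address_space
 *   mr_offset      = section_offset + section->offset_within_region
 *
 * before being marked dirty via memory_region_set_dirty().
 */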
63 static void vhost_dev_sync_region(struct vhost_dev *dev,
64                                   MemoryRegionSection *section,
65                                   uint64_t mfirst, uint64_t mlast,
66                                   uint64_t rfirst, uint64_t rlast)
67 {
68     vhost_log_chunk_t *log = dev->log->log;
69 
70     uint64_t start = MAX(mfirst, rfirst);
71     uint64_t end = MIN(mlast, rlast);
72     vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
73     vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
74     uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
75 
76     if (end < start) {
77         return;
78     }
79     assert(end / VHOST_LOG_CHUNK < dev->log_size);
80     assert(start / VHOST_LOG_CHUNK < dev->log_size);
81 
82     for (; from < to; ++from) {
83         vhost_log_chunk_t log;
84         /* We first check with non-atomic: much cheaper,
85          * and we expect non-dirty to be the common case. */
86         if (!*from) {
87             addr += VHOST_LOG_CHUNK;
88             continue;
89         }
90         /* Data must be read atomically. We don't really need barrier semantics
91          * but it's easier to use atomic_* than roll our own. */
92         log = atomic_xchg(from, 0);
93         while (log) {
94             int bit = ctzl(log);
95             hwaddr page_addr;
96             hwaddr section_offset;
97             hwaddr mr_offset;
98             page_addr = addr + bit * VHOST_LOG_PAGE;
99             section_offset = page_addr - section->offset_within_address_space;
100             mr_offset = section_offset + section->offset_within_region;
101             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
102             log &= ~(0x1ull << bit);
103         }
104         addr += VHOST_LOG_CHUNK;
105     }
106 }
107 
108 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
109                                    MemoryRegionSection *section,
110                                    hwaddr first,
111                                    hwaddr last)
112 {
113     int i;
114     hwaddr start_addr;
115     hwaddr end_addr;
116 
117     if (!dev->log_enabled || !dev->started) {
118         return 0;
119     }
120     start_addr = section->offset_within_address_space;
121     end_addr = range_get_last(start_addr, int128_get64(section->size));
122     start_addr = MAX(first, start_addr);
123     end_addr = MIN(last, end_addr);
124 
125     for (i = 0; i < dev->mem->nregions; ++i) {
126         struct vhost_memory_region *reg = dev->mem->regions + i;
127         vhost_dev_sync_region(dev, section, start_addr, end_addr,
128                               reg->guest_phys_addr,
129                               range_get_last(reg->guest_phys_addr,
130                                              reg->memory_size));
131     }
132     for (i = 0; i < dev->nvqs; ++i) {
133         struct vhost_virtqueue *vq = dev->vqs + i;
134         vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
135                               range_get_last(vq->used_phys, vq->used_size));
136     }
137     return 0;
138 }
139 
140 static void vhost_log_sync(MemoryListener *listener,
141                           MemoryRegionSection *section)
142 {
143     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
144                                          memory_listener);
145     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
146 }
147 
148 static void vhost_log_sync_range(struct vhost_dev *dev,
149                                  hwaddr first, hwaddr last)
150 {
151     int i;
152     /* FIXME: this is N^2 in number of sections */
153     for (i = 0; i < dev->n_mem_sections; ++i) {
154         MemoryRegionSection *section = &dev->mem_sections[i];
155         vhost_sync_dirty_bitmap(dev, section, first, last);
156     }
157 }
158 
159 /* Assign/unassign. Keep an unsorted array of non-overlapping
160  * memory regions in dev->mem. */
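/* Removing [start_addr, start_addr + size) leaves one of four outcomes
 * for each existing region that overlaps it:
 *   - fully covered: the region is dropped;
 *   - overlap at the tail: the region is shrunk from the end;
 *   - overlap at the head: guest_phys_addr/userspace_addr are advanced
 *     past the removed range;
 *   - removed range strictly inside: the region is split, with the tail
 *     appended as a new entry at the end of the array.
 * The overlap_start/overlap_end/overlap_middle/split counters exist only
 * to feed the asserts that rule out impossible combinations. */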
161 static void vhost_dev_unassign_memory(struct vhost_dev *dev,
162                                       uint64_t start_addr,
163                                       uint64_t size)
164 {
165     int from, to, n = dev->mem->nregions;
166     /* Track overlapping/split regions for sanity checking. */
167     int overlap_start = 0, overlap_end = 0, overlap_middle = 0, split = 0;
168 
169     for (from = 0, to = 0; from < n; ++from, ++to) {
170         struct vhost_memory_region *reg = dev->mem->regions + to;
171         uint64_t reglast;
172         uint64_t memlast;
173         uint64_t change;
174 
175         /* clone old region */
176         if (to != from) {
177             memcpy(reg, dev->mem->regions + from, sizeof *reg);
178         }
179 
180         /* No overlap is simple */
181         if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
182                             start_addr, size)) {
183             continue;
184         }
185 
186         /* Split only happens if supplied region
187          * is in the middle of an existing one. Thus it can not
188          * overlap with any other existing region. */
189         assert(!split);
190 
191         reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
192         memlast = range_get_last(start_addr, size);
193 
194         /* Remove whole region */
195         if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
196             --dev->mem->nregions;
197             --to;
198             ++overlap_middle;
199             continue;
200         }
201 
202         /* Shrink region */
203         if (memlast >= reglast) {
204             reg->memory_size = start_addr - reg->guest_phys_addr;
205             assert(reg->memory_size);
206             assert(!overlap_end);
207             ++overlap_end;
208             continue;
209         }
210 
211         /* Shift region */
212         if (start_addr <= reg->guest_phys_addr) {
213             change = memlast + 1 - reg->guest_phys_addr;
214             reg->memory_size -= change;
215             reg->guest_phys_addr += change;
216             reg->userspace_addr += change;
217             assert(reg->memory_size);
218             assert(!overlap_start);
219             ++overlap_start;
220             continue;
221         }
222 
223         /* This only happens if supplied region
224          * is in the middle of an existing one. Thus it can not
225          * overlap with any other existing region. */
226         assert(!overlap_start);
227         assert(!overlap_end);
228         assert(!overlap_middle);
229         /* Split region: shrink first part, shift second part. */
230         memcpy(dev->mem->regions + n, reg, sizeof *reg);
231         reg->memory_size = start_addr - reg->guest_phys_addr;
232         assert(reg->memory_size);
233         change = memlast + 1 - reg->guest_phys_addr;
234         reg = dev->mem->regions + n;
235         reg->memory_size -= change;
236         assert(reg->memory_size);
237         reg->guest_phys_addr += change;
238         reg->userspace_addr += change;
239         /* Never add more than 1 region */
240         assert(dev->mem->nregions == n);
241         ++dev->mem->nregions;
242         ++split;
243     }
244 }
245 
246 /* Called after unassign, so no regions overlap the given range. */
247 static void vhost_dev_assign_memory(struct vhost_dev *dev,
248                                     uint64_t start_addr,
249                                     uint64_t size,
250                                     uint64_t uaddr)
251 {
252     int from, to;
253     struct vhost_memory_region *merged = NULL;
254     for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
255         struct vhost_memory_region *reg = dev->mem->regions + to;
256         uint64_t prlast, urlast;
257         uint64_t pmlast, umlast;
258         uint64_t s, e, u;
259 
260         /* clone old region */
261         if (to != from) {
262             memcpy(reg, dev->mem->regions + from, sizeof *reg);
263         }
264         prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
265         pmlast = range_get_last(start_addr, size);
266         urlast = range_get_last(reg->userspace_addr, reg->memory_size);
267         umlast = range_get_last(uaddr, size);
268 
269         /* check for overlapping regions: should never happen. */
270         assert(prlast < start_addr || pmlast < reg->guest_phys_addr);
271         /* Not an adjacent or overlapping region - do not merge. */
272         if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
273             (pmlast + 1 != reg->guest_phys_addr ||
274              umlast + 1 != reg->userspace_addr)) {
275             continue;
276         }
277 
278         if (dev->vhost_ops->vhost_backend_can_merge &&
279             !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
280                                                      reg->userspace_addr,
281                                                      reg->memory_size)) {
282             continue;
283         }
284 
285         if (merged) {
286             --to;
287             assert(to >= 0);
288         } else {
289             merged = reg;
290         }
291         u = MIN(uaddr, reg->userspace_addr);
292         s = MIN(start_addr, reg->guest_phys_addr);
293         e = MAX(pmlast, prlast);
294         uaddr = merged->userspace_addr = u;
295         start_addr = merged->guest_phys_addr = s;
296         size = merged->memory_size = e - s + 1;
297         assert(merged->memory_size);
298     }
299 
300     if (!merged) {
301         struct vhost_memory_region *reg = dev->mem->regions + to;
302         memset(reg, 0, sizeof *reg);
303         reg->memory_size = size;
304         assert(reg->memory_size);
305         reg->guest_phys_addr = start_addr;
306         reg->userspace_addr = uaddr;
307         ++to;
308     }
309     assert(to <= dev->mem->nregions + 1);
310     dev->mem->nregions = to;
311 }
312 
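/* The dirty log must cover the highest guest-physical address the device
 * can write to: every memory region and every used ring.  The result is
 * expressed in vhost_log_chunk_t units, i.e. last / VHOST_LOG_CHUNK + 1
 * for the largest "last" address found. */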
313 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
314 {
315     uint64_t log_size = 0;
316     int i;
317     for (i = 0; i < dev->mem->nregions; ++i) {
318         struct vhost_memory_region *reg = dev->mem->regions + i;
319         uint64_t last = range_get_last(reg->guest_phys_addr,
320                                        reg->memory_size);
321         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
322     }
323     for (i = 0; i < dev->nvqs; ++i) {
324         struct vhost_virtqueue *vq = dev->vqs + i;
325         uint64_t last = vq->used_phys + vq->used_size - 1;
326         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
327     }
328     return log_size;
329 }
330 
331 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
332 {
333     Error *err = NULL;
334     struct vhost_log *log;
335     uint64_t logsize = size * sizeof(*(log->log));
336     int fd = -1;
337 
338     log = g_new0(struct vhost_log, 1);
339     if (share) {
340         log->log = qemu_memfd_alloc("vhost-log", logsize,
341                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
342                                     &fd, &err);
343         if (err) {
344             error_report_err(err);
345             g_free(log);
346             return NULL;
347         }
348         memset(log->log, 0, logsize);
349     } else {
350         log->log = g_malloc0(logsize);
351     }
352 
353     log->size = size;
354     log->refcnt = 1;
355     log->fd = fd;
356 
357     return log;
358 }
359 
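/* vhost_log / vhost_log_shm act as a one-entry cache: devices needing a
 * log of the same size share a single refcounted allocation, memfd-backed
 * when the backend requires a shareable log (e.g. vhost-user).  Asking
 * for a different size allocates a fresh log and repoints the cache; the
 * previous log stays alive until its last user drops it in
 * vhost_log_put(). */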
360 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
361 {
362     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
363 
364     if (!log || log->size != size) {
365         log = vhost_log_alloc(size, share);
366         if (share) {
367             vhost_log_shm = log;
368         } else {
369             vhost_log = log;
370         }
371     } else {
372         ++log->refcnt;
373     }
374 
375     return log;
376 }
377 
378 static void vhost_log_put(struct vhost_dev *dev, bool sync)
379 {
380     struct vhost_log *log = dev->log;
381 
382     if (!log) {
383         return;
384     }
385 
386     --log->refcnt;
387     if (log->refcnt == 0) {
388         /* Sync only the range covered by the old log */
389         if (dev->log_size && sync) {
390             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
391         }
392 
393         if (vhost_log == log) {
394             g_free(log->log);
395             vhost_log = NULL;
396         } else if (vhost_log_shm == log) {
397             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
398                             log->fd);
399             vhost_log_shm = NULL;
400         }
401 
402         g_free(log);
403     }
404 
405     dev->log = NULL;
406     dev->log_size = 0;
407 }
408 
409 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
410 {
411     return dev->vhost_ops->vhost_requires_shm_log &&
412            dev->vhost_ops->vhost_requires_shm_log(dev);
413 }
414 
415 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
416 {
417     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
418     uint64_t log_base = (uintptr_t)log->log;
419     int r;
420 
421     /* Inform the backend of the log switch; this must be done before
422        releasing the current log, to ensure no logging is lost. */
423     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
424     if (r < 0) {
425         VHOST_OPS_DEBUG("vhost_set_log_base failed");
426     }
427 
428     vhost_log_put(dev, true);
429     dev->log = log;
430     dev->log_size = size;
431 }
432 
433 static int vhost_dev_has_iommu(struct vhost_dev *dev)
434 {
435     VirtIODevice *vdev = dev->vdev;
436 
437     return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
438 }
439 
440 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
441                               hwaddr *plen, int is_write)
442 {
443     if (!vhost_dev_has_iommu(dev)) {
444         return cpu_physical_memory_map(addr, plen, is_write);
445     } else {
446         return (void *)(uintptr_t)addr;
447     }
448 }
449 
450 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
451                                hwaddr len, int is_write,
452                                hwaddr access_len)
453 {
454     if (!vhost_dev_has_iommu(dev)) {
455         cpu_physical_memory_unmap(buffer, len, is_write, access_len);
456     }
457 }
458 
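/* When the memory map changes under a running device, the rings the
 * backend already uses must still resolve to the same host addresses.
 * Each ring part (descriptor table, avail ring, used ring) overlapping
 * the changed range is re-mapped and compared with the pointer recorded
 * at start time: -ENOMEM if it can no longer be mapped in full, -EBUSY
 * if it now maps somewhere else. */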
459 static int vhost_verify_ring_part_mapping(struct vhost_dev *dev,
460                                           void *part,
461                                           uint64_t part_addr,
462                                           uint64_t part_size,
463                                           uint64_t start_addr,
464                                           uint64_t size)
465 {
466     hwaddr l;
467     void *p;
468     int r = 0;
469 
470     if (!ranges_overlap(start_addr, size, part_addr, part_size)) {
471         return 0;
472     }
473     l = part_size;
474     p = vhost_memory_map(dev, part_addr, &l, 1);
475     if (!p || l != part_size) {
476         r = -ENOMEM;
477     }
478     if (p != part) {
479         r = -EBUSY;
480     }
481     vhost_memory_unmap(dev, p, l, 0, 0);
482     return r;
483 }
484 
485 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
486                                       uint64_t start_addr,
487                                       uint64_t size)
488 {
489     int i, j;
490     int r = 0;
491     const char *part_name[] = {
492         "descriptor table",
493         "available ring",
494         "used ring"
495     };
496 
497     for (i = 0; i < dev->nvqs; ++i) {
498         struct vhost_virtqueue *vq = dev->vqs + i;
499 
500         j = 0;
501         r = vhost_verify_ring_part_mapping(dev, vq->desc, vq->desc_phys,
502                                            vq->desc_size, start_addr, size);
503         if (r) {
504             break;
505         }
506 
507         j++;
508         r = vhost_verify_ring_part_mapping(dev, vq->avail, vq->avail_phys,
509                                            vq->avail_size, start_addr, size);
510         if (r) {
511             break;
512         }
513 
514         j++;
515         r = vhost_verify_ring_part_mapping(dev, vq->used, vq->used_phys,
516                                            vq->used_size, start_addr, size);
517         if (r) {
518             break;
519         }
520     }
521 
522     if (r == -ENOMEM) {
523         error_report("Unable to map %s for ring %d", part_name[j], i);
524     } else if (r == -EBUSY) {
525         error_report("%s relocated for ring %d", part_name[j], i);
526     }
527     return r;
528 }
529 
530 static struct vhost_memory_region *vhost_dev_find_reg(struct vhost_dev *dev,
531 						      uint64_t start_addr,
532 						      uint64_t size)
533 {
534     int i, n = dev->mem->nregions;
535     for (i = 0; i < n; ++i) {
536         struct vhost_memory_region *reg = dev->mem->regions + i;
537         if (ranges_overlap(reg->guest_phys_addr, reg->memory_size,
538                            start_addr, size)) {
539             return reg;
540         }
541     }
542     return NULL;
543 }
544 
545 static bool vhost_dev_cmp_memory(struct vhost_dev *dev,
546                                  uint64_t start_addr,
547                                  uint64_t size,
548                                  uint64_t uaddr)
549 {
550     struct vhost_memory_region *reg = vhost_dev_find_reg(dev, start_addr, size);
551     uint64_t reglast;
552     uint64_t memlast;
553 
554     if (!reg) {
555         return true;
556     }
557 
558     reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
559     memlast = range_get_last(start_addr, size);
560 
561     /* Need to extend region? */
562     if (start_addr < reg->guest_phys_addr || memlast > reglast) {
563         return true;
564     }
565     /* userspace_addr changed? */
566     return uaddr != reg->userspace_addr + start_addr - reg->guest_phys_addr;
567 }
568 
569 static void vhost_set_memory(MemoryListener *listener,
570                              MemoryRegionSection *section,
571                              bool add)
572 {
573     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
574                                          memory_listener);
575     hwaddr start_addr = section->offset_within_address_space;
576     ram_addr_t size = int128_get64(section->size);
577     bool log_dirty =
578         memory_region_get_dirty_log_mask(section->mr) & ~(1 << DIRTY_MEMORY_MIGRATION);
579     int s = offsetof(struct vhost_memory, regions) +
580         (dev->mem->nregions + 1) * sizeof dev->mem->regions[0];
581     void *ram;
582 
583     dev->mem = g_realloc(dev->mem, s);
584 
585     if (log_dirty) {
586         add = false;
587     }
588 
589     assert(size);
590 
591     /* Optimize no-change case. At least cirrus_vga does this a lot at this time. */
592     ram = memory_region_get_ram_ptr(section->mr) + section->offset_within_region;
593     if (add) {
594         if (!vhost_dev_cmp_memory(dev, start_addr, size, (uintptr_t)ram)) {
595             /* Region exists with same address. Nothing to do. */
596             return;
597         }
598     } else {
599         if (!vhost_dev_find_reg(dev, start_addr, size)) {
600             /* Removing region that we don't access. Nothing to do. */
601             return;
602         }
603     }
604 
605     vhost_dev_unassign_memory(dev, start_addr, size);
606     if (add) {
607         /* Add given mapping, merging adjacent regions if any */
608         vhost_dev_assign_memory(dev, start_addr, size, (uintptr_t)ram);
609     } else {
610         /* Remove old mapping for this memory, if any. */
611         vhost_dev_unassign_memory(dev, start_addr, size);
612     }
613     dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
614     dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
615     dev->memory_changed = true;
616     used_memslots = dev->mem->nregions;
617 }
618 
619 static bool vhost_section(MemoryRegionSection *section)
620 {
621     return memory_region_is_ram(section->mr) &&
622         !memory_region_is_rom(section->mr);
623 }
624 
625 static void vhost_begin(MemoryListener *listener)
626 {
627     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
628                                          memory_listener);
629     dev->mem_changed_end_addr = 0;
630     dev->mem_changed_start_addr = -1;
631 }
632 
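/* Memory listener commit hook: push the changes accumulated since
 * vhost_begin() to the backend.  Ordering matters while dirty logging is
 * active: the log is grown (with some slack) before the new memory table
 * is installed and only shrunk afterwards, so no guest write can land
 * outside the log the backend currently knows about. */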
633 static void vhost_commit(MemoryListener *listener)
634 {
635     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
636                                          memory_listener);
637     hwaddr start_addr = 0;
638     ram_addr_t size = 0;
639     uint64_t log_size;
640     int r;
641 
642     if (!dev->memory_changed) {
643         return;
644     }
645     if (!dev->started) {
646         return;
647     }
648     if (dev->mem_changed_start_addr > dev->mem_changed_end_addr) {
649         return;
650     }
651 
652     if (dev->started) {
653         start_addr = dev->mem_changed_start_addr;
654         size = dev->mem_changed_end_addr - dev->mem_changed_start_addr + 1;
655 
656         r = vhost_verify_ring_mappings(dev, start_addr, size);
657         assert(r >= 0);
658     }
659 
660     if (!dev->log_enabled) {
661         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
662         if (r < 0) {
663             VHOST_OPS_DEBUG("vhost_set_mem_table failed");
664         }
665         dev->memory_changed = false;
666         return;
667     }
668     log_size = vhost_get_log_size(dev);
669     /* We allocate an extra 4K bytes to log,
670      * to reduce the number of reallocations. */
671 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
672     /* To log more, must increase log size before table update. */
673     if (dev->log_size < log_size) {
674         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
675     }
676     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
677     if (r < 0) {
678         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
679     }
680     /* To log less, can only decrease log size after table update. */
681     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
682         vhost_dev_log_resize(dev, log_size);
683     }
684     dev->memory_changed = false;
685 }
686 
687 static void vhost_region_add(MemoryListener *listener,
688                              MemoryRegionSection *section)
689 {
690     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
691                                          memory_listener);
692 
693     if (!vhost_section(section)) {
694         return;
695     }
696 
697     trace_vhost_region_add(dev, section->mr->name ?: NULL);
698     ++dev->n_mem_sections;
699     dev->mem_sections = g_renew(MemoryRegionSection, dev->mem_sections,
700                                 dev->n_mem_sections);
701     dev->mem_sections[dev->n_mem_sections - 1] = *section;
702     memory_region_ref(section->mr);
703     vhost_set_memory(listener, section, true);
704 }
705 
706 static void vhost_region_del(MemoryListener *listener,
707                              MemoryRegionSection *section)
708 {
709     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
710                                          memory_listener);
711     int i;
712 
713     if (!vhost_section(section)) {
714         return;
715     }
716 
717     trace_vhost_region_del(dev, section->mr->name ?: NULL);
718     vhost_set_memory(listener, section, false);
719     memory_region_unref(section->mr);
720     for (i = 0; i < dev->n_mem_sections; ++i) {
721         if (dev->mem_sections[i].offset_within_address_space
722             == section->offset_within_address_space) {
723             --dev->n_mem_sections;
724             memmove(&dev->mem_sections[i], &dev->mem_sections[i+1],
725                     (dev->n_mem_sections - i) * sizeof(*dev->mem_sections));
726             break;
727         }
728     }
729 }
730 
731 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
732 {
733     struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
734     struct vhost_dev *hdev = iommu->hdev;
735     hwaddr iova = iotlb->iova + iommu->iommu_offset;
736 
737     if (vhost_backend_invalidate_device_iotlb(hdev, iova,
738                                               iotlb->addr_mask + 1)) {
739         error_report("Failed to invalidate device iotlb");
740     }
741 }
742 
743 static void vhost_iommu_region_add(MemoryListener *listener,
744                                    MemoryRegionSection *section)
745 {
746     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
747                                          iommu_listener);
748     struct vhost_iommu *iommu;
749     Int128 end;
750 
751     if (!memory_region_is_iommu(section->mr)) {
752         return;
753     }
754 
755     trace_vhost_iommu_region_add(dev, section->mr->name ?: NULL);
756 
757     iommu = g_malloc0(sizeof(*iommu));
758     end = int128_add(int128_make64(section->offset_within_region),
759                      section->size);
760     end = int128_sub(end, int128_one());
761     iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
762                         IOMMU_NOTIFIER_UNMAP,
763                         section->offset_within_region,
764                         int128_get64(end));
765     iommu->mr = section->mr;
766     iommu->iommu_offset = section->offset_within_address_space -
767                           section->offset_within_region;
768     iommu->hdev = dev;
769     memory_region_register_iommu_notifier(section->mr, &iommu->n);
770     QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
771     /* TODO: can replay help performance here? */
772 }
773 
774 static void vhost_iommu_region_del(MemoryListener *listener,
775                                    MemoryRegionSection *section)
776 {
777     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
778                                          iommu_listener);
779     struct vhost_iommu *iommu;
780 
781     if (!memory_region_is_iommu(section->mr)) {
782         return;
783     }
784 
785     trace_vhost_iommu_region_del(dev, section->mr->name ?: NULL);
786 
787     QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
788         if (iommu->mr == section->mr &&
789             iommu->n.start == section->offset_within_region) {
790             memory_region_unregister_iommu_notifier(iommu->mr,
791                                                     &iommu->n);
792             QLIST_REMOVE(iommu, iommu_next);
793             g_free(iommu);
794             break;
795         }
796     }
797 }
798 
799 static void vhost_region_nop(MemoryListener *listener,
800                              MemoryRegionSection *section)
801 {
802 }
803 
804 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
805                                     struct vhost_virtqueue *vq,
806                                     unsigned idx, bool enable_log)
807 {
808     struct vhost_vring_addr addr = {
809         .index = idx,
810         .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
811         .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
812         .used_user_addr = (uint64_t)(unsigned long)vq->used,
813         .log_guest_addr = vq->used_phys,
814         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
815     };
816     int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
817     if (r < 0) {
818         VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
819         return -errno;
820     }
821     return 0;
822 }
823 
824 static int vhost_dev_set_features(struct vhost_dev *dev,
825                                   bool enable_log)
826 {
827     uint64_t features = dev->acked_features;
828     int r;
829     if (enable_log) {
830         features |= 0x1ULL << VHOST_F_LOG_ALL;
831     }
832     r = dev->vhost_ops->vhost_set_features(dev, features);
833     if (r < 0) {
834         VHOST_OPS_DEBUG("vhost_set_features failed");
835     }
836     return r < 0 ? -errno : 0;
837 }
838 
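/* Toggle dirty logging on a running device: renegotiate the feature set
 * with or without VHOST_F_LOG_ALL, then reprogram every vring so its
 * VHOST_VRING_F_LOG flag matches.  If any step fails, the rings already
 * touched are put back to the previous logging state. */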
839 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
840 {
841     int r, i, idx;
842     r = vhost_dev_set_features(dev, enable_log);
843     if (r < 0) {
844         goto err_features;
845     }
846     for (i = 0; i < dev->nvqs; ++i) {
847         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
848         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
849                                      enable_log);
850         if (r < 0) {
851             goto err_vq;
852         }
853     }
854     return 0;
855 err_vq:
856     for (; i >= 0; --i) {
857         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
858         vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
859                                  dev->log_enabled);
860     }
861     vhost_dev_set_features(dev, dev->log_enabled);
862 err_features:
863     return r;
864 }
865 
866 static int vhost_migration_log(MemoryListener *listener, int enable)
867 {
868     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
869                                          memory_listener);
870     int r;
871     if (!!enable == dev->log_enabled) {
872         return 0;
873     }
874     if (!dev->started) {
875         dev->log_enabled = enable;
876         return 0;
877     }
878     if (!enable) {
879         r = vhost_dev_set_log(dev, false);
880         if (r < 0) {
881             return r;
882         }
883         vhost_log_put(dev, false);
884     } else {
885         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
886         r = vhost_dev_set_log(dev, true);
887         if (r < 0) {
888             return r;
889         }
890     }
891     dev->log_enabled = enable;
892     return 0;
893 }
894 
895 static void vhost_log_global_start(MemoryListener *listener)
896 {
897     int r;
898 
899     r = vhost_migration_log(listener, true);
900     if (r < 0) {
901         abort();
902     }
903 }
904 
905 static void vhost_log_global_stop(MemoryListener *listener)
906 {
907     int r;
908 
909     r = vhost_migration_log(listener, false);
910     if (r < 0) {
911         abort();
912     }
913 }
914 
915 static void vhost_log_start(MemoryListener *listener,
916                             MemoryRegionSection *section,
917                             int old, int new)
918 {
919     /* FIXME: implement */
920 }
921 
922 static void vhost_log_stop(MemoryListener *listener,
923                            MemoryRegionSection *section,
924                            int old, int new)
925 {
926     /* FIXME: implement */
927 }
928 
929 /* The vhost driver natively knows how to handle the vrings of non
930  * cross-endian legacy devices and modern devices. Only legacy devices
931  * exposed to a bi-endian guest may require the vhost driver to use a
932  * specific endianness.
933  */
934 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
935 {
936     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
937         return false;
938     }
939 #ifdef HOST_WORDS_BIGENDIAN
940     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
941 #else
942     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
943 #endif
944 }
945 
946 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
947                                                    bool is_big_endian,
948                                                    int vhost_vq_index)
949 {
950     struct vhost_vring_state s = {
951         .index = vhost_vq_index,
952         .num = is_big_endian
953     };
954 
955     if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
956         return 0;
957     }
958 
959     VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
960     if (errno == ENOTTY) {
961         error_report("vhost does not support cross-endian");
962         return -ENOSYS;
963     }
964 
965     return -errno;
966 }
967 
968 static int vhost_memory_region_lookup(struct vhost_dev *hdev,
969                                       uint64_t gpa, uint64_t *uaddr,
970                                       uint64_t *len)
971 {
972     int i;
973 
974     for (i = 0; i < hdev->mem->nregions; i++) {
975         struct vhost_memory_region *reg = hdev->mem->regions + i;
976 
977         if (gpa >= reg->guest_phys_addr &&
978             reg->guest_phys_addr + reg->memory_size > gpa) {
979             *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
980             *len = reg->guest_phys_addr + reg->memory_size - gpa;
981             return 0;
982         }
983     }
984 
985     return -EFAULT;
986 }
987 
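/* Handle an IOTLB miss reported by the backend: translate the I/O
 * virtual address through the virtio device's DMA address space, convert
 * the resulting guest-physical address to the QEMU userspace address the
 * backend can dereference, and push the entry back.  The update is
 * aligned to the IOMMU translation granule: iova is rounded down by
 * addr_mask and the length clamped to the smaller of the IOMMU page size
 * and what remains of the memory region. */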
988 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
989 {
990     IOMMUTLBEntry iotlb;
991     uint64_t uaddr, len;
992     int ret = -EFAULT;
993 
994     rcu_read_lock();
995 
996     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
997                                           iova, write);
998     if (iotlb.target_as != NULL) {
999         ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
1000                                          &uaddr, &len);
1001         if (ret) {
1002             error_report("Failed to look up the translated address "
1003                          "%"PRIx64, iotlb.translated_addr);
1004             goto out;
1005         }
1006 
1007         len = MIN(iotlb.addr_mask + 1, len);
1008         iova = iova & ~iotlb.addr_mask;
1009 
1010         ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
1011                                                 len, iotlb.perm);
1012         if (ret) {
1013             error_report("Failed to update device iotlb");
1014             goto out;
1015         }
1016     }
1017 out:
1018     rcu_read_unlock();
1019 
1020     return ret;
1021 }
1022 
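/* Bring one virtqueue up in the backend: program the ring size and the
 * last_avail_idx, apply the legacy cross-endian fixup if needed, map the
 * descriptor/avail/used rings and hand their addresses to the backend,
 * then wire up the kick eventfd.  If the transport reports guest
 * notifiers in use but this queue has no interrupt vector
 * (VIRTIO_NO_VECTOR), the call eventfd is disabled by passing fd = -1. */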
1023 static int vhost_virtqueue_start(struct vhost_dev *dev,
1024                                 struct VirtIODevice *vdev,
1025                                 struct vhost_virtqueue *vq,
1026                                 unsigned idx)
1027 {
1028     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1029     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1030     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1031     hwaddr s, l, a;
1032     int r;
1033     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1034     struct vhost_vring_file file = {
1035         .index = vhost_vq_index
1036     };
1037     struct vhost_vring_state state = {
1038         .index = vhost_vq_index
1039     };
1040     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1041 
1042 
1043     vq->num = state.num = virtio_queue_get_num(vdev, idx);
1044     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1045     if (r) {
1046         VHOST_OPS_DEBUG("vhost_set_vring_num failed");
1047         return -errno;
1048     }
1049 
1050     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1051     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1052     if (r) {
1053         VHOST_OPS_DEBUG("vhost_set_vring_base failed");
1054         return -errno;
1055     }
1056 
1057     if (vhost_needs_vring_endian(vdev)) {
1058         r = vhost_virtqueue_set_vring_endian_legacy(dev,
1059                                                     virtio_is_big_endian(vdev),
1060                                                     vhost_vq_index);
1061         if (r) {
1062             return -errno;
1063         }
1064     }
1065 
1066     vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1067     vq->desc_phys = a = virtio_queue_get_desc_addr(vdev, idx);
1068     vq->desc = vhost_memory_map(dev, a, &l, 0);
1069     if (!vq->desc || l != s) {
1070         r = -ENOMEM;
1071         goto fail_alloc_desc;
1072     }
1073     vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1074     vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1075     vq->avail = vhost_memory_map(dev, a, &l, 0);
1076     if (!vq->avail || l != s) {
1077         r = -ENOMEM;
1078         goto fail_alloc_avail;
1079     }
1080     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1081     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1082     vq->used = vhost_memory_map(dev, a, &l, 1);
1083     if (!vq->used || l != s) {
1084         r = -ENOMEM;
1085         goto fail_alloc_used;
1086     }
1087 
1088     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1089     if (r < 0) {
1090         r = -errno;
1091         goto fail_alloc;
1092     }
1093 
1094     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1095     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1096     if (r) {
1097         VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
1098         r = -errno;
1099         goto fail_kick;
1100     }
1101 
1102     /* Clear and discard previous events if any. */
1103     event_notifier_test_and_clear(&vq->masked_notifier);
1104 
1105     /* Init vring in unmasked state, unless guest_notifier_mask
1106      * will do it later.
1107      */
1108     if (!vdev->use_guest_notifier_mask) {
1109         /* TODO: check and handle errors. */
1110         vhost_virtqueue_mask(dev, vdev, idx, false);
1111     }
1112 
1113     if (k->query_guest_notifiers &&
1114         k->query_guest_notifiers(qbus->parent) &&
1115         virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1116         file.fd = -1;
1117         r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1118         if (r) {
1119             goto fail_vector;
1120         }
1121     }
1122 
1123     return 0;
1124 
1125 fail_vector:
1126 fail_kick:
1127 fail_alloc:
1128     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1129                        0, 0);
1130 fail_alloc_used:
1131     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1132                        0, 0);
1133 fail_alloc_avail:
1134     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1135                        0, 0);
1136 fail_alloc_desc:
1137     return r;
1138 }
1139 
1140 static void vhost_virtqueue_stop(struct vhost_dev *dev,
1141                                     struct VirtIODevice *vdev,
1142                                     struct vhost_virtqueue *vq,
1143                                     unsigned idx)
1144 {
1145     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1146     struct vhost_vring_state state = {
1147         .index = vhost_vq_index,
1148     };
1149     int r;
1150 
1151     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1152     if (r < 0) {
1153         VHOST_OPS_DEBUG("vhost VQ %d ring restore failed: %d", idx, r);
1154         /* Connection to the backend is broken, so let's sync internal
1155          * last avail idx to the device used idx.
1156          */
1157         virtio_queue_restore_last_avail_idx(vdev, idx);
1158     } else {
1159         virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1160     }
1161     virtio_queue_invalidate_signalled_used(vdev, idx);
1162     virtio_queue_update_used_idx(vdev, idx);
1163 
1164     /* In the cross-endian case, we need to reset the vring endianness to
1165      * native, which is what legacy devices expect by default.
1166      */
1167     if (vhost_needs_vring_endian(vdev)) {
1168         vhost_virtqueue_set_vring_endian_legacy(dev,
1169                                                 !virtio_is_big_endian(vdev),
1170                                                 vhost_vq_index);
1171     }
1172 
1173     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1174                        1, virtio_queue_get_used_size(vdev, idx));
1175     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1176                        0, virtio_queue_get_avail_size(vdev, idx));
1177     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1178                        0, virtio_queue_get_desc_size(vdev, idx));
1179 }
1180 
1181 static void vhost_eventfd_add(MemoryListener *listener,
1182                               MemoryRegionSection *section,
1183                               bool match_data, uint64_t data, EventNotifier *e)
1184 {
1185 }
1186 
1187 static void vhost_eventfd_del(MemoryListener *listener,
1188                               MemoryRegionSection *section,
1189                               bool match_data, uint64_t data, EventNotifier *e)
1190 {
1191 }
1192 
1193 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1194                                                 int n, uint32_t timeout)
1195 {
1196     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1197     struct vhost_vring_state state = {
1198         .index = vhost_vq_index,
1199         .num = timeout,
1200     };
1201     int r;
1202 
1203     if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1204         return -EINVAL;
1205     }
1206 
1207     r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1208     if (r) {
1209         VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
1210         return r;
1211     }
1212 
1213     return 0;
1214 }
1215 
1216 static int vhost_virtqueue_init(struct vhost_dev *dev,
1217                                 struct vhost_virtqueue *vq, int n)
1218 {
1219     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1220     struct vhost_vring_file file = {
1221         .index = vhost_vq_index,
1222     };
1223     int r = event_notifier_init(&vq->masked_notifier, 0);
1224     if (r < 0) {
1225         return r;
1226     }
1227 
1228     file.fd = event_notifier_get_fd(&vq->masked_notifier);
1229     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1230     if (r) {
1231         VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1232         r = -errno;
1233         goto fail_call;
1234     }
1235 
1236     vq->dev = dev;
1237 
1238     return 0;
1239 fail_call:
1240     event_notifier_cleanup(&vq->masked_notifier);
1241     return r;
1242 }
1243 
1244 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1245 {
1246     event_notifier_cleanup(&vq->masked_notifier);
1247 }
1248 
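/* One-time backend setup: select the backend ops, initialize the backend
 * from the caller-provided opaque handle, query its features, create the
 * per-virtqueue call notifiers, and install the memory/IOMMU listeners
 * plus a migration blocker when dirty logging is unavailable.  On failure
 * only the n_initialized_vqs virtqueues actually set up are torn down by
 * vhost_dev_cleanup().
 *
 * Illustrative call from a device backend (hypothetical names; the caller
 * is expected to have filled in nvqs, vqs and vq_index beforehand):
 *
 *   r = vhost_dev_init(&s->dev, opaque, VHOST_BACKEND_TYPE_KERNEL, 0);
 *   if (r < 0) {
 *       goto err;
 *   }
 */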
1249 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1250                    VhostBackendType backend_type, uint32_t busyloop_timeout)
1251 {
1252     uint64_t features;
1253     int i, r, n_initialized_vqs = 0;
1254     Error *local_err = NULL;
1255 
1256     hdev->vdev = NULL;
1257     hdev->migration_blocker = NULL;
1258 
1259     r = vhost_set_backend_type(hdev, backend_type);
1260     assert(r >= 0);
1261 
1262     r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);
1263     if (r < 0) {
1264         goto fail;
1265     }
1266 
1267     if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1268         error_report("vhost backend memory slots limit is less"
1269                 " than current number of present memory slots");
1270         r = -1;
1271         goto fail;
1272     }
1273 
1274     r = hdev->vhost_ops->vhost_set_owner(hdev);
1275     if (r < 0) {
1276         VHOST_OPS_DEBUG("vhost_set_owner failed");
1277         goto fail;
1278     }
1279 
1280     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1281     if (r < 0) {
1282         VHOST_OPS_DEBUG("vhost_get_features failed");
1283         goto fail;
1284     }
1285 
1286     for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1287         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1288         if (r < 0) {
1289             goto fail;
1290         }
1291     }
1292 
1293     if (busyloop_timeout) {
1294         for (i = 0; i < hdev->nvqs; ++i) {
1295             r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1296                                                      busyloop_timeout);
1297             if (r < 0) {
1298                 goto fail_busyloop;
1299             }
1300         }
1301     }
1302 
1303     hdev->features = features;
1304 
1305     hdev->memory_listener = (MemoryListener) {
1306         .begin = vhost_begin,
1307         .commit = vhost_commit,
1308         .region_add = vhost_region_add,
1309         .region_del = vhost_region_del,
1310         .region_nop = vhost_region_nop,
1311         .log_start = vhost_log_start,
1312         .log_stop = vhost_log_stop,
1313         .log_sync = vhost_log_sync,
1314         .log_global_start = vhost_log_global_start,
1315         .log_global_stop = vhost_log_global_stop,
1316         .eventfd_add = vhost_eventfd_add,
1317         .eventfd_del = vhost_eventfd_del,
1318         .priority = 10
1319     };
1320 
1321     hdev->iommu_listener = (MemoryListener) {
1322         .region_add = vhost_iommu_region_add,
1323         .region_del = vhost_iommu_region_del,
1324     };
1325 
1326     if (hdev->migration_blocker == NULL) {
1327         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1328             error_setg(&hdev->migration_blocker,
1329                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1330         } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_check()) {
1331             error_setg(&hdev->migration_blocker,
1332                        "Migration disabled: failed to allocate shared memory");
1333         }
1334     }
1335 
1336     if (hdev->migration_blocker != NULL) {
1337         r = migrate_add_blocker(hdev->migration_blocker, &local_err);
1338         if (local_err) {
1339             error_report_err(local_err);
1340             error_free(hdev->migration_blocker);
1341             goto fail_busyloop;
1342         }
1343     }
1344 
1345     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1346     hdev->n_mem_sections = 0;
1347     hdev->mem_sections = NULL;
1348     hdev->log = NULL;
1349     hdev->log_size = 0;
1350     hdev->log_enabled = false;
1351     hdev->started = false;
1352     hdev->memory_changed = false;
1353     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1354     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1355     return 0;
1356 
1357 fail_busyloop:
1358     while (--i >= 0) {
1359         vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1360     }
1361 fail:
1362     hdev->nvqs = n_initialized_vqs;
1363     vhost_dev_cleanup(hdev);
1364     return r;
1365 }
1366 
1367 void vhost_dev_cleanup(struct vhost_dev *hdev)
1368 {
1369     int i;
1370 
1371     for (i = 0; i < hdev->nvqs; ++i) {
1372         vhost_virtqueue_cleanup(hdev->vqs + i);
1373     }
1374     if (hdev->mem) {
1375         /* those are only safe after successful init */
1376         memory_listener_unregister(&hdev->memory_listener);
1377         QLIST_REMOVE(hdev, entry);
1378     }
1379     if (hdev->migration_blocker) {
1380         migrate_del_blocker(hdev->migration_blocker);
1381         error_free(hdev->migration_blocker);
1382     }
1383     g_free(hdev->mem);
1384     g_free(hdev->mem_sections);
1385     if (hdev->vhost_ops) {
1386         hdev->vhost_ops->vhost_backend_cleanup(hdev);
1387     }
1388     assert(!hdev->log);
1389 
1390     memset(hdev, 0, sizeof(struct vhost_dev));
1391 }
1392 
1393 /* Stop processing guest IO notifications in qemu.
1394  * Start processing them in the vhost backend.
1395  */
1396 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1397 {
1398     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1399     int i, r, e;
1400 
1401     /* We will pass the notifiers to the backend; make sure that QEMU
1402      * doesn't interfere.
1403      */
1404     r = virtio_device_grab_ioeventfd(vdev);
1405     if (r < 0) {
1406         error_report("binding does not support host notifiers");
1407         goto fail;
1408     }
1409 
1410     for (i = 0; i < hdev->nvqs; ++i) {
1411         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1412                                          true);
1413         if (r < 0) {
1414             error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1415             goto fail_vq;
1416         }
1417     }
1418 
1419     return 0;
1420 fail_vq:
1421     while (--i >= 0) {
1422         e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1423                                          false);
1424         if (e < 0) {
1425             error_report("vhost VQ %d notifier cleanup error: %d", i, -e);
1426         }
1427         assert(e >= 0);
1428     }
1429     virtio_device_release_ioeventfd(vdev);
1430 fail:
1431     return r;
1432 }
1433 
1434 /* Stop processing guest IO notifications in vhost.
1435  * Start processing them in qemu.
1436  * This might actually run the qemu handlers right away,
1437  * so virtio in qemu must be completely setup when this is called.
1438  */
1439 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1440 {
1441     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1442     int i, r;
1443 
1444     for (i = 0; i < hdev->nvqs; ++i) {
1445         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1446                                          false);
1447         if (r < 0) {
1448             error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1449         }
1450         assert(r >= 0);
1451     }
1452     virtio_device_release_ioeventfd(vdev);
1453 }
1454 
1455 /* Test and clear event pending status.
1456  * Should be called after unmask to avoid losing events.
1457  */
1458 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1459 {
1460     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1461     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1462     return event_notifier_test_and_clear(&vq->masked_notifier);
1463 }
1464 
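/* Masking redirects the backend's call eventfd: while masked, the
 * backend signals the per-queue masked_notifier instead of the guest
 * notifier, so interrupts are buffered rather than delivered.  After
 * unmasking, callers use vhost_virtqueue_pending() to pick up anything
 * that arrived in the meantime. */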
1465 /* Mask/unmask events from this vq. */
1466 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1467                          bool mask)
1468 {
1469     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1470     int r, index = n - hdev->vq_index;
1471     struct vhost_vring_file file;
1472 
1473     /* should only be called after backend is connected */
1474     assert(hdev->vhost_ops);
1475 
1476     if (mask) {
1477         assert(vdev->use_guest_notifier_mask);
1478         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1479     } else {
1480         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1481     }
1482 
1483     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1484     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1485     if (r < 0) {
1486         VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1487     }
1488 }
1489 
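/* Feature negotiation helpers: vhost_get_features() clears from the
 * device's offering every bit listed in feature_bits that the vhost
 * backend did not report, while vhost_ack_features() records in
 * acked_features the subset of those bits the guest accepted. */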
1490 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1491                             uint64_t features)
1492 {
1493     const int *bit = feature_bits;
1494     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1495         uint64_t bit_mask = (1ULL << *bit);
1496         if (!(hdev->features & bit_mask)) {
1497             features &= ~bit_mask;
1498         }
1499         bit++;
1500     }
1501     return features;
1502 }
1503 
1504 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1505                         uint64_t features)
1506 {
1507     const int *bit = feature_bits;
1508     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1509         uint64_t bit_mask = (1ULL << *bit);
1510         if (features & bit_mask) {
1511             hdev->acked_features |= bit_mask;
1512         }
1513         bit++;
1514     }
1515 }
1516 
1517 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
1518                          uint32_t config_len)
1519 {
1520     assert(hdev->vhost_ops);
1521 
1522     if (hdev->vhost_ops->vhost_get_config) {
1523         return hdev->vhost_ops->vhost_get_config(hdev, config, config_len);
1524     }
1525 
1526     return -1;
1527 }
1528 
1529 int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1530                          uint32_t offset, uint32_t size, uint32_t flags)
1531 {
1532     assert(hdev->vhost_ops);
1533 
1534     if (hdev->vhost_ops->vhost_set_config) {
1535         return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1536                                                  size, flags);
1537     }
1538 
1539     return -1;
1540 }
1541 
1542 void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
1543                                    const VhostDevConfigOps *ops)
1544 {
1545     assert(hdev->vhost_ops);
1546     hdev->config_ops = ops;
1547 }
1548 
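/* Whole-device start sequence: negotiate features (including
 * VHOST_F_LOG_ALL if logging is already enabled), install the memory
 * table, start every virtqueue, hand the backend a dirty log when
 * needed, and for IOMMU setups register the IOTLB callback and prime
 * the used-ring translations.  Failures unwind in reverse order. */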
1549 /* Host notifiers must be enabled at this point. */
1550 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1551 {
1552     int i, r;
1553 
1554     /* should only be called after backend is connected */
1555     assert(hdev->vhost_ops);
1556 
1557     hdev->started = true;
1558     hdev->vdev = vdev;
1559 
1560     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1561     if (r < 0) {
1562         goto fail_features;
1563     }
1564 
1565     if (vhost_dev_has_iommu(hdev)) {
1566         memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
1567     }
1568 
1569     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1570     if (r < 0) {
1571         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
1572         r = -errno;
1573         goto fail_mem;
1574     }
1575     for (i = 0; i < hdev->nvqs; ++i) {
1576         r = vhost_virtqueue_start(hdev,
1577                                   vdev,
1578                                   hdev->vqs + i,
1579                                   hdev->vq_index + i);
1580         if (r < 0) {
1581             goto fail_vq;
1582         }
1583     }
1584 
1585     if (hdev->log_enabled) {
1586         uint64_t log_base;
1587 
1588         hdev->log_size = vhost_get_log_size(hdev);
1589         hdev->log = vhost_log_get(hdev->log_size,
1590                                   vhost_dev_log_is_shared(hdev));
1591         log_base = (uintptr_t)hdev->log->log;
1592         r = hdev->vhost_ops->vhost_set_log_base(hdev,
1593                                                 hdev->log_size ? log_base : 0,
1594                                                 hdev->log);
1595         if (r < 0) {
1596             VHOST_OPS_DEBUG("vhost_set_log_base failed");
1597             r = -errno;
1598             goto fail_log;
1599         }
1600     }
1601 
1602     if (vhost_dev_has_iommu(hdev)) {
1603         hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
1604 
1605         /* Update used ring information for IOTLB to work correctly;
1606          * the vhost-kernel code requires this. */
1607         for (i = 0; i < hdev->nvqs; ++i) {
1608             struct vhost_virtqueue *vq = hdev->vqs + i;
1609             vhost_device_iotlb_miss(hdev, vq->used_phys, true);
1610         }
1611     }
1612     return 0;
1613 fail_log:
1614     vhost_log_put(hdev, false);
1615 fail_vq:
1616     while (--i >= 0) {
1617         vhost_virtqueue_stop(hdev,
1618                              vdev,
1619                              hdev->vqs + i,
1620                              hdev->vq_index + i);
1621     }
1622     i = hdev->nvqs;
1623 
1624 fail_mem:
1625 fail_features:
1626 
1627     hdev->started = false;
1628     return r;
1629 }
1630 
1631 /* Host notifiers must be enabled at this point. */
1632 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1633 {
1634     int i;
1635 
1636     /* should only be called after backend is connected */
1637     assert(hdev->vhost_ops);
1638 
1639     for (i = 0; i < hdev->nvqs; ++i) {
1640         vhost_virtqueue_stop(hdev,
1641                              vdev,
1642                              hdev->vqs + i,
1643                              hdev->vq_index + i);
1644     }
1645 
1646     if (vhost_dev_has_iommu(hdev)) {
1647         hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
1648         memory_listener_unregister(&hdev->iommu_listener);
1649     }
1650     vhost_log_put(hdev, true);
1651     hdev->started = false;
1652     hdev->vdev = NULL;
1653 }
1654 
1655 int vhost_net_set_backend(struct vhost_dev *hdev,
1656                           struct vhost_vring_file *file)
1657 {
1658     if (hdev->vhost_ops->vhost_net_set_backend) {
1659         return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
1660     }
1661 
1662     return -1;
1663 }
1664