xref: /openbmc/qemu/hw/virtio/vhost.c (revision 766aa0a6)
1 /*
2  * vhost support
3  *
4  * Copyright Red Hat, Inc. 2010
5  *
6  * Authors:
7  *  Michael S. Tsirkin <mst@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "qapi/error.h"
18 #include "hw/virtio/vhost.h"
19 #include "qemu/atomic.h"
20 #include "qemu/range.h"
21 #include "qemu/error-report.h"
22 #include "qemu/memfd.h"
23 #include "qemu/log.h"
24 #include "standard-headers/linux/vhost_types.h"
25 #include "hw/virtio/virtio-bus.h"
26 #include "hw/mem/memory-device.h"
27 #include "migration/blocker.h"
28 #include "migration/qemu-file-types.h"
29 #include "sysemu/dma.h"
30 #include "trace.h"
31 
32 /* enabled until disconnected backend stabilizes */
33 #define _VHOST_DEBUG 1
34 
35 #ifdef _VHOST_DEBUG
36 #define VHOST_OPS_DEBUG(retval, fmt, ...) \
37     do { \
38         error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
39                      strerror(-retval), -retval); \
40     } while (0)
41 #else
42 #define VHOST_OPS_DEBUG(retval, fmt, ...) \
43     do { } while (0)
44 #endif
45 
46 static struct vhost_log *vhost_log;
47 static struct vhost_log *vhost_log_shm;
48 
49 /* Memslots used by backends that support private memslots (without an fd). */
50 static unsigned int used_memslots;
51 
52 /* Memslots used by backends that only support shared memslots (with an fd). */
53 static unsigned int used_shared_memslots;
54 
55 static QLIST_HEAD(, vhost_dev) vhost_devices =
56     QLIST_HEAD_INITIALIZER(vhost_devices);
57 
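/*
 * Return the number of memslots still available to vhost devices: the
 * smallest per-backend headroom across all registered devices, taking
 * into account whether a backend counts shared (fd-based) or private
 * memslots.
 */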
58 unsigned int vhost_get_free_memslots(void)
59 {
60     unsigned int free = UINT_MAX;
61     struct vhost_dev *hdev;
62 
63     QLIST_FOREACH(hdev, &vhost_devices, entry) {
64         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
65         unsigned int cur_free;
66 
67         if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
68             hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
69             cur_free = r - used_shared_memslots;
70         } else {
71             cur_free = r - used_memslots;
72         }
73         free = MIN(free, cur_free);
74     }
75     return free;
76 }
77 
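/*
 * Scan the dirty log for the intersection of the section range
 * [mfirst, mlast] and the region range [rfirst, rlast], and mark any
 * logged pages dirty in the section's MemoryRegion. Each
 * vhost_log_chunk_t is a bitmap covering VHOST_LOG_CHUNK bytes of
 * guest memory (with 4 KiB log pages and 64-bit chunks that is
 * 64 * 4 KiB = 256 KiB per chunk).
 */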
78 static void vhost_dev_sync_region(struct vhost_dev *dev,
79                                   MemoryRegionSection *section,
80                                   uint64_t mfirst, uint64_t mlast,
81                                   uint64_t rfirst, uint64_t rlast)
82 {
83     vhost_log_chunk_t *dev_log = dev->log->log;
84 
85     uint64_t start = MAX(mfirst, rfirst);
86     uint64_t end = MIN(mlast, rlast);
87     vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
88     vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
89     uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
90 
91     if (end < start) {
92         return;
93     }
94     assert(end / VHOST_LOG_CHUNK < dev->log_size);
95     assert(start / VHOST_LOG_CHUNK < dev->log_size);
96 
97     for (;from < to; ++from) {
98         vhost_log_chunk_t log;
99         /* We first check with a non-atomic load: much cheaper,
100          * and we expect non-dirty to be the common case. */
101         if (!*from) {
102             addr += VHOST_LOG_CHUNK;
103             continue;
104         }
105         /* Data must be read atomically. We don't really need barrier semantics
106          * but it's easier to use atomic_* than roll our own. */
107         log = qatomic_xchg(from, 0);
108         while (log) {
109             int bit = ctzl(log);
110             hwaddr page_addr;
111             hwaddr section_offset;
112             hwaddr mr_offset;
113             page_addr = addr + bit * VHOST_LOG_PAGE;
114             section_offset = page_addr - section->offset_within_address_space;
115             mr_offset = section_offset + section->offset_within_region;
116             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
117             log &= ~(0x1ull << bit);
118         }
119         addr += VHOST_LOG_CHUNK;
120     }
121 }
122 
123 bool vhost_dev_has_iommu(struct vhost_dev *dev)
124 {
125     VirtIODevice *vdev = dev->vdev;
126 
127     /*
128      * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend supports the
129      * incremental memory mapping API via the IOTLB API. For platforms that
130      * do not have an IOMMU, there is no need to enable this feature,
131      * which may cause unnecessary IOTLB miss/update transactions.
132      */
133     if (vdev) {
134         return virtio_bus_device_iommu_enabled(vdev) &&
135             virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
136     } else {
137         return false;
138     }
139 }
140 
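/*
 * Sync dirty pages for a single memory section: walk all vhost memory
 * regions and all used rings that intersect [first, last]. When the
 * device sits behind an IOMMU, the used ring address is an IOVA and
 * must be translated, possibly in several IOMMU-page-sized pieces.
 */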
141 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
142                                    MemoryRegionSection *section,
143                                    hwaddr first,
144                                    hwaddr last)
145 {
146     int i;
147     hwaddr start_addr;
148     hwaddr end_addr;
149 
150     if (!dev->log_enabled || !dev->started) {
151         return 0;
152     }
153     start_addr = section->offset_within_address_space;
154     end_addr = range_get_last(start_addr, int128_get64(section->size));
155     start_addr = MAX(first, start_addr);
156     end_addr = MIN(last, end_addr);
157 
158     for (i = 0; i < dev->mem->nregions; ++i) {
159         struct vhost_memory_region *reg = dev->mem->regions + i;
160         vhost_dev_sync_region(dev, section, start_addr, end_addr,
161                               reg->guest_phys_addr,
162                               range_get_last(reg->guest_phys_addr,
163                                              reg->memory_size));
164     }
165     for (i = 0; i < dev->nvqs; ++i) {
166         struct vhost_virtqueue *vq = dev->vqs + i;
167 
168         if (!vq->used_phys && !vq->used_size) {
169             continue;
170         }
171 
172         if (vhost_dev_has_iommu(dev)) {
173             IOMMUTLBEntry iotlb;
174             hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
175             hwaddr phys, s, offset;
176 
177             while (used_size) {
178                 rcu_read_lock();
179                 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
180                                                       used_phys,
181                                                       true,
182                                                       MEMTXATTRS_UNSPECIFIED);
183                 rcu_read_unlock();
184 
185                 if (!iotlb.target_as) {
186                     qemu_log_mask(LOG_GUEST_ERROR, "translation "
187                                   "failure for used_iova %"PRIx64"\n",
188                                   used_phys);
189                     return -EINVAL;
190                 }
191 
192                 offset = used_phys & iotlb.addr_mask;
193                 phys = iotlb.translated_addr + offset;
194 
195                 /*
196                  * Distance from the start of the used ring to the last
197                  * byte of the IOMMU page.
198                  */
199                 s = iotlb.addr_mask - offset;
200                 /*
201                  * Size of the used ring, or of the part of it until the
202                  * end of the IOMMU page. To avoid a zero result, do the
203                  * addition outside of MIN().
204                  */
205                 s = MIN(s, used_size - 1) + 1;
206 
207                 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
208                                       range_get_last(phys, s));
209                 used_size -= s;
210                 used_phys += s;
211             }
212         } else {
213             vhost_dev_sync_region(dev, section, start_addr,
214                                   end_addr, vq->used_phys,
215                                   range_get_last(vq->used_phys, vq->used_size));
216         }
217     }
218     return 0;
219 }
220 
221 static void vhost_log_sync(MemoryListener *listener,
222                           MemoryRegionSection *section)
223 {
224     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
225                                          memory_listener);
226     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
227 }
228 
229 static void vhost_log_sync_range(struct vhost_dev *dev,
230                                  hwaddr first, hwaddr last)
231 {
232     int i;
233     /* FIXME: this is N^2 in number of sections */
234     for (i = 0; i < dev->n_mem_sections; ++i) {
235         MemoryRegionSection *section = &dev->mem_sections[i];
236         vhost_sync_dirty_bitmap(dev, section, first, last);
237     }
238 }
239 
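/*
 * Size of the dirty log, in chunks, needed to cover the highest guest
 * physical address of any memory region known to the device.
 */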
240 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
241 {
242     uint64_t log_size = 0;
243     int i;
244     for (i = 0; i < dev->mem->nregions; ++i) {
245         struct vhost_memory_region *reg = dev->mem->regions + i;
246         uint64_t last = range_get_last(reg->guest_phys_addr,
247                                        reg->memory_size);
248         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
249     }
250     return log_size;
251 }
252 
253 static int vhost_set_backend_type(struct vhost_dev *dev,
254                                   VhostBackendType backend_type)
255 {
256     int r = 0;
257 
258     switch (backend_type) {
259 #ifdef CONFIG_VHOST_KERNEL
260     case VHOST_BACKEND_TYPE_KERNEL:
261         dev->vhost_ops = &kernel_ops;
262         break;
263 #endif
264 #ifdef CONFIG_VHOST_USER
265     case VHOST_BACKEND_TYPE_USER:
266         dev->vhost_ops = &user_ops;
267         break;
268 #endif
269 #ifdef CONFIG_VHOST_VDPA
270     case VHOST_BACKEND_TYPE_VDPA:
271         dev->vhost_ops = &vdpa_ops;
272         break;
273 #endif
274     default:
275         error_report("Unknown vhost backend type");
276         r = -1;
277     }
278 
279     return r;
280 }
281 
282 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
283 {
284     Error *err = NULL;
285     struct vhost_log *log;
286     uint64_t logsize = size * sizeof(*(log->log));
287     int fd = -1;
288 
289     log = g_new0(struct vhost_log, 1);
290     if (share) {
291         log->log = qemu_memfd_alloc("vhost-log", logsize,
292                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
293                                     &fd, &err);
294         if (err) {
295             error_report_err(err);
296             g_free(log);
297             return NULL;
298         }
299         memset(log->log, 0, logsize);
300     } else {
301         log->log = g_malloc0(logsize);
302     }
303 
304     log->size = size;
305     log->refcnt = 1;
306     log->fd = fd;
307 
308     return log;
309 }
310 
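/*
 * Return the cached global (possibly shared) log, taking a reference,
 * or allocate a new one if none exists or the cached one has the wrong
 * size. The shared variant lives in a memfd so it can be mapped by an
 * external backend process.
 */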
311 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
312 {
313     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
314 
315     if (!log || log->size != size) {
316         log = vhost_log_alloc(size, share);
317         if (share) {
318             vhost_log_shm = log;
319         } else {
320             vhost_log = log;
321         }
322     } else {
323         ++log->refcnt;
324     }
325 
326     return log;
327 }
328 
329 static void vhost_log_put(struct vhost_dev *dev, bool sync)
330 {
331     struct vhost_log *log = dev->log;
332 
333     if (!log) {
334         return;
335     }
336 
337     --log->refcnt;
338     if (log->refcnt == 0) {
339         /* Sync only the range covered by the old log */
340         if (dev->log_size && sync) {
341             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
342         }
343 
344         if (vhost_log == log) {
345             g_free(log->log);
346             vhost_log = NULL;
347         } else if (vhost_log_shm == log) {
348             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
349                             log->fd);
350             vhost_log_shm = NULL;
351         }
352 
353         g_free(log);
354     }
355 
356     dev->log = NULL;
357     dev->log_size = 0;
358 }
359 
360 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
361 {
362     return dev->vhost_ops->vhost_requires_shm_log &&
363            dev->vhost_ops->vhost_requires_shm_log(dev);
364 }
365 
366 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
367 {
368     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
369     uint64_t log_base = (uintptr_t)log->log;
370     int r;
371 
372     /* Inform the backend of the log switch; this must be done before
373        releasing the current log, to ensure no logging is lost. */
374     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
375     if (r < 0) {
376         VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
377     }
378 
379     vhost_log_put(dev, true);
380     dev->log = log;
381     dev->log_size = size;
382 }
383 
384 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
385                               hwaddr *plen, bool is_write)
386 {
387     if (!vhost_dev_has_iommu(dev)) {
388         return cpu_physical_memory_map(addr, plen, is_write);
389     } else {
390         return (void *)(uintptr_t)addr;
391     }
392 }
393 
394 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
395                                hwaddr len, int is_write,
396                                hwaddr access_len)
397 {
398     if (!vhost_dev_has_iommu(dev)) {
399         cpu_physical_memory_unmap(buffer, len, is_write, access_len);
400     }
401 }
402 
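/*
 * Check one ring part (descriptor table, avail or used ring) against a
 * memory region: return 0 if the ring does not overlap the region or is
 * fully and identically mapped, -ENOMEM if the ring extends past the
 * region, and -EBUSY if the ring's HVA no longer matches the region's
 * mapping.
 */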
403 static int vhost_verify_ring_part_mapping(void *ring_hva,
404                                           uint64_t ring_gpa,
405                                           uint64_t ring_size,
406                                           void *reg_hva,
407                                           uint64_t reg_gpa,
408                                           uint64_t reg_size)
409 {
410     uint64_t hva_ring_offset;
411     uint64_t ring_last = range_get_last(ring_gpa, ring_size);
412     uint64_t reg_last = range_get_last(reg_gpa, reg_size);
413 
414     if (ring_last < reg_gpa || ring_gpa > reg_last) {
415         return 0;
416     }
417     /* check that the whole ring is mapped */
418     if (ring_last > reg_last) {
419         return -ENOMEM;
420     }
421     /* check that ring's MemoryRegion wasn't replaced */
422     hva_ring_offset = ring_gpa - reg_gpa;
423     if (ring_hva != reg_hva + hva_ring_offset) {
424         return -EBUSY;
425     }
426 
427     return 0;
428 }
429 
430 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
431                                       void *reg_hva,
432                                       uint64_t reg_gpa,
433                                       uint64_t reg_size)
434 {
435     int i, j;
436     int r = 0;
437     const char *part_name[] = {
438         "descriptor table",
439         "available ring",
440         "used ring"
441     };
442 
443     if (vhost_dev_has_iommu(dev)) {
444         return 0;
445     }
446 
447     for (i = 0; i < dev->nvqs; ++i) {
448         struct vhost_virtqueue *vq = dev->vqs + i;
449 
450         if (vq->desc_phys == 0) {
451             continue;
452         }
453 
454         j = 0;
455         r = vhost_verify_ring_part_mapping(
456                 vq->desc, vq->desc_phys, vq->desc_size,
457                 reg_hva, reg_gpa, reg_size);
458         if (r) {
459             break;
460         }
461 
462         j++;
463         r = vhost_verify_ring_part_mapping(
464                 vq->avail, vq->avail_phys, vq->avail_size,
465                 reg_hva, reg_gpa, reg_size);
466         if (r) {
467             break;
468         }
469 
470         j++;
471         r = vhost_verify_ring_part_mapping(
472                 vq->used, vq->used_phys, vq->used_size,
473                 reg_hva, reg_gpa, reg_size);
474         if (r) {
475             break;
476         }
477     }
478 
479     if (r == -ENOMEM) {
480         error_report("Unable to map %s for ring %d", part_name[j], i);
481     } else if (r == -EBUSY) {
482         error_report("%s relocated for ring %d", part_name[j], i);
483     }
484     return r;
485 }
486 
487 /*
488  * vhost_section: identify sections needed for vhost access
489  *
490  * We only care about RAM sections here (where virtqueue and guest
491  * internals accessed by virtio might live).
492  */
493 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
494 {
495     MemoryRegion *mr = section->mr;
496 
497     if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
498         uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
499         uint8_t handled_dirty;
500 
501         /*
502          * Kernel-based vhost doesn't handle any block that is doing
503          * dirty tracking other than migration, for which it has
504          * specific logging support. However, for TCG the kernel never
505          * gets involved anyway, so we can also ignore its
506          * self-modifying code detection flags. A vhost-user
507          * client could still confuse a TCG guest if it rewrites
508          * executable memory that has already been translated.
509          */
510         handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
511             (1 << DIRTY_MEMORY_CODE);
512 
513         if (dirty_mask & ~handled_dirty) {
514             trace_vhost_reject_section(mr->name, 1);
515             return false;
516         }
517 
518         /*
519          * Some backends (like vhost-user) can only handle memory regions
520          * that have an fd (can be mapped into a different process). Filter
521          * the ones without an fd out, if requested.
522          *
523          * TODO: we might have to limit to MAP_SHARED as well.
524          */
525         if (memory_region_get_fd(section->mr) < 0 &&
526             dev->vhost_ops->vhost_backend_no_private_memslots &&
527             dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
528             trace_vhost_reject_section(mr->name, 2);
529             return false;
530         }
531 
532         trace_vhost_section(mr->name);
533         return true;
534     } else {
535         trace_vhost_reject_section(mr->name, 3);
536         return false;
537     }
538 }
539 
540 static void vhost_begin(MemoryListener *listener)
541 {
542     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
543                                          memory_listener);
544     dev->tmp_sections = NULL;
545     dev->n_tmp_sections = 0;
546 }
547 
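/*
 * Called at the end of a memory transaction: compare the freshly
 * collected tmp_sections with the previous list and, if anything
 * changed, rebuild the vhost_memory region table, re-verify the ring
 * mappings of a started device, resize the dirty log if needed and
 * push the new table to the backend.
 */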
548 static void vhost_commit(MemoryListener *listener)
549 {
550     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
551                                          memory_listener);
552     MemoryRegionSection *old_sections;
553     int n_old_sections;
554     uint64_t log_size;
555     size_t regions_size;
556     int r;
557     int i;
558     bool changed = false;
559 
560     /* Note we can be called before the device is started, but then
561      * starting the device calls set_mem_table, so we need to have
562      * built the data structures.
563      */
564     old_sections = dev->mem_sections;
565     n_old_sections = dev->n_mem_sections;
566     dev->mem_sections = dev->tmp_sections;
567     dev->n_mem_sections = dev->n_tmp_sections;
568 
569     if (dev->n_mem_sections != n_old_sections) {
570         changed = true;
571         /* Same size, let's check the contents */
572         /* Same size, lets check the contents */
573         for (i = 0; i < n_old_sections; i++) {
574             if (!MemoryRegionSection_eq(&old_sections[i],
575                                         &dev->mem_sections[i])) {
576                 changed = true;
577                 break;
578             }
579         }
580     }
581 
582     trace_vhost_commit(dev->started, changed);
583     if (!changed) {
584         goto out;
585     }
586 
587     /* Rebuild the regions list from the new sections list */
588     regions_size = offsetof(struct vhost_memory, regions) +
589                        dev->n_mem_sections * sizeof dev->mem->regions[0];
590     dev->mem = g_realloc(dev->mem, regions_size);
591     dev->mem->nregions = dev->n_mem_sections;
592 
593     if (dev->vhost_ops->vhost_backend_no_private_memslots &&
594         dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
595         used_shared_memslots = dev->mem->nregions;
596     } else {
597         used_memslots = dev->mem->nregions;
598     }
599 
600     for (i = 0; i < dev->n_mem_sections; i++) {
601         struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
602         struct MemoryRegionSection *mrs = dev->mem_sections + i;
603 
604         cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
605         cur_vmr->memory_size     = int128_get64(mrs->size);
606         cur_vmr->userspace_addr  =
607             (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
608             mrs->offset_within_region;
609         cur_vmr->flags_padding   = 0;
610     }
611 
612     if (!dev->started) {
613         goto out;
614     }
615 
616     for (i = 0; i < dev->mem->nregions; i++) {
617         if (vhost_verify_ring_mappings(dev,
618                        (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
619                        dev->mem->regions[i].guest_phys_addr,
620                        dev->mem->regions[i].memory_size)) {
621             error_report("Verify ring failure on region %d", i);
622             abort();
623         }
624     }
625 
626     if (!dev->log_enabled) {
627         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
628         if (r < 0) {
629             VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
630         }
631         goto out;
632     }
633     log_size = vhost_get_log_size(dev);
634     /* We allocate an extra 4K bytes of log
635      * to reduce the number of reallocations. */
636 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
637     /* To log more, must increase log size before table update. */
638     if (dev->log_size < log_size) {
639         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
640     }
641     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
642     if (r < 0) {
643         VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
644     }
645     /* To log less, can only decrease log size after table update. */
646     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
647         vhost_dev_log_resize(dev, log_size);
648     }
649 
650 out:
651     /* Unref the old list of sections; this must happen _after_ the
652      * vhost_set_mem_table call to ensure the client isn't still using a
653      * section we're about to unref.
654      */
655     while (n_old_sections--) {
656         memory_region_unref(old_sections[n_old_sections].mr);
657     }
658     g_free(old_sections);
659     return;
660 }
661 
662 /* Add the section data to the tmp_sections structure.
663  * This relies on the listener calling us in memory address order,
664  * once for each region (via the _add and _nop methods), so that
665  * neighbouring sections can be joined.
666  */
667 static void vhost_region_add_section(struct vhost_dev *dev,
668                                      MemoryRegionSection *section)
669 {
670     bool need_add = true;
671     uint64_t mrs_size = int128_get64(section->size);
672     uint64_t mrs_gpa = section->offset_within_address_space;
673     uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
674                          section->offset_within_region;
675     RAMBlock *mrs_rb = section->mr->ram_block;
676 
677     trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
678                                    mrs_host);
679 
680     if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
681         /* Round the section to its page size */
682         /* First align the start down to a page boundary */
683         size_t mrs_page = qemu_ram_pagesize(mrs_rb);
684         uint64_t alignage = mrs_host & (mrs_page - 1);
685         if (alignage) {
686             mrs_host -= alignage;
687             mrs_size += alignage;
688             mrs_gpa  -= alignage;
689         }
690         /* Now align the size up to a page boundary */
691         alignage = mrs_size & (mrs_page - 1);
692         if (alignage) {
693             mrs_size += mrs_page - alignage;
694         }
695         trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
696                                                mrs_size, mrs_host);
697     }
698 
699     if (dev->n_tmp_sections) {
700         /* Since we already have at least one section, let's see if
701          * this extends it; since we're scanning in order, we only
702          * have to look at the last one, and the FlatView that calls
703          * us shouldn't have overlaps.
704          */
705         MemoryRegionSection *prev_sec = dev->tmp_sections +
706                                                (dev->n_tmp_sections - 1);
707         uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
708         uint64_t prev_size = int128_get64(prev_sec->size);
709         uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
710         uint64_t prev_host_start =
711                         (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
712                         prev_sec->offset_within_region;
713         uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);
714 
715         if (mrs_gpa <= (prev_gpa_end + 1)) {
716             /* OK, looks like overlapping/intersecting - it's possible that
717              * the rounding to page sizes has made them overlap, but they should
718              * match up in the same RAMBlock if they do.
719              */
720             if (mrs_gpa < prev_gpa_start) {
721                 error_report("%s:Section '%s' rounded to %"PRIx64
722                              " prior to previous '%s' %"PRIx64,
723                              __func__, section->mr->name, mrs_gpa,
724                              prev_sec->mr->name, prev_gpa_start);
725                 /* A way to cleanly fail here would be better */
726                 return;
727             }
728             /* Offset from the start of the previous GPA to this GPA */
729             size_t offset = mrs_gpa - prev_gpa_start;
730 
731             if (prev_host_start + offset == mrs_host &&
732                 section->mr == prev_sec->mr) {
733                 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
734                 need_add = false;
735                 prev_sec->offset_within_address_space =
736                     MIN(prev_gpa_start, mrs_gpa);
737                 prev_sec->offset_within_region =
738                     MIN(prev_host_start, mrs_host) -
739                     (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
740                 prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
741                                                mrs_host));
742                 trace_vhost_region_add_section_merge(section->mr->name,
743                                         int128_get64(prev_sec->size),
744                                         prev_sec->offset_within_address_space,
745                                         prev_sec->offset_within_region);
746             } else {
747                 /* adjoining regions are fine, but overlapping ones with
748                  * different blocks/offsets shouldn't happen
749                  */
750                 if (mrs_gpa != prev_gpa_end + 1) {
751                     error_report("%s: Overlapping but not coherent sections "
752                                  "at %"PRIx64,
753                                  __func__, mrs_gpa);
754                     return;
755                 }
756             }
757         }
758     }
759 
760     if (need_add) {
761         ++dev->n_tmp_sections;
762         dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
763                                     dev->n_tmp_sections);
764         dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
765         /* The flatview isn't stable and we don't use it; making it NULL
766          * means we can memcmp the list.
767          */
768         dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
769         memory_region_ref(section->mr);
770     }
771 }
772 
773 /* Used for both add and nop callbacks */
774 static void vhost_region_addnop(MemoryListener *listener,
775                                 MemoryRegionSection *section)
776 {
777     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
778                                          memory_listener);
779 
780     if (!vhost_section(dev, section)) {
781         return;
782     }
783     vhost_region_add_section(dev, section);
784 }
785 
786 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
787 {
788     struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
789     struct vhost_dev *hdev = iommu->hdev;
790     hwaddr iova = iotlb->iova + iommu->iommu_offset;
791 
792     if (vhost_backend_invalidate_device_iotlb(hdev, iova,
793                                               iotlb->addr_mask + 1)) {
794         error_report("Failed to invalidate device iotlb");
795     }
796 }
797 
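/*
 * Register an unmap notifier on an IOMMU memory region so that the
 * backend's device IOTLB can be invalidated when the guest changes its
 * mappings. Device-IOTLB-style notifications are requested when the
 * virtio device has enabled them.
 */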
798 static void vhost_iommu_region_add(MemoryListener *listener,
799                                    MemoryRegionSection *section)
800 {
801     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
802                                          iommu_listener);
803     struct vhost_iommu *iommu;
804     Int128 end;
805     int iommu_idx;
806     IOMMUMemoryRegion *iommu_mr;
807 
808     if (!memory_region_is_iommu(section->mr)) {
809         return;
810     }
811 
812     iommu_mr = IOMMU_MEMORY_REGION(section->mr);
813 
814     iommu = g_malloc0(sizeof(*iommu));
815     end = int128_add(int128_make64(section->offset_within_region),
816                      section->size);
817     end = int128_sub(end, int128_one());
818     iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
819                                                    MEMTXATTRS_UNSPECIFIED);
820     iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
821                         dev->vdev->device_iotlb_enabled ?
822                             IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
823                             IOMMU_NOTIFIER_UNMAP,
824                         section->offset_within_region,
825                         int128_get64(end),
826                         iommu_idx);
827     iommu->mr = section->mr;
828     iommu->iommu_offset = section->offset_within_address_space -
829                           section->offset_within_region;
830     iommu->hdev = dev;
831     memory_region_register_iommu_notifier(section->mr, &iommu->n,
832                                           &error_fatal);
833     QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
834     /* TODO: can replay help performance here? */
835 }
836 
837 static void vhost_iommu_region_del(MemoryListener *listener,
838                                    MemoryRegionSection *section)
839 {
840     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
841                                          iommu_listener);
842     struct vhost_iommu *iommu;
843 
844     if (!memory_region_is_iommu(section->mr)) {
845         return;
846     }
847 
848     QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
849         if (iommu->mr == section->mr &&
850             iommu->n.start == section->offset_within_region) {
851             memory_region_unregister_iommu_notifier(iommu->mr,
852                                                     &iommu->n);
853             QLIST_REMOVE(iommu, iommu_next);
854             g_free(iommu);
855             break;
856         }
857     }
858 }
859 
860 void vhost_toggle_device_iotlb(VirtIODevice *vdev)
861 {
862     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
863     struct vhost_dev *dev;
864     struct vhost_iommu *iommu;
865 
866     if (vdev->vhost_started) {
867         dev = vdc->get_vhost(vdev);
868     } else {
869         return;
870     }
871 
872     QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
873         memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
874         iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
875                 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
876         memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
877                                               &error_fatal);
878     }
879 }
880 
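/*
 * Tell the backend where the descriptor table, avail ring and used
 * ring of one virtqueue live (in host virtual addresses unless the
 * backend provides its own vhost_vq_get_addr), and whether dirty
 * logging is enabled for this ring.
 */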
881 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
882                                     struct vhost_virtqueue *vq,
883                                     unsigned idx, bool enable_log)
884 {
885     struct vhost_vring_addr addr;
886     int r;
887     memset(&addr, 0, sizeof(struct vhost_vring_addr));
888 
889     if (dev->vhost_ops->vhost_vq_get_addr) {
890         r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
891         if (r < 0) {
892             VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
893             return r;
894         }
895     } else {
896         addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
897         addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
898         addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
899     }
900     addr.index = idx;
901     addr.log_guest_addr = vq->used_phys;
902     addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
903     r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
904     if (r < 0) {
905         VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
906     }
907     return r;
908 }
909 
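/*
 * Push the negotiated feature bits to the backend, adding
 * VHOST_F_LOG_ALL while dirty logging is active and masking
 * VIRTIO_F_IOMMU_PLATFORM when no vIOMMU is in use (unless the backend
 * forces it).
 */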
910 static int vhost_dev_set_features(struct vhost_dev *dev,
911                                   bool enable_log)
912 {
913     uint64_t features = dev->acked_features;
914     int r;
915     if (enable_log) {
916         features |= 0x1ULL << VHOST_F_LOG_ALL;
917     }
918     if (!vhost_dev_has_iommu(dev)) {
919         features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
920     }
921     if (dev->vhost_ops->vhost_force_iommu) {
922         if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
923             features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
924        }
925     }
926     r = dev->vhost_ops->vhost_set_features(dev, features);
927     if (r < 0) {
928         VHOST_OPS_DEBUG(r, "vhost_set_features failed");
929         goto out;
930     }
931     if (dev->vhost_ops->vhost_set_backend_cap) {
932         r = dev->vhost_ops->vhost_set_backend_cap(dev);
933         if (r < 0) {
934             VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
935             goto out;
936         }
937     }
938 
939 out:
940     return r;
941 }
942 
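/*
 * Enable or disable dirty logging on every started virtqueue by
 * re-sending features and vring addresses; on failure, the queues
 * already touched are rolled back to the previous logging state.
 */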
943 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
944 {
945     int r, i, idx;
946     hwaddr addr;
947 
948     r = vhost_dev_set_features(dev, enable_log);
949     if (r < 0) {
950         goto err_features;
951     }
952     for (i = 0; i < dev->nvqs; ++i) {
953         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
954         addr = virtio_queue_get_desc_addr(dev->vdev, idx);
955         if (!addr) {
956             /*
957              * The queue might not be ready for start. If this
958              * is the case, there is no reason to continue the process.
959              * Similar logic is used by the vhost_virtqueue_start()
960              * routine.
961              */
962             continue;
963         }
964         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
965                                      enable_log);
966         if (r < 0) {
967             goto err_vq;
968         }
969     }
970     return 0;
971 err_vq:
972     for (; i >= 0; --i) {
973         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
974         addr = virtio_queue_get_desc_addr(dev->vdev, idx);
975         if (!addr) {
976             continue;
977         }
978         vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
979                                  dev->log_enabled);
980     }
981     vhost_dev_set_features(dev, dev->log_enabled);
982 err_features:
983     return r;
984 }
985 
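/*
 * React to the memory listener's log_global_start/stop callbacks:
 * allocate and install a dirty log when migration begins, or drop it
 * when migration ends. Devices that are not started only record the
 * requested state.
 */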
986 static int vhost_migration_log(MemoryListener *listener, bool enable)
987 {
988     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
989                                          memory_listener);
990     int r;
991     if (enable == dev->log_enabled) {
992         return 0;
993     }
994     if (!dev->started) {
995         dev->log_enabled = enable;
996         return 0;
997     }
998 
999     r = 0;
1000     if (!enable) {
1001         r = vhost_dev_set_log(dev, false);
1002         if (r < 0) {
1003             goto check_dev_state;
1004         }
1005         vhost_log_put(dev, false);
1006     } else {
1007         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
1008         r = vhost_dev_set_log(dev, true);
1009         if (r < 0) {
1010             goto check_dev_state;
1011         }
1012     }
1013 
1014 check_dev_state:
1015     dev->log_enabled = enable;
1016     /*
1017      * vhost-user-* devices could change their state during log
1018      * initialization due to disconnect. So check dev state after
1019      * vhost communication.
1020      */
1021     if (!dev->started) {
1022         /*
1023          * Since the device is in the stopped state, it is okay for
1024          * migration. Return success.
1025          */
1026         r = 0;
1027     }
1028     if (r) {
1029         /* An error occurred. */
1030         dev->log_enabled = false;
1031     }
1032 
1033     return r;
1034 }
1035 
1036 static void vhost_log_global_start(MemoryListener *listener)
1037 {
1038     int r;
1039 
1040     r = vhost_migration_log(listener, true);
1041     if (r < 0) {
1042         abort();
1043     }
1044 }
1045 
1046 static void vhost_log_global_stop(MemoryListener *listener)
1047 {
1048     int r;
1049 
1050     r = vhost_migration_log(listener, false);
1051     if (r < 0) {
1052         abort();
1053     }
1054 }
1055 
1056 static void vhost_log_start(MemoryListener *listener,
1057                             MemoryRegionSection *section,
1058                             int old, int new)
1059 {
1060     /* FIXME: implement */
1061 }
1062 
1063 static void vhost_log_stop(MemoryListener *listener,
1064                            MemoryRegionSection *section,
1065                            int old, int new)
1066 {
1067     /* FIXME: implement */
1068 }
1069 
1070 /* The vhost driver natively knows how to handle the vrings of non
1071  * cross-endian legacy devices and modern devices. Only legacy devices
1072  * exposed to a bi-endian guest may require the vhost driver to use a
1073  * specific endianness.
1074  */
1075 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
1076 {
1077     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1078         return false;
1079     }
1080 #if HOST_BIG_ENDIAN
1081     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
1082 #else
1083     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
1084 #endif
1085 }
1086 
1087 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
1088                                                    bool is_big_endian,
1089                                                    int vhost_vq_index)
1090 {
1091     int r;
1092     struct vhost_vring_state s = {
1093         .index = vhost_vq_index,
1094         .num = is_big_endian
1095     };
1096 
1097     r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
1098     if (r < 0) {
1099         VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
1100     }
1101     return r;
1102 }
1103 
1104 static int vhost_memory_region_lookup(struct vhost_dev *hdev,
1105                                       uint64_t gpa, uint64_t *uaddr,
1106                                       uint64_t *len)
1107 {
1108     int i;
1109 
1110     for (i = 0; i < hdev->mem->nregions; i++) {
1111         struct vhost_memory_region *reg = hdev->mem->regions + i;
1112 
1113         if (gpa >= reg->guest_phys_addr &&
1114             reg->guest_phys_addr + reg->memory_size > gpa) {
1115             *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
1116             *len = reg->guest_phys_addr + reg->memory_size - gpa;
1117             return 0;
1118         }
1119     }
1120 
1121     return -EFAULT;
1122 }
1123 
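/*
 * Handle an IOTLB miss reported by the backend: translate the IOVA
 * through the device's DMA address space, look up the corresponding
 * userspace address in the vhost memory table, and push the resulting
 * mapping back to the backend.
 */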
1124 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
1125 {
1126     IOMMUTLBEntry iotlb;
1127     uint64_t uaddr, len;
1128     int ret = -EFAULT;
1129 
1130     RCU_READ_LOCK_GUARD();
1131 
1132     trace_vhost_iotlb_miss(dev, 1);
1133 
1134     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
1135                                           iova, write,
1136                                           MEMTXATTRS_UNSPECIFIED);
1137     if (iotlb.target_as != NULL) {
1138         ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
1139                                          &uaddr, &len);
1140         if (ret) {
1141             trace_vhost_iotlb_miss(dev, 3);
1142             error_report("Failed to look up the translated address "
1143                          "%"PRIx64, iotlb.translated_addr);
1144             goto out;
1145         }
1146 
1147         len = MIN(iotlb.addr_mask + 1, len);
1148         iova = iova & ~iotlb.addr_mask;
1149 
1150         ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
1151                                                 len, iotlb.perm);
1152         if (ret) {
1153             trace_vhost_iotlb_miss(dev, 4);
1154             error_report("Failed to update device iotlb");
1155             goto out;
1156         }
1157     }
1158 
1159     trace_vhost_iotlb_miss(dev, 2);
1160 
1161 out:
1162     return ret;
1163 }
1164 
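/*
 * Program one virtqueue into the backend: vring size and base index,
 * legacy endianness if required, the mapped descriptor/avail/used
 * rings, and the kick eventfd. The call eventfd is disabled (-1) when
 * the transport reports that no vector is assigned to the queue.
 */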
1165 int vhost_virtqueue_start(struct vhost_dev *dev,
1166                           struct VirtIODevice *vdev,
1167                           struct vhost_virtqueue *vq,
1168                           unsigned idx)
1169 {
1170     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1171     VirtioBusState *vbus = VIRTIO_BUS(qbus);
1172     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1173     hwaddr s, l, a;
1174     int r;
1175     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1176     struct vhost_vring_file file = {
1177         .index = vhost_vq_index
1178     };
1179     struct vhost_vring_state state = {
1180         .index = vhost_vq_index
1181     };
1182     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1183 
1184     a = virtio_queue_get_desc_addr(vdev, idx);
1185     if (a == 0) {
1186         /* Queue might not be ready for start */
1187         return 0;
1188     }
1189 
1190     vq->num = state.num = virtio_queue_get_num(vdev, idx);
1191     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1192     if (r) {
1193         VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
1194         return r;
1195     }
1196 
1197     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1198     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1199     if (r) {
1200         VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
1201         return r;
1202     }
1203 
1204     if (vhost_needs_vring_endian(vdev)) {
1205         r = vhost_virtqueue_set_vring_endian_legacy(dev,
1206                                                     virtio_is_big_endian(vdev),
1207                                                     vhost_vq_index);
1208         if (r) {
1209             return r;
1210         }
1211     }
1212 
1213     vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1214     vq->desc_phys = a;
1215     vq->desc = vhost_memory_map(dev, a, &l, false);
1216     if (!vq->desc || l != s) {
1217         r = -ENOMEM;
1218         goto fail_alloc_desc;
1219     }
1220     vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1221     vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1222     vq->avail = vhost_memory_map(dev, a, &l, false);
1223     if (!vq->avail || l != s) {
1224         r = -ENOMEM;
1225         goto fail_alloc_avail;
1226     }
1227     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1228     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1229     vq->used = vhost_memory_map(dev, a, &l, true);
1230     if (!vq->used || l != s) {
1231         r = -ENOMEM;
1232         goto fail_alloc_used;
1233     }
1234 
1235     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1236     if (r < 0) {
1237         goto fail_alloc;
1238     }
1239 
1240     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1241     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1242     if (r) {
1243         VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
1244         goto fail_kick;
1245     }
1246 
1247     /* Clear and discard previous events if any. */
1248     event_notifier_test_and_clear(&vq->masked_notifier);
1249 
1250     /* Init vring in unmasked state, unless guest_notifier_mask
1251      * will do it later.
1252      */
1253     if (!vdev->use_guest_notifier_mask) {
1254         /* TODO: check and handle errors. */
1255         vhost_virtqueue_mask(dev, vdev, idx, false);
1256     }
1257 
1258     if (k->query_guest_notifiers &&
1259         k->query_guest_notifiers(qbus->parent) &&
1260         virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1261         file.fd = -1;
1262         r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1263         if (r) {
1264             goto fail_vector;
1265         }
1266     }
1267 
1268     return 0;
1269 
1270 fail_vector:
1271 fail_kick:
1272 fail_alloc:
1273     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1274                        0, 0);
1275 fail_alloc_used:
1276     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1277                        0, 0);
1278 fail_alloc_avail:
1279     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1280                        0, 0);
1281 fail_alloc_desc:
1282     return r;
1283 }
1284 
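/*
 * Tear down one virtqueue: fetch the last available index back from
 * the backend (falling back to the device's used index if the backend
 * is unreachable), restore legacy endianness if it was overridden, and
 * unmap the rings.
 */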
1285 void vhost_virtqueue_stop(struct vhost_dev *dev,
1286                           struct VirtIODevice *vdev,
1287                           struct vhost_virtqueue *vq,
1288                           unsigned idx)
1289 {
1290     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1291     struct vhost_vring_state state = {
1292         .index = vhost_vq_index,
1293     };
1294     int r;
1295 
1296     if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
1297         /* Don't stop a virtqueue which might not have been started */
1298         return;
1299     }
1300 
1301     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1302     if (r < 0) {
1303         VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
1304         /* Connection to the backend is broken, so let's sync internal
1305          * last avail idx to the device used idx.
1306          */
1307         virtio_queue_restore_last_avail_idx(vdev, idx);
1308     } else {
1309         virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1310     }
1311     virtio_queue_invalidate_signalled_used(vdev, idx);
1312     virtio_queue_update_used_idx(vdev, idx);
1313 
1314     /* In the cross-endian case, we need to reset the vring endianness back
1315      * to native, as legacy devices expect by default.
1316      */
1317     if (vhost_needs_vring_endian(vdev)) {
1318         vhost_virtqueue_set_vring_endian_legacy(dev,
1319                                                 !virtio_is_big_endian(vdev),
1320                                                 vhost_vq_index);
1321     }
1322 
1323     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1324                        1, virtio_queue_get_used_size(vdev, idx));
1325     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1326                        0, virtio_queue_get_avail_size(vdev, idx));
1327     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1328                        0, virtio_queue_get_desc_size(vdev, idx));
1329 }
1330 
1331 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1332                                                 int n, uint32_t timeout)
1333 {
1334     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1335     struct vhost_vring_state state = {
1336         .index = vhost_vq_index,
1337         .num = timeout,
1338     };
1339     int r;
1340 
1341     if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1342         return -EINVAL;
1343     }
1344 
1345     r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1346     if (r) {
1347         VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
1348         return r;
1349     }
1350 
1351     return 0;
1352 }
1353 
1354 static void vhost_virtqueue_error_notifier(EventNotifier *n)
1355 {
1356     struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
1357                                               error_notifier);
1358     struct vhost_dev *dev = vq->dev;
1359     int index = vq - dev->vqs;
1360 
1361     if (event_notifier_test_and_clear(n) && dev->vdev) {
1362         VHOST_OPS_DEBUG(-EINVAL,  "vhost vring error in virtqueue %d",
1363                         dev->vq_index + index);
1364     }
1365 }
1366 
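/*
 * One-time virtqueue setup at device init: route the backend's call
 * notifications to the masked notifier and, if supported, install an
 * error eventfd whose handler reports vring errors.
 */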
1367 static int vhost_virtqueue_init(struct vhost_dev *dev,
1368                                 struct vhost_virtqueue *vq, int n)
1369 {
1370     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1371     struct vhost_vring_file file = {
1372         .index = vhost_vq_index,
1373     };
1374     int r = event_notifier_init(&vq->masked_notifier, 0);
1375     if (r < 0) {
1376         return r;
1377     }
1378 
1379     file.fd = event_notifier_get_wfd(&vq->masked_notifier);
1380     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1381     if (r) {
1382         VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
1383         goto fail_call;
1384     }
1385 
1386     vq->dev = dev;
1387 
1388     if (dev->vhost_ops->vhost_set_vring_err) {
1389         r = event_notifier_init(&vq->error_notifier, 0);
1390         if (r < 0) {
1391             goto fail_call;
1392         }
1393 
1394         file.fd = event_notifier_get_fd(&vq->error_notifier);
1395         r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
1396         if (r) {
1397             VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
1398             goto fail_err;
1399         }
1400 
1401         event_notifier_set_handler(&vq->error_notifier,
1402                                    vhost_virtqueue_error_notifier);
1403     }
1404 
1405     return 0;
1406 
1407 fail_err:
1408     event_notifier_cleanup(&vq->error_notifier);
1409 fail_call:
1410     event_notifier_cleanup(&vq->masked_notifier);
1411     return r;
1412 }
1413 
1414 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1415 {
1416     event_notifier_cleanup(&vq->masked_notifier);
1417     if (vq->dev->vhost_ops->vhost_set_vring_err) {
1418         event_notifier_set_handler(&vq->error_notifier, NULL);
1419         event_notifier_cleanup(&vq->error_notifier);
1420     }
1421 }
1422 
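/*
 * Initialise a vhost device: select the backend, take ownership, query
 * features, initialise every virtqueue (and optional busyloop
 * timeouts), register the memory listeners, install a migration
 * blocker when dirty logging is unavailable, and finally check that
 * enough backend memslots remain for the already used and reserved
 * slots.
 */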
1423 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1424                    VhostBackendType backend_type, uint32_t busyloop_timeout,
1425                    Error **errp)
1426 {
1427     unsigned int used, reserved, limit;
1428     uint64_t features;
1429     int i, r, n_initialized_vqs = 0;
1430 
1431     hdev->vdev = NULL;
1432     hdev->migration_blocker = NULL;
1433 
1434     r = vhost_set_backend_type(hdev, backend_type);
1435     assert(r >= 0);
1436 
1437     r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
1438     if (r < 0) {
1439         goto fail;
1440     }
1441 
1442     r = hdev->vhost_ops->vhost_set_owner(hdev);
1443     if (r < 0) {
1444         error_setg_errno(errp, -r, "vhost_set_owner failed");
1445         goto fail;
1446     }
1447 
1448     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1449     if (r < 0) {
1450         error_setg_errno(errp, -r, "vhost_get_features failed");
1451         goto fail;
1452     }
1453 
1454     for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1455         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1456         if (r < 0) {
1457             error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
1458             goto fail;
1459         }
1460     }
1461 
1462     if (busyloop_timeout) {
1463         for (i = 0; i < hdev->nvqs; ++i) {
1464             r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1465                                                      busyloop_timeout);
1466             if (r < 0) {
1467                 error_setg_errno(errp, -r, "Failed to set busyloop timeout");
1468                 goto fail_busyloop;
1469             }
1470         }
1471     }
1472 
1473     hdev->features = features;
1474 
1475     hdev->memory_listener = (MemoryListener) {
1476         .name = "vhost",
1477         .begin = vhost_begin,
1478         .commit = vhost_commit,
1479         .region_add = vhost_region_addnop,
1480         .region_nop = vhost_region_addnop,
1481         .log_start = vhost_log_start,
1482         .log_stop = vhost_log_stop,
1483         .log_sync = vhost_log_sync,
1484         .log_global_start = vhost_log_global_start,
1485         .log_global_stop = vhost_log_global_stop,
1486         .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
1487     };
1488 
1489     hdev->iommu_listener = (MemoryListener) {
1490         .name = "vhost-iommu",
1491         .region_add = vhost_iommu_region_add,
1492         .region_del = vhost_iommu_region_del,
1493     };
1494 
1495     if (hdev->migration_blocker == NULL) {
1496         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1497             error_setg(&hdev->migration_blocker,
1498                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1499         } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
1500             error_setg(&hdev->migration_blocker,
1501                        "Migration disabled: failed to allocate shared memory");
1502         }
1503     }
1504 
1505     if (hdev->migration_blocker != NULL) {
1506         r = migrate_add_blocker(hdev->migration_blocker, errp);
1507         if (r < 0) {
1508             error_free(hdev->migration_blocker);
1509             goto fail_busyloop;
1510         }
1511     }
1512 
1513     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1514     hdev->n_mem_sections = 0;
1515     hdev->mem_sections = NULL;
1516     hdev->log = NULL;
1517     hdev->log_size = 0;
1518     hdev->log_enabled = false;
1519     hdev->started = false;
1520     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1521     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1522 
1523     /*
1524      * The listener we registered properly updated the corresponding counter.
1525      * So we can trust that these values are accurate.
1526      */
1527     if (hdev->vhost_ops->vhost_backend_no_private_memslots &&
1528         hdev->vhost_ops->vhost_backend_no_private_memslots(hdev)) {
1529         used = used_shared_memslots;
1530     } else {
1531         used = used_memslots;
1532     }
1533     /*
1534      * We assume that all reserved memslots actually require a real memslot
1535      * in our vhost backend. This might not be true, for example, if the
1536      * memslot would be ROM. If ever relevant, we can optimize for that --
1537      * but we'll need additional information about the reservations.
1538      */
1539     reserved = memory_devices_get_reserved_memslots();
1540     limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
1541     if (used + reserved > limit) {
1542         error_setg(errp, "vhost backend memory slots limit (%d) is less"
1543                    " than current number of used (%d) and reserved (%d)"
1544                    " memory slots for memory devices.", limit, used, reserved);
1545         r = -EINVAL;
1546         goto fail_busyloop;
1547     }
1548 
1549     return 0;
1550 
1551 fail_busyloop:
1552     if (busyloop_timeout) {
1553         while (--i >= 0) {
1554             vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1555         }
1556     }
1557 fail:
1558     hdev->nvqs = n_initialized_vqs;
1559     vhost_dev_cleanup(hdev);
1560     return r;
1561 }
1562 
1563 void vhost_dev_cleanup(struct vhost_dev *hdev)
1564 {
1565     int i;
1566 
1567     trace_vhost_dev_cleanup(hdev);
1568 
1569     for (i = 0; i < hdev->nvqs; ++i) {
1570         vhost_virtqueue_cleanup(hdev->vqs + i);
1571     }
1572     if (hdev->mem) {
1573         /* these are only safe to undo after a successful init */
1574         memory_listener_unregister(&hdev->memory_listener);
1575         QLIST_REMOVE(hdev, entry);
1576     }
1577     if (hdev->migration_blocker) {
1578         migrate_del_blocker(hdev->migration_blocker);
1579         error_free(hdev->migration_blocker);
1580     }
1581     g_free(hdev->mem);
1582     g_free(hdev->mem_sections);
1583     if (hdev->vhost_ops) {
1584         hdev->vhost_ops->vhost_backend_cleanup(hdev);
1585     }
1586     assert(!hdev->log);
1587 
1588     memset(hdev, 0, sizeof(struct vhost_dev));
1589 }
1590 
1591 static void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
1592                                              VirtIODevice *vdev,
1593                                              unsigned int nvqs)
1594 {
1595     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1596     int i, r;
1597 
1598     /*
1599      * Batch all the host notifiers in a single transaction to avoid
1600      * quadratic time complexity in address_space_update_ioeventfds().
1601      */
1602     memory_region_transaction_begin();
1603 
1604     for (i = 0; i < nvqs; ++i) {
1605         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1606                                          false);
1607         if (r < 0) {
1608             error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1609         }
1610         assert(r >= 0);
1611     }
1612 
1613     /*
1614      * The transaction expects the ioeventfds to still be open when it
1615      * commits, so commit now, before the cleanup loop closes them.
1616      */
1617     memory_region_transaction_commit();
1618 
1619     for (i = 0; i < nvqs; ++i) {
1620         virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1621     }
1622     virtio_device_release_ioeventfd(vdev);
1623 }
1624 
1625 /* Stop processing guest IO notifications in qemu.
1626  * Start processing them in the vhost backend instead.
1627  */
1628 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1629 {
1630     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1631     int i, r;
1632 
1633     /* We will pass the notifiers to the vhost backend; make sure that QEMU
1634      * doesn't interfere.
1635      */
1636     r = virtio_device_grab_ioeventfd(vdev);
1637     if (r < 0) {
1638         error_report("binding does not support host notifiers");
1639         return r;
1640     }
1641 
1642     /*
1643      * Batch all the host notifiers in a single transaction to avoid
1644      * quadratic time complexity in address_space_update_ioeventfds().
1645      */
1646     memory_region_transaction_begin();
1647 
1648     for (i = 0; i < hdev->nvqs; ++i) {
1649         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1650                                          true);
1651         if (r < 0) {
1652             error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1653             memory_region_transaction_commit();
1654             vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
1655             return r;
1656         }
1657     }
1658 
1659     memory_region_transaction_commit();
1660 
1661     return 0;
1662 }
1663 
1664 /* Stop processing guest IO notifications in vhost.
1665  * Start processing them in qemu.
1666  * This might actually run the qemu handlers right away,
1667  * so virtio in qemu must be completely set up when this is called.
1668  */
1669 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1670 {
1671     vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
1672 }
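
/*
 * Illustrative sketch (not part of the original file): the order in which a
 * device model typically combines the notifier helpers above with
 * vhost_dev_start()/vhost_dev_stop() defined later in this file.  Names
 * prefixed with "example_" are hypothetical, error handling is reduced to a
 * minimum, and "vrings" is assumed to be false here.
 */
static G_GNUC_UNUSED int example_vhost_start(struct vhost_dev *hdev,
                                             VirtIODevice *vdev)
{
    int r;

    /* Hand the ioeventfds over to vhost first... */
    r = vhost_dev_enable_notifiers(hdev, vdev);
    if (r < 0) {
        return r;
    }
    /* ...then start the backend (host notifiers must already be enabled). */
    r = vhost_dev_start(hdev, vdev, false);
    if (r < 0) {
        vhost_dev_disable_notifiers(hdev, vdev);
    }
    return r;
}

static G_GNUC_UNUSED void example_vhost_stop(struct vhost_dev *hdev,
                                             VirtIODevice *vdev)
{
    /* Stop the backend before taking the notifiers back. */
    vhost_dev_stop(hdev, vdev, false);
    vhost_dev_disable_notifiers(hdev, vdev);
}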
1673 
1674 /* Test and clear event pending status.
1675  * Should be called after unmasking to avoid losing events.
1676  */
1677 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1678 {
1679     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1680     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1681     return event_notifier_test_and_clear(&vq->masked_notifier);
1682 }
1683 
1684 /* Mask/unmask events from this vq. */
1685 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1686                          bool mask)
1687 {
1688     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1689     int r, index = n - hdev->vq_index;
1690     struct vhost_vring_file file;
1691 
1692     /* should only be called after backend is connected */
1693     assert(hdev->vhost_ops);
1694 
1695     if (mask) {
1696         assert(vdev->use_guest_notifier_mask);
1697         file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
1698     } else {
1699         file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
1700     }
1701 
1702     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1703     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1704     if (r < 0) {
1705         error_report("vhost_set_vring_call failed %d", -r);
1706     }
1707 }
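
/*
 * Illustrative sketch (not part of the original file): masking or unmasking
 * every virtqueue a vhost device owns, e.g. from a transport's guest
 * notifier mask callback.  Note that masking (mask == true) is only valid
 * when vdev->use_guest_notifier_mask is set, as asserted above.
 */
static G_GNUC_UNUSED void example_mask_all_vqs(struct vhost_dev *hdev,
                                               VirtIODevice *vdev, bool mask)
{
    int n;

    /* n is the device-global virtqueue number expected by the helpers. */
    for (n = hdev->vq_index; n < hdev->vq_index + hdev->nvqs; n++) {
        vhost_virtqueue_mask(hdev, vdev, n, mask);
    }
}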
1708 
1709 bool vhost_config_pending(struct vhost_dev *hdev)
1710 {
1711     assert(hdev->vhost_ops);
1712     if ((hdev->started == false) ||
1713         (hdev->vhost_ops->vhost_set_config_call == NULL)) {
1714         return false;
1715     }
1716 
1717     EventNotifier *notifier =
1718         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1719     return event_notifier_test_and_clear(notifier);
1720 }
1721 
1722 void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
1723 {
1724     int fd;
1725     int r;
1726     EventNotifier *notifier =
1727         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
1728     EventNotifier *config_notifier = &vdev->config_notifier;
1729     assert(hdev->vhost_ops);
1730 
1731     if ((hdev->started == false) ||
1732         (hdev->vhost_ops->vhost_set_config_call == NULL)) {
1733         return;
1734     }
1735     if (mask) {
1736         assert(vdev->use_guest_notifier_mask);
1737         fd = event_notifier_get_fd(notifier);
1738     } else {
1739         fd = event_notifier_get_fd(config_notifier);
1740     }
1741     r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
1742     if (r < 0) {
1743         error_report("vhost_set_config_call failed %d", -r);
1744     }
1745 }
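
/*
 * vhost_config_pending()/vhost_config_mask() mirror the per-virtqueue
 * helpers above for the configuration change interrupt: while masked, the
 * backend signals the masked_config_notifier kept in the
 * VHOST_QUEUE_NUM_CONFIG_INR slot; when unmasked, it signals the device's
 * config_notifier directly.  Both are no-ops for backends that do not
 * implement vhost_set_config_call.
 */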
1746 
1747 static void vhost_stop_config_intr(struct vhost_dev *dev)
1748 {
1749     int fd = -1;
1750     assert(dev->vhost_ops);
1751     if (dev->vhost_ops->vhost_set_config_call) {
1752         dev->vhost_ops->vhost_set_config_call(dev, fd);
1753     }
1754 }
1755 
1756 static void vhost_start_config_intr(struct vhost_dev *dev)
1757 {
1758     int r;
1759 
1760     assert(dev->vhost_ops);
1761     int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
1762     if (dev->vhost_ops->vhost_set_config_call) {
1763         r = dev->vhost_ops->vhost_set_config_call(dev, fd);
1764         if (!r) {
1765             event_notifier_set(&dev->vdev->config_notifier);
1766         }
1767     }
1768 }
1769 
1770 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1771                             uint64_t features)
1772 {
1773     const int *bit = feature_bits;
1774     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1775         uint64_t bit_mask = (1ULL << *bit);
1776         if (!(hdev->features & bit_mask)) {
1777             features &= ~bit_mask;
1778         }
1779         bit++;
1780     }
1781     return features;
1782 }
1783 
1784 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1785                         uint64_t features)
1786 {
1787     const int *bit = feature_bits;
1788     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1789         uint64_t bit_mask = (1ULL << *bit);
1790         if (features & bit_mask) {
1791             hdev->acked_features |= bit_mask;
1792         }
1793         bit++;
1794     }
1795 }
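
/*
 * Illustrative sketch (not part of the original file): the usual pairing of
 * vhost_get_features() and vhost_ack_features().  The feature table below is
 * a hypothetical example; real backends (vhost_net, vhost-user-blk, ...)
 * keep their own table terminated by VHOST_INVALID_FEATURE_BIT.
 */
static const int example_feature_bits[] = {
    VIRTIO_F_VERSION_1,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VHOST_INVALID_FEATURE_BIT
};

static G_GNUC_UNUSED uint64_t example_offer_features(struct vhost_dev *hdev,
                                                     uint64_t device_features)
{
    /* At feature-offer time: hide the bits the vhost backend cannot handle. */
    return vhost_get_features(hdev, example_feature_bits, device_features);
}

static G_GNUC_UNUSED void example_set_features(struct vhost_dev *hdev,
                                               uint64_t guest_features)
{
    /* Once the guest has written its features: remember the acked subset. */
    vhost_ack_features(hdev, example_feature_bits, guest_features);
}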
1796 
1797 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
1798                          uint32_t config_len, Error **errp)
1799 {
1800     assert(hdev->vhost_ops);
1801 
1802     if (hdev->vhost_ops->vhost_get_config) {
1803         return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
1804                                                  errp);
1805     }
1806 
1807     error_setg(errp, "vhost_get_config not implemented");
1808     return -ENOSYS;
1809 }
1810 
1811 int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1812                          uint32_t offset, uint32_t size, uint32_t flags)
1813 {
1814     assert(hdev->vhost_ops);
1815 
1816     if (hdev->vhost_ops->vhost_set_config) {
1817         return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1818                                                  size, flags);
1819     }
1820 
1821     return -ENOSYS;
1822 }
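
/*
 * Illustrative sketch (not part of the original file): reading the backend's
 * config space into a caller-provided buffer, following the Error **
 * convention used by vhost_dev_get_config().  The buffer size is arbitrary.
 */
static G_GNUC_UNUSED int example_read_config(struct vhost_dev *hdev)
{
    uint8_t buf[64];
    Error *local_err = NULL;
    int r = vhost_dev_get_config(hdev, buf, sizeof(buf), &local_err);

    if (local_err) {
        error_report_err(local_err);
    }
    return r;
}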
1823 
1824 void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
1825                                    const VhostDevConfigOps *ops)
1826 {
1827     hdev->config_ops = ops;
1828 }
1829 
1830 void vhost_dev_free_inflight(struct vhost_inflight *inflight)
1831 {
1832     if (inflight && inflight->addr) {
1833         qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
1834         inflight->addr = NULL;
1835         inflight->fd = -1;
1836     }
1837 }
1838 
1839 static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
1840                                      uint64_t new_size)
1841 {
1842     Error *err = NULL;
1843     int fd = -1;
1844     void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
1845                                   F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
1846                                   &fd, &err);
1847 
1848     if (err) {
1849         error_report_err(err);
1850         return -ENOMEM;
1851     }
1852 
1853     vhost_dev_free_inflight(inflight);
1854     inflight->offset = 0;
1855     inflight->addr = addr;
1856     inflight->fd = fd;
1857     inflight->size = new_size;
1858 
1859     return 0;
1860 }
1861 
1862 void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1863 {
1864     if (inflight->addr) {
1865         qemu_put_be64(f, inflight->size);
1866         qemu_put_be16(f, inflight->queue_size);
1867         qemu_put_buffer(f, inflight->addr, inflight->size);
1868     } else {
1869         qemu_put_be64(f, 0);
1870     }
1871 }
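
/*
 * The migration stream layout produced above and consumed by
 * vhost_dev_load_inflight() below is:
 *
 *   be64 size           0 when no inflight region was set up
 *   be16 queue_size     present only when size != 0
 *   u8   buf[size]      raw contents of the inflight region
 */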
1872 
1873 int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1874 {
1875     uint64_t size;
1876 
1877     size = qemu_get_be64(f);
1878     if (!size) {
1879         return 0;
1880     }
1881 
1882     if (inflight->size != size) {
1883         int ret = vhost_dev_resize_inflight(inflight, size);
1884         if (ret < 0) {
1885             return ret;
1886         }
1887     }
1888     inflight->queue_size = qemu_get_be16(f);
1889 
1890     qemu_get_buffer(f, inflight->addr, size);
1891 
1892     return 0;
1893 }
1894 
1895 int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
1896 {
1897     int r;
1898 
1899     if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
1900         hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
1901         return 0;
1902     }
1903 
1904     hdev->vdev = vdev;
1905 
1906     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1907     if (r < 0) {
1908         VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
1909         return r;
1910     }
1911 
1912     return 0;
1913 }
1914 
1915 int vhost_dev_set_inflight(struct vhost_dev *dev,
1916                            struct vhost_inflight *inflight)
1917 {
1918     int r;
1919 
1920     if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
1921         r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
1922         if (r) {
1923             VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
1924             return r;
1925         }
1926     }
1927 
1928     return 0;
1929 }
1930 
1931 int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
1932                            struct vhost_inflight *inflight)
1933 {
1934     int r;
1935 
1936     if (dev->vhost_ops->vhost_get_inflight_fd) {
1937         r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
1938         if (r) {
1939             VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
1940             return r;
1941         }
1942     }
1943 
1944     return 0;
1945 }
1946 
1947 static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
1948 {
1949     if (!hdev->vhost_ops->vhost_set_vring_enable) {
1950         return 0;
1951     }
1952 
1953     /*
1954      * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
1955      * been negotiated, the rings start directly in the enabled state, and
1956      * the .vhost_set_vring_enable callback would fail since
1957      * VHOST_USER_SET_VRING_ENABLE is not supported.
1958      */
1959     if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
1960         !virtio_has_feature(hdev->backend_features,
1961                             VHOST_USER_F_PROTOCOL_FEATURES)) {
1962         return 0;
1963     }
1964 
1965     return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
1966 }
1967 
1968 /* Host notifiers must be enabled at this point. */
1969 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
1970 {
1971     int i, r;
1972 
1973     /* should only be called after backend is connected */
1974     assert(hdev->vhost_ops);
1975 
1976     trace_vhost_dev_start(hdev, vdev->name, vrings);
1977 
1978     vdev->vhost_started = true;
1979     hdev->started = true;
1980     hdev->vdev = vdev;
1981 
1982     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1983     if (r < 0) {
1984         goto fail_features;
1985     }
1986 
1987     if (vhost_dev_has_iommu(hdev)) {
1988         memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
1989     }
1990 
1991     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1992     if (r < 0) {
1993         VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
1994         goto fail_mem;
1995     }
1996     for (i = 0; i < hdev->nvqs; ++i) {
1997         r = vhost_virtqueue_start(hdev,
1998                                   vdev,
1999                                   hdev->vqs + i,
2000                                   hdev->vq_index + i);
2001         if (r < 0) {
2002             goto fail_vq;
2003         }
2004     }
2005 
2006     r = event_notifier_init(
2007         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
2008     if (r < 0) {
2009         VHOST_OPS_DEBUG(r, "event_notifier_init failed");
2010         goto fail_vq;
2011     }
2012     event_notifier_test_and_clear(
2013         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
2014     if (!vdev->use_guest_notifier_mask) {
2015         vhost_config_mask(hdev, vdev, true);
2016     }
2017     if (hdev->log_enabled) {
2018         uint64_t log_base;
2019 
2020         hdev->log_size = vhost_get_log_size(hdev);
2021         hdev->log = vhost_log_get(hdev->log_size,
2022                                   vhost_dev_log_is_shared(hdev));
2023         log_base = (uintptr_t)hdev->log->log;
2024         r = hdev->vhost_ops->vhost_set_log_base(hdev,
2025                                                 hdev->log_size ? log_base : 0,
2026                                                 hdev->log);
2027         if (r < 0) {
2028             VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
2029             goto fail_log;
2030         }
2031     }
2032     if (vrings) {
2033         r = vhost_dev_set_vring_enable(hdev, true);
2034         if (r) {
2035             goto fail_log;
2036         }
2037     }
2038     if (hdev->vhost_ops->vhost_dev_start) {
2039         r = hdev->vhost_ops->vhost_dev_start(hdev, true);
2040         if (r) {
2041             goto fail_start;
2042         }
2043     }
2044     if (vhost_dev_has_iommu(hdev) &&
2045         hdev->vhost_ops->vhost_set_iotlb_callback) {
2046         hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
2047 
2048         /* Update used ring information for the IOTLB to work correctly;
2049          * the vhost-kernel code requires this. */
2050         for (i = 0; i < hdev->nvqs; ++i) {
2051             struct vhost_virtqueue *vq = hdev->vqs + i;
2052             vhost_device_iotlb_miss(hdev, vq->used_phys, true);
2053         }
2054     }
2055     vhost_start_config_intr(hdev);
2056     return 0;
2057 fail_start:
2058     if (vrings) {
2059         vhost_dev_set_vring_enable(hdev, false);
2060     }
2061 fail_log:
2062     vhost_log_put(hdev, false);
2063 fail_vq:
2064     while (--i >= 0) {
2065         vhost_virtqueue_stop(hdev,
2066                              vdev,
2067                              hdev->vqs + i,
2068                              hdev->vq_index + i);
2069     }
2070 
2071 fail_mem:
2072     if (vhost_dev_has_iommu(hdev)) {
2073         memory_listener_unregister(&hdev->iommu_listener);
2074     }
2075 fail_features:
2076     vdev->vhost_started = false;
2077     hdev->started = false;
2078     return r;
2079 }
2080 
2081 /* Host notifiers must be enabled at this point. */
2082 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
2083 {
2084     int i;
2085 
2086     /* should only be called after backend is connected */
2087     assert(hdev->vhost_ops);
2088     event_notifier_test_and_clear(
2089         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
2090     event_notifier_test_and_clear(&vdev->config_notifier);
2091     event_notifier_cleanup(
2092         &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
2093 
2094     trace_vhost_dev_stop(hdev, vdev->name, vrings);
2095 
2096     if (hdev->vhost_ops->vhost_dev_start) {
2097         hdev->vhost_ops->vhost_dev_start(hdev, false);
2098     }
2099     if (vrings) {
2100         vhost_dev_set_vring_enable(hdev, false);
2101     }
2102     for (i = 0; i < hdev->nvqs; ++i) {
2103         vhost_virtqueue_stop(hdev,
2104                              vdev,
2105                              hdev->vqs + i,
2106                              hdev->vq_index + i);
2107     }
2108     if (hdev->vhost_ops->vhost_reset_status) {
2109         hdev->vhost_ops->vhost_reset_status(hdev);
2110     }
2111 
2112     if (vhost_dev_has_iommu(hdev)) {
2113         if (hdev->vhost_ops->vhost_set_iotlb_callback) {
2114             hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
2115         }
2116         memory_listener_unregister(&hdev->iommu_listener);
2117     }
2118     vhost_stop_config_intr(hdev);
2119     vhost_log_put(hdev, true);
2120     hdev->started = false;
2121     vdev->vhost_started = false;
2122     hdev->vdev = NULL;
2123 }
2124 
2125 int vhost_net_set_backend(struct vhost_dev *hdev,
2126                           struct vhost_vring_file *file)
2127 {
2128     if (hdev->vhost_ops->vhost_net_set_backend) {
2129         return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
2130     }
2131 
2132     return -ENOSYS;
2133 }
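
/*
 * Illustrative sketch (not part of the original file): attaching a backend
 * file descriptor (e.g. a TAP fd for vhost-net) to every queue the vhost
 * device owns.  Passing fd == -1 detaches the backend again.
 */
static G_GNUC_UNUSED int example_attach_net_backend(struct vhost_dev *hdev,
                                                    int backend_fd)
{
    struct vhost_vring_file file = { .fd = backend_fd };
    int i, r;

    for (i = 0; i < hdev->nvqs; i++) {
        file.index = i;    /* ring index local to this vhost device */
        r = vhost_net_set_backend(hdev, &file);
        if (r < 0) {
            return r;
        }
    }
    return 0;
}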
2134