xref: /openbmc/qemu/hw/virtio/vhost.c (revision 64552b6b)
1 /*
2  * vhost support
3  *
4  * Copyright Red Hat, Inc. 2010
5  *
6  * Authors:
7  *  Michael S. Tsirkin <mst@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  *
12  * Contributions after 2012-01-13 are licensed under the terms of the
13  * GNU GPL, version 2 or (at your option) any later version.
14  */
15 
16 #include "qemu/osdep.h"
17 #include "qapi/error.h"
18 #include "hw/virtio/vhost.h"
19 #include "hw/hw.h"
20 #include "qemu/atomic.h"
21 #include "qemu/range.h"
22 #include "qemu/error-report.h"
23 #include "qemu/memfd.h"
24 #include "standard-headers/linux/vhost_types.h"
25 #include "exec/address-spaces.h"
26 #include "hw/virtio/virtio-bus.h"
27 #include "hw/virtio/virtio-access.h"
28 #include "migration/blocker.h"
29 #include "migration/qemu-file-types.h"
30 #include "sysemu/dma.h"
31 #include "trace.h"
32 
33 /* enabled until disconnected backend stabilizes */
34 #define _VHOST_DEBUG 1
35 
36 #ifdef _VHOST_DEBUG
37 #define VHOST_OPS_DEBUG(fmt, ...) \
38     do { error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
39                       strerror(errno), errno); } while (0)
40 #else
41 #define VHOST_OPS_DEBUG(fmt, ...) \
42     do { } while (0)
43 #endif
44 
45 static struct vhost_log *vhost_log;
46 static struct vhost_log *vhost_log_shm;
47 
48 static unsigned int used_memslots;
49 static QLIST_HEAD(, vhost_dev) vhost_devices =
50     QLIST_HEAD_INITIALIZER(vhost_devices);
51 
52 bool vhost_has_free_slot(void)
53 {
54     unsigned int slots_limit = ~0U;
55     struct vhost_dev *hdev;
56 
57     QLIST_FOREACH(hdev, &vhost_devices, entry) {
58         unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
59         slots_limit = MIN(slots_limit, r);
60     }
61     return slots_limit > used_memslots;
62 }
63 
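/*
 * Walk the dirty-log chunks covering the intersection of the section range
 * [mfirst, mlast] and the region range [rfirst, rlast], atomically consume
 * the set bits and mark the corresponding guest pages dirty in the
 * MemoryRegion, so migration sees pages written by the backend.
 */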
64 static void vhost_dev_sync_region(struct vhost_dev *dev,
65                                   MemoryRegionSection *section,
66                                   uint64_t mfirst, uint64_t mlast,
67                                   uint64_t rfirst, uint64_t rlast)
68 {
69     vhost_log_chunk_t *log = dev->log->log;
70 
71     uint64_t start = MAX(mfirst, rfirst);
72     uint64_t end = MIN(mlast, rlast);
73     vhost_log_chunk_t *from = log + start / VHOST_LOG_CHUNK;
74     vhost_log_chunk_t *to = log + end / VHOST_LOG_CHUNK + 1;
75     uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
76 
77     if (end < start) {
78         return;
79     }
80     assert(end / VHOST_LOG_CHUNK < dev->log_size);
81     assert(start / VHOST_LOG_CHUNK < dev->log_size);
82 
83     for (; from < to; ++from) {
84         vhost_log_chunk_t log;
85         /* We first check with non-atomic: much cheaper,
86          * and we expect non-dirty to be the common case. */
87         if (!*from) {
88             addr += VHOST_LOG_CHUNK;
89             continue;
90         }
91         /* Data must be read atomically. We don't really need barrier semantics
92          * but it's easier to use atomic_* than roll our own. */
93         log = atomic_xchg(from, 0);
94         while (log) {
95             int bit = ctzl(log);
96             hwaddr page_addr;
97             hwaddr section_offset;
98             hwaddr mr_offset;
99             page_addr = addr + bit * VHOST_LOG_PAGE;
100             section_offset = page_addr - section->offset_within_address_space;
101             mr_offset = section_offset + section->offset_within_region;
102             memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
103             log &= ~(0x1ull << bit);
104         }
105         addr += VHOST_LOG_CHUNK;
106     }
107 }
108 
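/*
 * Sync the backend's dirty log into QEMU's dirty bitmaps for one
 * MemoryRegionSection, clipped to [first, last]: once against every region
 * in the vhost memory table and once against every used ring, since the
 * backend also logs its own used-ring writes.
 */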
109 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
110                                    MemoryRegionSection *section,
111                                    hwaddr first,
112                                    hwaddr last)
113 {
114     int i;
115     hwaddr start_addr;
116     hwaddr end_addr;
117 
118     if (!dev->log_enabled || !dev->started) {
119         return 0;
120     }
121     start_addr = section->offset_within_address_space;
122     end_addr = range_get_last(start_addr, int128_get64(section->size));
123     start_addr = MAX(first, start_addr);
124     end_addr = MIN(last, end_addr);
125 
126     for (i = 0; i < dev->mem->nregions; ++i) {
127         struct vhost_memory_region *reg = dev->mem->regions + i;
128         vhost_dev_sync_region(dev, section, start_addr, end_addr,
129                               reg->guest_phys_addr,
130                               range_get_last(reg->guest_phys_addr,
131                                              reg->memory_size));
132     }
133     for (i = 0; i < dev->nvqs; ++i) {
134         struct vhost_virtqueue *vq = dev->vqs + i;
135 
136         if (!vq->used_phys && !vq->used_size) {
137             continue;
138         }
139 
140         vhost_dev_sync_region(dev, section, start_addr, end_addr, vq->used_phys,
141                               range_get_last(vq->used_phys, vq->used_size));
142     }
143     return 0;
144 }
145 
146 static void vhost_log_sync(MemoryListener *listener,
147                           MemoryRegionSection *section)
148 {
149     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
150                                          memory_listener);
151     vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
152 }
153 
154 static void vhost_log_sync_range(struct vhost_dev *dev,
155                                  hwaddr first, hwaddr last)
156 {
157     int i;
158     /* FIXME: this is N^2 in number of sections */
159     for (i = 0; i < dev->n_mem_sections; ++i) {
160         MemoryRegionSection *section = &dev->mem_sections[i];
161         vhost_sync_dirty_bitmap(dev, section, first, last);
162     }
163 }
164 
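/*
 * Compute the dirty-log size (in chunks) needed to cover the highest guest
 * physical address of any memory region and of any used ring.
 */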
165 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
166 {
167     uint64_t log_size = 0;
168     int i;
169     for (i = 0; i < dev->mem->nregions; ++i) {
170         struct vhost_memory_region *reg = dev->mem->regions + i;
171         uint64_t last = range_get_last(reg->guest_phys_addr,
172                                        reg->memory_size);
173         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
174     }
175     for (i = 0; i < dev->nvqs; ++i) {
176         struct vhost_virtqueue *vq = dev->vqs + i;
177 
178         if (!vq->used_phys && !vq->used_size) {
179             continue;
180         }
181 
182         uint64_t last = vq->used_phys + vq->used_size - 1;
183         log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
184     }
185     return log_size;
186 }
187 
188 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
189 {
190     Error *err = NULL;
191     struct vhost_log *log;
192     uint64_t logsize = size * sizeof(*(log->log));
193     int fd = -1;
194 
195     log = g_new0(struct vhost_log, 1);
196     if (share) {
197         log->log = qemu_memfd_alloc("vhost-log", logsize,
198                                     F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
199                                     &fd, &err);
200         if (err) {
201             error_report_err(err);
202             g_free(log);
203             return NULL;
204         }
205         memset(log->log, 0, logsize);
206     } else {
207         log->log = g_malloc0(logsize);
208     }
209 
210     log->size = size;
211     log->refcnt = 1;
212     log->fd = fd;
213 
214     return log;
215 }
216 
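/*
 * Return the global log (memfd-backed when shared, anonymous otherwise),
 * reusing the cached one when its size matches and allocating a new one
 * otherwise. The log is reference counted; callers drop it via
 * vhost_log_put().
 */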
217 static struct vhost_log *vhost_log_get(uint64_t size, bool share)
218 {
219     struct vhost_log *log = share ? vhost_log_shm : vhost_log;
220 
221     if (!log || log->size != size) {
222         log = vhost_log_alloc(size, share);
223         if (share) {
224             vhost_log_shm = log;
225         } else {
226             vhost_log = log;
227         }
228     } else {
229         ++log->refcnt;
230     }
231 
232     return log;
233 }
234 
235 static void vhost_log_put(struct vhost_dev *dev, bool sync)
236 {
237     struct vhost_log *log = dev->log;
238 
239     if (!log) {
240         return;
241     }
242 
243     --log->refcnt;
244     if (log->refcnt == 0) {
245         /* Sync only the range covered by the old log */
246         if (dev->log_size && sync) {
247             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
248         }
249 
250         if (vhost_log == log) {
251             g_free(log->log);
252             vhost_log = NULL;
253         } else if (vhost_log_shm == log) {
254             qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
255                             log->fd);
256             vhost_log_shm = NULL;
257         }
258 
259         g_free(log);
260     }
261 
262     dev->log = NULL;
263     dev->log_size = 0;
264 }
265 
266 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
267 {
268     return dev->vhost_ops->vhost_requires_shm_log &&
269            dev->vhost_ops->vhost_requires_shm_log(dev);
270 }
271 
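/*
 * Switch the device to a log of the given size: tell the backend about the
 * new log base first, then drop (and sync) the old log, so no writes are
 * lost across the switch.
 */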
272 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
273 {
274     struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
275     uint64_t log_base = (uintptr_t)log->log;
276     int r;
277 
278     /* Inform the backend of the log switch; this must be done before
279        releasing the current log, to ensure no logging is lost */
280     r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
281     if (r < 0) {
282         VHOST_OPS_DEBUG("vhost_set_log_base failed");
283     }
284 
285     vhost_log_put(dev, true);
286     dev->log = log;
287     dev->log_size = size;
288 }
289 
290 static int vhost_dev_has_iommu(struct vhost_dev *dev)
291 {
292     VirtIODevice *vdev = dev->vdev;
293 
294     return virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
295 }
296 
297 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
298                               hwaddr *plen, int is_write)
299 {
300     if (!vhost_dev_has_iommu(dev)) {
301         return cpu_physical_memory_map(addr, plen, is_write);
302     } else {
303         return (void *)(uintptr_t)addr;
304     }
305 }
306 
307 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
308                                hwaddr len, int is_write,
309                                hwaddr access_len)
310 {
311     if (!vhost_dev_has_iommu(dev)) {
312         cpu_physical_memory_unmap(buffer, len, is_write, access_len);
313     }
314 }
315 
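/*
 * Check one ring part against an updated memory region: if the part overlaps
 * the region it must be fully contained in it and still mapped at the same
 * host address, otherwise the backend's view of the ring is stale.
 */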
316 static int vhost_verify_ring_part_mapping(void *ring_hva,
317                                           uint64_t ring_gpa,
318                                           uint64_t ring_size,
319                                           void *reg_hva,
320                                           uint64_t reg_gpa,
321                                           uint64_t reg_size)
322 {
323     uint64_t hva_ring_offset;
324     uint64_t ring_last = range_get_last(ring_gpa, ring_size);
325     uint64_t reg_last = range_get_last(reg_gpa, reg_size);
326 
327     if (ring_last < reg_gpa || ring_gpa > reg_last) {
328         return 0;
329     }
330     /* check that the whole ring is mapped */
331     if (ring_last > reg_last) {
332         return -ENOMEM;
333     }
334     /* check that ring's MemoryRegion wasn't replaced */
335     hva_ring_offset = ring_gpa - reg_gpa;
336     if (ring_hva != reg_hva + hva_ring_offset) {
337         return -EBUSY;
338     }
339 
340     return 0;
341 }
342 
343 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
344                                       void *reg_hva,
345                                       uint64_t reg_gpa,
346                                       uint64_t reg_size)
347 {
348     int i, j;
349     int r = 0;
350     const char *part_name[] = {
351         "descriptor table",
352         "available ring",
353         "used ring"
354     };
355 
356     if (vhost_dev_has_iommu(dev)) {
357         return 0;
358     }
359 
360     for (i = 0; i < dev->nvqs; ++i) {
361         struct vhost_virtqueue *vq = dev->vqs + i;
362 
363         if (vq->desc_phys == 0) {
364             continue;
365         }
366 
367         j = 0;
368         r = vhost_verify_ring_part_mapping(
369                 vq->desc, vq->desc_phys, vq->desc_size,
370                 reg_hva, reg_gpa, reg_size);
371         if (r) {
372             break;
373         }
374 
375         j++;
376         r = vhost_verify_ring_part_mapping(
377                 vq->avail, vq->avail_phys, vq->avail_size,
378                 reg_hva, reg_gpa, reg_size);
379         if (r) {
380             break;
381         }
382 
383         j++;
384         r = vhost_verify_ring_part_mapping(
385                 vq->used, vq->used_phys, vq->used_size,
386                 reg_hva, reg_gpa, reg_size);
387         if (r) {
388             break;
389         }
390     }
391 
392     if (r == -ENOMEM) {
393         error_report("Unable to map %s for ring %d", part_name[j], i);
394     } else if (r == -EBUSY) {
395         error_report("%s relocated for ring %d", part_name[j], i);
396     }
397     return r;
398 }
399 
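/*
 * Decide whether a memory section should be exposed to the vhost backend:
 * plain RAM only, nothing with extra dirty tracking beyond migration
 * (e.g. VGA), and anything rejected by the backend's own section filter
 * is skipped.
 */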
400 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
401 {
402     bool result;
403     bool log_dirty = memory_region_get_dirty_log_mask(section->mr) &
404                      ~(1 << DIRTY_MEMORY_MIGRATION);
405     result = memory_region_is_ram(section->mr) &&
406         !memory_region_is_rom(section->mr);
407 
408     /* Vhost doesn't handle any block which is doing dirty-tracking other
409      * than migration; this typically fires on VGA areas.
410      */
411     result &= !log_dirty;
412 
413     if (result && dev->vhost_ops->vhost_backend_mem_section_filter) {
414         result &=
415             dev->vhost_ops->vhost_backend_mem_section_filter(dev, section);
416     }
417 
418     trace_vhost_section(section->mr->name, result);
419     return result;
420 }
421 
422 static void vhost_begin(MemoryListener *listener)
423 {
424     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
425                                          memory_listener);
426     dev->tmp_sections = NULL;
427     dev->n_tmp_sections = 0;
428 }
429 
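/*
 * End of a memory-listener transaction: swap in the section list built by
 * vhost_region_addnop() and, if it changed, rebuild the vhost memory table,
 * re-verify the ring mappings and push the new table to the backend,
 * resizing the dirty log around the update when logging is enabled.
 */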
430 static void vhost_commit(MemoryListener *listener)
431 {
432     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
433                                          memory_listener);
434     MemoryRegionSection *old_sections;
435     int n_old_sections;
436     uint64_t log_size;
437     size_t regions_size;
438     int r;
439     int i;
440     bool changed = false;
441 
442     /* Note we can be called before the device is started, but then
443      * starting the device calls set_mem_table, so we need to have
444      * built the data structures.
445      */
446     old_sections = dev->mem_sections;
447     n_old_sections = dev->n_mem_sections;
448     dev->mem_sections = dev->tmp_sections;
449     dev->n_mem_sections = dev->n_tmp_sections;
450 
451     if (dev->n_mem_sections != n_old_sections) {
452         changed = true;
453     } else {
454         /* Same size, let's check the contents */
455         changed = n_old_sections && memcmp(dev->mem_sections, old_sections,
456                          n_old_sections * sizeof(old_sections[0])) != 0;
457     }
458 
459     trace_vhost_commit(dev->started, changed);
460     if (!changed) {
461         goto out;
462     }
463 
464     /* Rebuild the regions list from the new sections list */
465     regions_size = offsetof(struct vhost_memory, regions) +
466                        dev->n_mem_sections * sizeof dev->mem->regions[0];
467     dev->mem = g_realloc(dev->mem, regions_size);
468     dev->mem->nregions = dev->n_mem_sections;
469     used_memslots = dev->mem->nregions;
470     for (i = 0; i < dev->n_mem_sections; i++) {
471         struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
472         struct MemoryRegionSection *mrs = dev->mem_sections + i;
473 
474         cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
475         cur_vmr->memory_size     = int128_get64(mrs->size);
476         cur_vmr->userspace_addr  =
477             (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
478             mrs->offset_within_region;
479         cur_vmr->flags_padding   = 0;
480     }
481 
482     if (!dev->started) {
483         goto out;
484     }
485 
486     for (i = 0; i < dev->mem->nregions; i++) {
487         if (vhost_verify_ring_mappings(dev,
488                        (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
489                        dev->mem->regions[i].guest_phys_addr,
490                        dev->mem->regions[i].memory_size)) {
491             error_report("Verify ring failure on region %d", i);
492             abort();
493         }
494     }
495 
496     if (!dev->log_enabled) {
497         r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
498         if (r < 0) {
499             VHOST_OPS_DEBUG("vhost_set_mem_table failed");
500         }
501         goto out;
502     }
503     log_size = vhost_get_log_size(dev);
504     /* We allocate an extra 4K bytes of log space
505      * to reduce the number of reallocations. */
506 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
507     /* To log more, must increase log size before table update. */
508     if (dev->log_size < log_size) {
509         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
510     }
511     r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
512     if (r < 0) {
513         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
514     }
515     /* To log less, can only decrease log size after table update. */
516     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
517         vhost_dev_log_resize(dev, log_size);
518     }
519 
520 out:
521     /* Unref the old list of sections; this must happen _after_ the
522      * vhost_set_mem_table call to ensure the client isn't still using the
523      * sections we're about to unref.
524      */
525     while (n_old_sections--) {
526         memory_region_unref(old_sections[n_old_sections].mr);
527     }
528     g_free(old_sections);
529     return;
530 }
531 
532 /* Adds the section data to the tmp_sections structure.
533  * It relies on the listener calling us in memory-address order,
534  * and on being called for each region (via the _add and _nop methods),
535  * so that neighbouring sections can be joined.
536  */
537 static void vhost_region_add_section(struct vhost_dev *dev,
538                                      MemoryRegionSection *section)
539 {
540     bool need_add = true;
541     uint64_t mrs_size = int128_get64(section->size);
542     uint64_t mrs_gpa = section->offset_within_address_space;
543     uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
544                          section->offset_within_region;
545     RAMBlock *mrs_rb = section->mr->ram_block;
546     size_t mrs_page = qemu_ram_pagesize(mrs_rb);
547 
548     trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
549                                    mrs_host);
550 
551     /* Round the section to its page size */
552     /* First align the start down to a page boundary */
553     uint64_t alignage = mrs_host & (mrs_page - 1);
554     if (alignage) {
555         mrs_host -= alignage;
556         mrs_size += alignage;
557         mrs_gpa  -= alignage;
558     }
559     /* Now align the size up to a page boundary */
560     alignage = mrs_size & (mrs_page - 1);
561     if (alignage) {
562         mrs_size += mrs_page - alignage;
563     }
564     trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa, mrs_size,
565                                            mrs_host);
566 
567     if (dev->n_tmp_sections) {
568         /* Since we already have at least one section, let's see if
569          * this extends it; since we're scanning in order, we only
570          * have to look at the last one, and the FlatView that calls
571          * us shouldn't have overlaps.
572          */
573         MemoryRegionSection *prev_sec = dev->tmp_sections +
574                                                (dev->n_tmp_sections - 1);
575         uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
576         uint64_t prev_size = int128_get64(prev_sec->size);
577         uint64_t prev_gpa_end   = range_get_last(prev_gpa_start, prev_size);
578         uint64_t prev_host_start =
579                         (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
580                         prev_sec->offset_within_region;
581         uint64_t prev_host_end   = range_get_last(prev_host_start, prev_size);
582 
583         if (mrs_gpa <= (prev_gpa_end + 1)) {
584             /* OK, looks like overlapping/intersecting - it's possible that
585              * the rounding to page sizes has made them overlap, but they should
586              * match up in the same RAMBlock if they do.
587              */
588             if (mrs_gpa < prev_gpa_start) {
589                 error_report("%s: Section rounded to %"PRIx64
590                              " prior to previous %"PRIx64,
591                              __func__, mrs_gpa, prev_gpa_start);
592                 /* A way to cleanly fail here would be better */
593                 return;
594             }
595             /* Offset from the start of the previous GPA to this GPA */
596             size_t offset = mrs_gpa - prev_gpa_start;
597 
598             if (prev_host_start + offset == mrs_host &&
599                 section->mr == prev_sec->mr &&
600                 (!dev->vhost_ops->vhost_backend_can_merge ||
601                  dev->vhost_ops->vhost_backend_can_merge(dev,
602                     mrs_host, mrs_size,
603                     prev_host_start, prev_size))) {
604                 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
605                 need_add = false;
606                 prev_sec->offset_within_address_space =
607                     MIN(prev_gpa_start, mrs_gpa);
608                 prev_sec->offset_within_region =
609                     MIN(prev_host_start, mrs_host) -
610                     (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
611                 prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
612                                                mrs_host));
613                 trace_vhost_region_add_section_merge(section->mr->name,
614                                         int128_get64(prev_sec->size),
615                                         prev_sec->offset_within_address_space,
616                                         prev_sec->offset_within_region);
617             } else {
618                 /* adjoining regions are fine, but overlapping ones with
619                  * different blocks/offsets shouldn't happen
620                  */
621                 if (mrs_gpa != prev_gpa_end + 1) {
622                     error_report("%s: Overlapping but not coherent sections "
623                                  "at %"PRIx64,
624                                  __func__, mrs_gpa);
625                     return;
626                 }
627             }
628         }
629     }
630 
631     if (need_add) {
632         ++dev->n_tmp_sections;
633         dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
634                                     dev->n_tmp_sections);
635         dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
636         /* The flatview isn't stable and we don't use it; making it NULL
637          * means we can memcmp the list.
638          */
639         dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
640         memory_region_ref(section->mr);
641     }
642 }
643 
644 /* Used for both add and nop callbacks */
645 static void vhost_region_addnop(MemoryListener *listener,
646                                 MemoryRegionSection *section)
647 {
648     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
649                                          memory_listener);
650 
651     if (!vhost_section(dev, section)) {
652         return;
653     }
654     vhost_region_add_section(dev, section);
655 }
656 
657 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
658 {
659     struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
660     struct vhost_dev *hdev = iommu->hdev;
661     hwaddr iova = iotlb->iova + iommu->iommu_offset;
662 
663     if (vhost_backend_invalidate_device_iotlb(hdev, iova,
664                                               iotlb->addr_mask + 1)) {
665         error_report("Fail to invalidate device iotlb");
666     }
667 }
668 
669 static void vhost_iommu_region_add(MemoryListener *listener,
670                                    MemoryRegionSection *section)
671 {
672     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
673                                          iommu_listener);
674     struct vhost_iommu *iommu;
675     Int128 end;
676     int iommu_idx;
677     IOMMUMemoryRegion *iommu_mr;
678 
679     if (!memory_region_is_iommu(section->mr)) {
680         return;
681     }
682 
683     iommu_mr = IOMMU_MEMORY_REGION(section->mr);
684 
685     iommu = g_malloc0(sizeof(*iommu));
686     end = int128_add(int128_make64(section->offset_within_region),
687                      section->size);
688     end = int128_sub(end, int128_one());
689     iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
690                                                    MEMTXATTRS_UNSPECIFIED);
691     iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
692                         IOMMU_NOTIFIER_UNMAP,
693                         section->offset_within_region,
694                         int128_get64(end),
695                         iommu_idx);
696     iommu->mr = section->mr;
697     iommu->iommu_offset = section->offset_within_address_space -
698                           section->offset_within_region;
699     iommu->hdev = dev;
700     memory_region_register_iommu_notifier(section->mr, &iommu->n);
701     QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
702     /* TODO: can replay help performance here? */
703 }
704 
705 static void vhost_iommu_region_del(MemoryListener *listener,
706                                    MemoryRegionSection *section)
707 {
708     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
709                                          iommu_listener);
710     struct vhost_iommu *iommu;
711 
712     if (!memory_region_is_iommu(section->mr)) {
713         return;
714     }
715 
716     QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
717         if (iommu->mr == section->mr &&
718             iommu->n.start == section->offset_within_region) {
719             memory_region_unregister_iommu_notifier(iommu->mr,
720                                                     &iommu->n);
721             QLIST_REMOVE(iommu, iommu_next);
722             g_free(iommu);
723             break;
724         }
725     }
726 }
727 
728 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
729                                     struct vhost_virtqueue *vq,
730                                     unsigned idx, bool enable_log)
731 {
732     struct vhost_vring_addr addr = {
733         .index = idx,
734         .desc_user_addr = (uint64_t)(unsigned long)vq->desc,
735         .avail_user_addr = (uint64_t)(unsigned long)vq->avail,
736         .used_user_addr = (uint64_t)(unsigned long)vq->used,
737         .log_guest_addr = vq->used_phys,
738         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
739     };
740     int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
741     if (r < 0) {
742         VHOST_OPS_DEBUG("vhost_set_vring_addr failed");
743         return -errno;
744     }
745     return 0;
746 }
747 
748 static int vhost_dev_set_features(struct vhost_dev *dev,
749                                   bool enable_log)
750 {
751     uint64_t features = dev->acked_features;
752     int r;
753     if (enable_log) {
754         features |= 0x1ULL << VHOST_F_LOG_ALL;
755     }
756     r = dev->vhost_ops->vhost_set_features(dev, features);
757     if (r < 0) {
758         VHOST_OPS_DEBUG("vhost_set_features failed");
759     }
760     return r < 0 ? -errno : 0;
761 }
762 
763 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
764 {
765     int r, i, idx;
766     r = vhost_dev_set_features(dev, enable_log);
767     if (r < 0) {
768         goto err_features;
769     }
770     for (i = 0; i < dev->nvqs; ++i) {
771         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
772         r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
773                                      enable_log);
774         if (r < 0) {
775             goto err_vq;
776         }
777     }
778     return 0;
779 err_vq:
780     for (; i >= 0; --i) {
781         idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
782         vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
783                                  dev->log_enabled);
784     }
785     vhost_dev_set_features(dev, dev->log_enabled);
786 err_features:
787     return r;
788 }
789 
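/*
 * Enable or disable dirty logging in the backend, allocating or releasing
 * the log as needed; called when global dirty logging (typically migration)
 * starts or stops.
 */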
790 static int vhost_migration_log(MemoryListener *listener, int enable)
791 {
792     struct vhost_dev *dev = container_of(listener, struct vhost_dev,
793                                          memory_listener);
794     int r;
795     if (!!enable == dev->log_enabled) {
796         return 0;
797     }
798     if (!dev->started) {
799         dev->log_enabled = enable;
800         return 0;
801     }
802     if (!enable) {
803         r = vhost_dev_set_log(dev, false);
804         if (r < 0) {
805             return r;
806         }
807         vhost_log_put(dev, false);
808     } else {
809         vhost_dev_log_resize(dev, vhost_get_log_size(dev));
810         r = vhost_dev_set_log(dev, true);
811         if (r < 0) {
812             return r;
813         }
814     }
815     dev->log_enabled = enable;
816     return 0;
817 }
818 
819 static void vhost_log_global_start(MemoryListener *listener)
820 {
821     int r;
822 
823     r = vhost_migration_log(listener, true);
824     if (r < 0) {
825         abort();
826     }
827 }
828 
829 static void vhost_log_global_stop(MemoryListener *listener)
830 {
831     int r;
832 
833     r = vhost_migration_log(listener, false);
834     if (r < 0) {
835         abort();
836     }
837 }
838 
839 static void vhost_log_start(MemoryListener *listener,
840                             MemoryRegionSection *section,
841                             int old, int new)
842 {
843     /* FIXME: implement */
844 }
845 
846 static void vhost_log_stop(MemoryListener *listener,
847                            MemoryRegionSection *section,
848                            int old, int new)
849 {
850     /* FIXME: implement */
851 }
852 
853 /* The vhost driver natively knows how to handle the vrings of
854  * non-cross-endian legacy devices and modern devices. Only legacy devices
855  * exposed to a bi-endian guest may require the vhost driver to use a
856  * specific endianness.
857  */
858 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
859 {
860     if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
861         return false;
862     }
863 #ifdef HOST_WORDS_BIGENDIAN
864     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
865 #else
866     return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
867 #endif
868 }
869 
870 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
871                                                    bool is_big_endian,
872                                                    int vhost_vq_index)
873 {
874     struct vhost_vring_state s = {
875         .index = vhost_vq_index,
876         .num = is_big_endian
877     };
878 
879     if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
880         return 0;
881     }
882 
883     VHOST_OPS_DEBUG("vhost_set_vring_endian failed");
884     if (errno == ENOTTY) {
885         error_report("vhost does not support cross-endian");
886         return -ENOSYS;
887     }
888 
889     return -errno;
890 }
891 
892 static int vhost_memory_region_lookup(struct vhost_dev *hdev,
893                                       uint64_t gpa, uint64_t *uaddr,
894                                       uint64_t *len)
895 {
896     int i;
897 
898     for (i = 0; i < hdev->mem->nregions; i++) {
899         struct vhost_memory_region *reg = hdev->mem->regions + i;
900 
901         if (gpa >= reg->guest_phys_addr &&
902             reg->guest_phys_addr + reg->memory_size > gpa) {
903             *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
904             *len = reg->guest_phys_addr + reg->memory_size - gpa;
905             return 0;
906         }
907     }
908 
909     return -EFAULT;
910 }
911 
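/*
 * Handle an IOTLB miss reported by the backend: translate the IOVA through
 * the device's DMA address space, map the result to a vhost userspace
 * address and push the resulting entry into the backend's device IOTLB.
 */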
912 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
913 {
914     IOMMUTLBEntry iotlb;
915     uint64_t uaddr, len;
916     int ret = -EFAULT;
917 
918     rcu_read_lock();
919 
920     trace_vhost_iotlb_miss(dev, 1);
921 
922     iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
923                                           iova, write,
924                                           MEMTXATTRS_UNSPECIFIED);
925     if (iotlb.target_as != NULL) {
926         ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
927                                          &uaddr, &len);
928         if (ret) {
929             trace_vhost_iotlb_miss(dev, 3);
930             error_report("Fail to lookup the translated address "
931                          "%"PRIx64, iotlb.translated_addr);
932             goto out;
933         }
934 
935         len = MIN(iotlb.addr_mask + 1, len);
936         iova = iova & ~iotlb.addr_mask;
937 
938         ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
939                                                 len, iotlb.perm);
940         if (ret) {
941             trace_vhost_iotlb_miss(dev, 4);
942             error_report("Fail to update device iotlb");
943             goto out;
944         }
945     }
946 
947     trace_vhost_iotlb_miss(dev, 2);
948 
949 out:
950     rcu_read_unlock();
951 
952     return ret;
953 }
954 
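/*
 * Start one virtqueue in the backend: program the ring size and base index,
 * fix up the vring endianness for legacy cross-endian guests, map and set
 * the descriptor/avail/used ring addresses, and wire up the kick eventfd
 * (the call eventfd is disabled here if the queue has no interrupt vector).
 */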
955 static int vhost_virtqueue_start(struct vhost_dev *dev,
956                                 struct VirtIODevice *vdev,
957                                 struct vhost_virtqueue *vq,
958                                 unsigned idx)
959 {
960     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
961     VirtioBusState *vbus = VIRTIO_BUS(qbus);
962     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
963     hwaddr s, l, a;
964     int r;
965     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
966     struct vhost_vring_file file = {
967         .index = vhost_vq_index
968     };
969     struct vhost_vring_state state = {
970         .index = vhost_vq_index
971     };
972     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
973 
974     a = virtio_queue_get_desc_addr(vdev, idx);
975     if (a == 0) {
976         /* Queue might not be ready for start */
977         return 0;
978     }
979 
980     vq->num = state.num = virtio_queue_get_num(vdev, idx);
981     r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
982     if (r) {
983         VHOST_OPS_DEBUG("vhost_set_vring_num failed");
984         return -errno;
985     }
986 
987     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
988     r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
989     if (r) {
990         VHOST_OPS_DEBUG("vhost_set_vring_base failed");
991         return -errno;
992     }
993 
994     if (vhost_needs_vring_endian(vdev)) {
995         r = vhost_virtqueue_set_vring_endian_legacy(dev,
996                                                     virtio_is_big_endian(vdev),
997                                                     vhost_vq_index);
998         if (r) {
999             return -errno;
1000         }
1001     }
1002 
1003     vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1004     vq->desc_phys = a;
1005     vq->desc = vhost_memory_map(dev, a, &l, 0);
1006     if (!vq->desc || l != s) {
1007         r = -ENOMEM;
1008         goto fail_alloc_desc;
1009     }
1010     vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1011     vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1012     vq->avail = vhost_memory_map(dev, a, &l, 0);
1013     if (!vq->avail || l != s) {
1014         r = -ENOMEM;
1015         goto fail_alloc_avail;
1016     }
1017     vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1018     vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1019     vq->used = vhost_memory_map(dev, a, &l, 1);
1020     if (!vq->used || l != s) {
1021         r = -ENOMEM;
1022         goto fail_alloc_used;
1023     }
1024 
1025     r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1026     if (r < 0) {
1027         r = -errno;
1028         goto fail_alloc;
1029     }
1030 
1031     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1032     r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1033     if (r) {
1034         VHOST_OPS_DEBUG("vhost_set_vring_kick failed");
1035         r = -errno;
1036         goto fail_kick;
1037     }
1038 
1039     /* Clear and discard previous events if any. */
1040     event_notifier_test_and_clear(&vq->masked_notifier);
1041 
1042     /* Init vring in unmasked state, unless guest_notifier_mask
1043      * will do it later.
1044      */
1045     if (!vdev->use_guest_notifier_mask) {
1046         /* TODO: check and handle errors. */
1047         vhost_virtqueue_mask(dev, vdev, idx, false);
1048     }
1049 
1050     if (k->query_guest_notifiers &&
1051         k->query_guest_notifiers(qbus->parent) &&
1052         virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1053         file.fd = -1;
1054         r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1055         if (r) {
1056             goto fail_vector;
1057         }
1058     }
1059 
1060     return 0;
1061 
1062 fail_vector:
1063 fail_kick:
1064 fail_alloc:
1065     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1066                        0, 0);
1067 fail_alloc_used:
1068     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1069                        0, 0);
1070 fail_alloc_avail:
1071     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1072                        0, 0);
1073 fail_alloc_desc:
1074     return r;
1075 }
1076 
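/*
 * Stop one virtqueue: read the last avail index back from the backend into
 * the virtio device, restore native vring endianness for legacy
 * cross-endian guests, and unmap the rings.
 */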
1077 static void vhost_virtqueue_stop(struct vhost_dev *dev,
1078                                     struct VirtIODevice *vdev,
1079                                     struct vhost_virtqueue *vq,
1080                                     unsigned idx)
1081 {
1082     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1083     struct vhost_vring_state state = {
1084         .index = vhost_vq_index,
1085     };
1086     int r;
1087 
1088     if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
1089         /* Don't stop a virtqueue that has not been started */
1090         return;
1091     }
1092 
1093     r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1094     if (r < 0) {
1095         VHOST_OPS_DEBUG("vhost VQ %u ring restore failed: %d", idx, r);
1096         /* Connection to the backend is broken, so let's sync internal
1097          * last avail idx to the device used idx.
1098          */
1099         virtio_queue_restore_last_avail_idx(vdev, idx);
1100     } else {
1101         virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1102     }
1103     virtio_queue_invalidate_signalled_used(vdev, idx);
1104     virtio_queue_update_used_idx(vdev, idx);
1105 
1106     /* In the cross-endian case, we need to reset the vring endianness to
1107      * native, as legacy devices expect it by default.
1108      */
1109     if (vhost_needs_vring_endian(vdev)) {
1110         vhost_virtqueue_set_vring_endian_legacy(dev,
1111                                                 !virtio_is_big_endian(vdev),
1112                                                 vhost_vq_index);
1113     }
1114 
1115     vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1116                        1, virtio_queue_get_used_size(vdev, idx));
1117     vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1118                        0, virtio_queue_get_avail_size(vdev, idx));
1119     vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1120                        0, virtio_queue_get_desc_size(vdev, idx));
1121 }
1122 
1123 static void vhost_eventfd_add(MemoryListener *listener,
1124                               MemoryRegionSection *section,
1125                               bool match_data, uint64_t data, EventNotifier *e)
1126 {
1127 }
1128 
1129 static void vhost_eventfd_del(MemoryListener *listener,
1130                               MemoryRegionSection *section,
1131                               bool match_data, uint64_t data, EventNotifier *e)
1132 {
1133 }
1134 
1135 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1136                                                 int n, uint32_t timeout)
1137 {
1138     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1139     struct vhost_vring_state state = {
1140         .index = vhost_vq_index,
1141         .num = timeout,
1142     };
1143     int r;
1144 
1145     if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1146         return -EINVAL;
1147     }
1148 
1149     r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1150     if (r) {
1151         VHOST_OPS_DEBUG("vhost_set_vring_busyloop_timeout failed");
1152         return r;
1153     }
1154 
1155     return 0;
1156 }
1157 
1158 static int vhost_virtqueue_init(struct vhost_dev *dev,
1159                                 struct vhost_virtqueue *vq, int n)
1160 {
1161     int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1162     struct vhost_vring_file file = {
1163         .index = vhost_vq_index,
1164     };
1165     int r = event_notifier_init(&vq->masked_notifier, 0);
1166     if (r < 0) {
1167         return r;
1168     }
1169 
1170     file.fd = event_notifier_get_fd(&vq->masked_notifier);
1171     r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1172     if (r) {
1173         VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1174         r = -errno;
1175         goto fail_call;
1176     }
1177 
1178     vq->dev = dev;
1179 
1180     return 0;
1181 fail_call:
1182     event_notifier_cleanup(&vq->masked_notifier);
1183     return r;
1184 }
1185 
1186 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1187 {
1188     event_notifier_cleanup(&vq->masked_notifier);
1189 }
1190 
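/*
 * Initialise a vhost device: set up the backend, take ownership, query its
 * feature set, initialise every virtqueue (with an optional busyloop
 * timeout), set up the memory and IOMMU listeners (only the memory listener
 * is registered here), and register a migration blocker when dirty logging
 * or shared log memory is unavailable.
 */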
1191 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1192                    VhostBackendType backend_type, uint32_t busyloop_timeout)
1193 {
1194     uint64_t features;
1195     int i, r, n_initialized_vqs = 0;
1196     Error *local_err = NULL;
1197 
1198     hdev->vdev = NULL;
1199     hdev->migration_blocker = NULL;
1200 
1201     r = vhost_set_backend_type(hdev, backend_type);
1202     assert(r >= 0);
1203 
1204     r = hdev->vhost_ops->vhost_backend_init(hdev, opaque);
1205     if (r < 0) {
1206         goto fail;
1207     }
1208 
1209     r = hdev->vhost_ops->vhost_set_owner(hdev);
1210     if (r < 0) {
1211         VHOST_OPS_DEBUG("vhost_set_owner failed");
1212         goto fail;
1213     }
1214 
1215     r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1216     if (r < 0) {
1217         VHOST_OPS_DEBUG("vhost_get_features failed");
1218         goto fail;
1219     }
1220 
1221     for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1222         r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1223         if (r < 0) {
1224             goto fail;
1225         }
1226     }
1227 
1228     if (busyloop_timeout) {
1229         for (i = 0; i < hdev->nvqs; ++i) {
1230             r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1231                                                      busyloop_timeout);
1232             if (r < 0) {
1233                 goto fail_busyloop;
1234             }
1235         }
1236     }
1237 
1238     hdev->features = features;
1239 
1240     hdev->memory_listener = (MemoryListener) {
1241         .begin = vhost_begin,
1242         .commit = vhost_commit,
1243         .region_add = vhost_region_addnop,
1244         .region_nop = vhost_region_addnop,
1245         .log_start = vhost_log_start,
1246         .log_stop = vhost_log_stop,
1247         .log_sync = vhost_log_sync,
1248         .log_global_start = vhost_log_global_start,
1249         .log_global_stop = vhost_log_global_stop,
1250         .eventfd_add = vhost_eventfd_add,
1251         .eventfd_del = vhost_eventfd_del,
1252         .priority = 10
1253     };
1254 
1255     hdev->iommu_listener = (MemoryListener) {
1256         .region_add = vhost_iommu_region_add,
1257         .region_del = vhost_iommu_region_del,
1258     };
1259 
1260     if (hdev->migration_blocker == NULL) {
1261         if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1262             error_setg(&hdev->migration_blocker,
1263                        "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1264         } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
1265             error_setg(&hdev->migration_blocker,
1266                        "Migration disabled: failed to allocate shared memory");
1267         }
1268     }
1269 
1270     if (hdev->migration_blocker != NULL) {
1271         r = migrate_add_blocker(hdev->migration_blocker, &local_err);
1272         if (local_err) {
1273             error_report_err(local_err);
1274             error_free(hdev->migration_blocker);
1275             goto fail_busyloop;
1276         }
1277     }
1278 
1279     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1280     hdev->n_mem_sections = 0;
1281     hdev->mem_sections = NULL;
1282     hdev->log = NULL;
1283     hdev->log_size = 0;
1284     hdev->log_enabled = false;
1285     hdev->started = false;
1286     memory_listener_register(&hdev->memory_listener, &address_space_memory);
1287     QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1288 
1289     if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
1290         error_report("vhost backend memory slots limit is less"
1291                 " than current number of present memory slots");
1292         r = -1;
1293         if (busyloop_timeout) {
1294             goto fail_busyloop;
1295         } else {
1296             goto fail;
1297         }
1298     }
1299 
1300     return 0;
1301 
1302 fail_busyloop:
1303     while (--i >= 0) {
1304         vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1305     }
1306 fail:
1307     hdev->nvqs = n_initialized_vqs;
1308     vhost_dev_cleanup(hdev);
1309     return r;
1310 }
1311 
1312 void vhost_dev_cleanup(struct vhost_dev *hdev)
1313 {
1314     int i;
1315 
1316     for (i = 0; i < hdev->nvqs; ++i) {
1317         vhost_virtqueue_cleanup(hdev->vqs + i);
1318     }
1319     if (hdev->mem) {
1320         /* those are only safe after successful init */
1321         memory_listener_unregister(&hdev->memory_listener);
1322         QLIST_REMOVE(hdev, entry);
1323     }
1324     if (hdev->migration_blocker) {
1325         migrate_del_blocker(hdev->migration_blocker);
1326         error_free(hdev->migration_blocker);
1327     }
1328     g_free(hdev->mem);
1329     g_free(hdev->mem_sections);
1330     if (hdev->vhost_ops) {
1331         hdev->vhost_ops->vhost_backend_cleanup(hdev);
1332     }
1333     assert(!hdev->log);
1334 
1335     memset(hdev, 0, sizeof(struct vhost_dev));
1336 }
1337 
1338 /* Stop processing guest IO notifications in qemu.
1339  * Start processing them in vhost in kernel.
1340  */
1341 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1342 {
1343     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1344     int i, r, e;
1345 
1346     /* We will pass the notifiers to the kernel; make sure that QEMU
1347      * doesn't interfere.
1348      */
1349     r = virtio_device_grab_ioeventfd(vdev);
1350     if (r < 0) {
1351         error_report("binding does not support host notifiers");
1352         goto fail;
1353     }
1354 
1355     for (i = 0; i < hdev->nvqs; ++i) {
1356         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1357                                          true);
1358         if (r < 0) {
1359             error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1360             goto fail_vq;
1361         }
1362     }
1363 
1364     return 0;
1365 fail_vq:
1366     while (--i >= 0) {
1367         e = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1368                                          false);
1369         if (e < 0) {
1370             error_report("vhost VQ %d notifier cleanup error: %d", i, -r);
1371         }
1372         assert (e >= 0);
1373         virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1374     }
1375     virtio_device_release_ioeventfd(vdev);
1376 fail:
1377     return r;
1378 }
1379 
1380 /* Stop processing guest IO notifications in vhost.
1381  * Start processing them in qemu.
1382  * This might actually run the qemu handlers right away,
1383  * so virtio in qemu must be completely setup when this is called.
1384  */
1385 void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1386 {
1387     BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1388     int i, r;
1389 
1390     for (i = 0; i < hdev->nvqs; ++i) {
1391         r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1392                                          false);
1393         if (r < 0) {
1394             error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1395         }
1396         assert(r >= 0);
1397         virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1398     }
1399     virtio_device_release_ioeventfd(vdev);
1400 }
1401 
1402 /* Test and clear event pending status.
1403  * Should be called after unmask to avoid losing events.
1404  */
1405 bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
1406 {
1407     struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
1408     assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
1409     return event_notifier_test_and_clear(&vq->masked_notifier);
1410 }
1411 
1412 /* Mask/unmask events from this vq. */
1413 void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
1414                          bool mask)
1415 {
1416     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
1417     int r, index = n - hdev->vq_index;
1418     struct vhost_vring_file file;
1419 
1420     /* should only be called after backend is connected */
1421     assert(hdev->vhost_ops);
1422 
1423     if (mask) {
1424         assert(vdev->use_guest_notifier_mask);
1425         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
1426     } else {
1427         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
1428     }
1429 
1430     file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
1431     r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
1432     if (r < 0) {
1433         VHOST_OPS_DEBUG("vhost_set_vring_call failed");
1434     }
1435 }
1436 
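/*
 * Mask out of @features any bits listed in @feature_bits that the backend
 * does not support; bits not listed are passed through unchanged.
 */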
1437 uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
1438                             uint64_t features)
1439 {
1440     const int *bit = feature_bits;
1441     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1442         uint64_t bit_mask = (1ULL << *bit);
1443         if (!(hdev->features & bit_mask)) {
1444             features &= ~bit_mask;
1445         }
1446         bit++;
1447     }
1448     return features;
1449 }
1450 
1451 void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
1452                         uint64_t features)
1453 {
1454     const int *bit = feature_bits;
1455     while (*bit != VHOST_INVALID_FEATURE_BIT) {
1456         uint64_t bit_mask = (1ULL << *bit);
1457         if (features & bit_mask) {
1458             hdev->acked_features |= bit_mask;
1459         }
1460         bit++;
1461     }
1462 }
1463 
1464 int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
1465                          uint32_t config_len)
1466 {
1467     assert(hdev->vhost_ops);
1468 
1469     if (hdev->vhost_ops->vhost_get_config) {
1470         return hdev->vhost_ops->vhost_get_config(hdev, config, config_len);
1471     }
1472 
1473     return -1;
1474 }
1475 
1476 int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
1477                          uint32_t offset, uint32_t size, uint32_t flags)
1478 {
1479     assert(hdev->vhost_ops);
1480 
1481     if (hdev->vhost_ops->vhost_set_config) {
1482         return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
1483                                                  size, flags);
1484     }
1485 
1486     return -1;
1487 }
1488 
1489 void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
1490                                    const VhostDevConfigOps *ops)
1491 {
1492     hdev->config_ops = ops;
1493 }
1494 
1495 void vhost_dev_free_inflight(struct vhost_inflight *inflight)
1496 {
1497     if (inflight->addr) {
1498         qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
1499         inflight->addr = NULL;
1500         inflight->fd = -1;
1501     }
1502 }
1503 
1504 static int vhost_dev_resize_inflight(struct vhost_inflight *inflight,
1505                                      uint64_t new_size)
1506 {
1507     Error *err = NULL;
1508     int fd = -1;
1509     void *addr = qemu_memfd_alloc("vhost-inflight", new_size,
1510                                   F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
1511                                   &fd, &err);
1512 
1513     if (err) {
1514         error_report_err(err);
1515         return -1;
1516     }
1517 
1518     vhost_dev_free_inflight(inflight);
1519     inflight->offset = 0;
1520     inflight->addr = addr;
1521     inflight->fd = fd;
1522     inflight->size = new_size;
1523 
1524     return 0;
1525 }
1526 
1527 void vhost_dev_save_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1528 {
1529     if (inflight->addr) {
1530         qemu_put_be64(f, inflight->size);
1531         qemu_put_be16(f, inflight->queue_size);
1532         qemu_put_buffer(f, inflight->addr, inflight->size);
1533     } else {
1534         qemu_put_be64(f, 0);
1535     }
1536 }
1537 
1538 int vhost_dev_load_inflight(struct vhost_inflight *inflight, QEMUFile *f)
1539 {
1540     uint64_t size;
1541 
1542     size = qemu_get_be64(f);
1543     if (!size) {
1544         return 0;
1545     }
1546 
1547     if (inflight->size != size) {
1548         if (vhost_dev_resize_inflight(inflight, size)) {
1549             return -1;
1550         }
1551     }
1552     inflight->queue_size = qemu_get_be16(f);
1553 
1554     qemu_get_buffer(f, inflight->addr, size);
1555 
1556     return 0;
1557 }
1558 
1559 int vhost_dev_set_inflight(struct vhost_dev *dev,
1560                            struct vhost_inflight *inflight)
1561 {
1562     int r;
1563 
1564     if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
1565         r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
1566         if (r) {
1567             VHOST_OPS_DEBUG("vhost_set_inflight_fd failed");
1568             return -errno;
1569         }
1570     }
1571 
1572     return 0;
1573 }
1574 
1575 int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
1576                            struct vhost_inflight *inflight)
1577 {
1578     int r;
1579 
1580     if (dev->vhost_ops->vhost_get_inflight_fd) {
1581         r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
1582         if (r) {
1583             VHOST_OPS_DEBUG("vhost_get_inflight_fd failed");
1584             return -errno;
1585         }
1586     }
1587 
1588     return 0;
1589 }
1590 
1591 /* Host notifiers must be enabled at this point. */
1592 int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
1593 {
1594     int i, r;
1595 
1596     /* should only be called after backend is connected */
1597     assert(hdev->vhost_ops);
1598 
1599     hdev->started = true;
1600     hdev->vdev = vdev;
1601 
1602     r = vhost_dev_set_features(hdev, hdev->log_enabled);
1603     if (r < 0) {
1604         goto fail_features;
1605     }
1606 
1607     if (vhost_dev_has_iommu(hdev)) {
1608         memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
1609     }
1610 
1611     r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
1612     if (r < 0) {
1613         VHOST_OPS_DEBUG("vhost_set_mem_table failed");
1614         r = -errno;
1615         goto fail_mem;
1616     }
1617     for (i = 0; i < hdev->nvqs; ++i) {
1618         r = vhost_virtqueue_start(hdev,
1619                                   vdev,
1620                                   hdev->vqs + i,
1621                                   hdev->vq_index + i);
1622         if (r < 0) {
1623             goto fail_vq;
1624         }
1625     }
1626 
1627     if (hdev->log_enabled) {
1628         uint64_t log_base;
1629 
1630         hdev->log_size = vhost_get_log_size(hdev);
1631         hdev->log = vhost_log_get(hdev->log_size,
1632                                   vhost_dev_log_is_shared(hdev));
1633         log_base = (uintptr_t)hdev->log->log;
1634         r = hdev->vhost_ops->vhost_set_log_base(hdev,
1635                                                 hdev->log_size ? log_base : 0,
1636                                                 hdev->log);
1637         if (r < 0) {
1638             VHOST_OPS_DEBUG("vhost_set_log_base failed");
1639             r = -errno;
1640             goto fail_log;
1641         }
1642     }
1643 
1644     if (vhost_dev_has_iommu(hdev)) {
1645         hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);
1646 
1647         /* Update used ring information for IOTLB to work correctly;
1648          * the vhost-kernel code requires this. */
1649         for (i = 0; i < hdev->nvqs; ++i) {
1650             struct vhost_virtqueue *vq = hdev->vqs + i;
1651             vhost_device_iotlb_miss(hdev, vq->used_phys, true);
1652         }
1653     }
1654     return 0;
1655 fail_log:
1656     vhost_log_put(hdev, false);
1657 fail_vq:
1658     while (--i >= 0) {
1659         vhost_virtqueue_stop(hdev,
1660                              vdev,
1661                              hdev->vqs + i,
1662                              hdev->vq_index + i);
1663     }
1664 
1665 fail_mem:
1666 fail_features:
1667 
1668     hdev->started = false;
1669     return r;
1670 }
1671 
1672 /* Host notifiers must be enabled at this point. */
1673 void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
1674 {
1675     int i;
1676 
1677     /* should only be called after backend is connected */
1678     assert(hdev->vhost_ops);
1679 
1680     for (i = 0; i < hdev->nvqs; ++i) {
1681         vhost_virtqueue_stop(hdev,
1682                              vdev,
1683                              hdev->vqs + i,
1684                              hdev->vq_index + i);
1685     }
1686 
1687     if (vhost_dev_has_iommu(hdev)) {
1688         hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
1689         memory_listener_unregister(&hdev->iommu_listener);
1690     }
1691     vhost_log_put(hdev, true);
1692     hdev->started = false;
1693     hdev->vdev = NULL;
1694 }
1695 
1696 int vhost_net_set_backend(struct vhost_dev *hdev,
1697                           struct vhost_vring_file *file)
1698 {
1699     if (hdev->vhost_ops->vhost_net_set_backend) {
1700         return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
1701     }
1702 
1703     return -1;
1704 }
1705