1 /*
2 * vhost support
3 *
4 * Copyright Red Hat, Inc. 2010
5 *
6 * Authors:
7 * Michael S. Tsirkin <mst@redhat.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2. See
10 * the COPYING file in the top-level directory.
11 *
12 * Contributions after 2012-01-13 are licensed under the terms of the
13 * GNU GPL, version 2 or (at your option) any later version.
14 */
15
16 #include "qemu/osdep.h"
17 #include "qapi/error.h"
18 #include "hw/virtio/vhost.h"
19 #include "qemu/atomic.h"
20 #include "qemu/range.h"
21 #include "qemu/error-report.h"
22 #include "qemu/memfd.h"
23 #include "qemu/log.h"
24 #include "standard-headers/linux/vhost_types.h"
25 #include "hw/virtio/virtio-bus.h"
26 #include "hw/mem/memory-device.h"
27 #include "migration/blocker.h"
28 #include "migration/qemu-file-types.h"
29 #include "system/dma.h"
30 #include "trace.h"
31
32 /* enabled until disconnected backend stabilizes */
33 #define _VHOST_DEBUG 1
34
35 #ifdef _VHOST_DEBUG
36 #define VHOST_OPS_DEBUG(retval, fmt, ...) \
37 do { \
38 error_report(fmt ": %s (%d)", ## __VA_ARGS__, \
39 strerror(-retval), -retval); \
40 } while (0)
41 #else
42 #define VHOST_OPS_DEBUG(retval, fmt, ...) \
43 do { } while (0)
44 #endif
45
46 static struct vhost_log *vhost_log[VHOST_BACKEND_TYPE_MAX];
47 static struct vhost_log *vhost_log_shm[VHOST_BACKEND_TYPE_MAX];
48 static QLIST_HEAD(, vhost_dev) vhost_log_devs[VHOST_BACKEND_TYPE_MAX];
49
50 static QLIST_HEAD(, vhost_dev) vhost_devices =
51 QLIST_HEAD_INITIALIZER(vhost_devices);
52
vhost_get_max_memslots(void)53 unsigned int vhost_get_max_memslots(void)
54 {
55 unsigned int max = UINT_MAX;
56 struct vhost_dev *hdev;
57
58 QLIST_FOREACH(hdev, &vhost_devices, entry) {
59 max = MIN(max, hdev->vhost_ops->vhost_backend_memslots_limit(hdev));
60 }
61 return max;
62 }
63
vhost_get_free_memslots(void)64 unsigned int vhost_get_free_memslots(void)
65 {
66 unsigned int free = UINT_MAX;
67 struct vhost_dev *hdev;
68
69 QLIST_FOREACH(hdev, &vhost_devices, entry) {
70 unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
71 unsigned int cur_free = r - hdev->mem->nregions;
72
73 if (unlikely(r < hdev->mem->nregions)) {
74 warn_report_once("used (%u) vhost backend memory slots exceed"
75 " the device limit (%u).", hdev->mem->nregions, r);
76 free = 0;
77 } else {
78 free = MIN(free, cur_free);
79 }
80 }
81 return free;
82 }
83
vhost_dev_sync_region(struct vhost_dev * dev,MemoryRegionSection * section,uint64_t mfirst,uint64_t mlast,uint64_t rfirst,uint64_t rlast)84 static void vhost_dev_sync_region(struct vhost_dev *dev,
85 MemoryRegionSection *section,
86 uint64_t mfirst, uint64_t mlast,
87 uint64_t rfirst, uint64_t rlast)
88 {
89 vhost_log_chunk_t *dev_log = dev->log->log;
90
91 uint64_t start = MAX(mfirst, rfirst);
92 uint64_t end = MIN(mlast, rlast);
93 vhost_log_chunk_t *from = dev_log + start / VHOST_LOG_CHUNK;
94 vhost_log_chunk_t *to = dev_log + end / VHOST_LOG_CHUNK + 1;
95 uint64_t addr = QEMU_ALIGN_DOWN(start, VHOST_LOG_CHUNK);
96
97 if (end < start) {
98 return;
99 }
100 assert(end / VHOST_LOG_CHUNK < dev->log_size);
101 assert(start / VHOST_LOG_CHUNK < dev->log_size);
102
103 for (;from < to; ++from) {
104 vhost_log_chunk_t log;
105 /* We first check with non-atomic: much cheaper,
106 * and we expect non-dirty to be the common case. */
107 if (!*from) {
108 addr += VHOST_LOG_CHUNK;
109 continue;
110 }
111 /* Data must be read atomically. We don't really need barrier semantics
112 * but it's easier to use atomic_* than roll our own. */
113 log = qatomic_xchg(from, 0);
114 while (log) {
115 int bit = ctzl(log);
116 hwaddr page_addr;
117 hwaddr section_offset;
118 hwaddr mr_offset;
119 page_addr = addr + bit * VHOST_LOG_PAGE;
120 section_offset = page_addr - section->offset_within_address_space;
121 mr_offset = section_offset + section->offset_within_region;
122 memory_region_set_dirty(section->mr, mr_offset, VHOST_LOG_PAGE);
123 log &= ~(0x1ull << bit);
124 }
125 addr += VHOST_LOG_CHUNK;
126 }
127 }
128
vhost_dev_has_iommu(struct vhost_dev * dev)129 bool vhost_dev_has_iommu(struct vhost_dev *dev)
130 {
131 VirtIODevice *vdev = dev->vdev;
132
133 /*
134 * For vhost, VIRTIO_F_IOMMU_PLATFORM means the backend support
135 * incremental memory mapping API via IOTLB API. For platform that
136 * does not have IOMMU, there's no need to enable this feature
137 * which may cause unnecessary IOTLB miss/update transactions.
138 */
139 if (vdev) {
140 return virtio_bus_device_iommu_enabled(vdev) &&
141 virtio_host_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM);
142 } else {
143 return false;
144 }
145 }
146
vhost_dev_should_log(struct vhost_dev * dev)147 static inline bool vhost_dev_should_log(struct vhost_dev *dev)
148 {
149 assert(dev->vhost_ops);
150 assert(dev->vhost_ops->backend_type > VHOST_BACKEND_TYPE_NONE);
151 assert(dev->vhost_ops->backend_type < VHOST_BACKEND_TYPE_MAX);
152
153 return dev == QLIST_FIRST(&vhost_log_devs[dev->vhost_ops->backend_type]);
154 }
155
vhost_dev_elect_mem_logger(struct vhost_dev * hdev,bool add)156 static inline void vhost_dev_elect_mem_logger(struct vhost_dev *hdev, bool add)
157 {
158 VhostBackendType backend_type;
159
160 assert(hdev->vhost_ops);
161
162 backend_type = hdev->vhost_ops->backend_type;
163 assert(backend_type > VHOST_BACKEND_TYPE_NONE);
164 assert(backend_type < VHOST_BACKEND_TYPE_MAX);
165
166 if (add && !QLIST_IS_INSERTED(hdev, logdev_entry)) {
167 if (QLIST_EMPTY(&vhost_log_devs[backend_type])) {
168 QLIST_INSERT_HEAD(&vhost_log_devs[backend_type],
169 hdev, logdev_entry);
170 } else {
171 /*
172 * The first vhost_device in the list is selected as the shared
173 * logger to scan memory sections. Put new entry next to the head
174 * to avoid inadvertent change to the underlying logger device.
175 * This is done in order to get better cache locality and to avoid
176 * performance churn on the hot path for log scanning. Even when
177 * new devices come and go quickly, it wouldn't end up changing
178 * the active leading logger device at all.
179 */
180 QLIST_INSERT_AFTER(QLIST_FIRST(&vhost_log_devs[backend_type]),
181 hdev, logdev_entry);
182 }
183 } else if (!add && QLIST_IS_INSERTED(hdev, logdev_entry)) {
184 QLIST_REMOVE(hdev, logdev_entry);
185 }
186 }
187
vhost_sync_dirty_bitmap(struct vhost_dev * dev,MemoryRegionSection * section,hwaddr first,hwaddr last)188 static int vhost_sync_dirty_bitmap(struct vhost_dev *dev,
189 MemoryRegionSection *section,
190 hwaddr first,
191 hwaddr last)
192 {
193 int i;
194 hwaddr start_addr;
195 hwaddr end_addr;
196
197 if (!dev->log_enabled || !dev->started) {
198 return 0;
199 }
200 start_addr = section->offset_within_address_space;
201 end_addr = range_get_last(start_addr, int128_get64(section->size));
202 start_addr = MAX(first, start_addr);
203 end_addr = MIN(last, end_addr);
204
205 if (vhost_dev_should_log(dev)) {
206 for (i = 0; i < dev->mem->nregions; ++i) {
207 struct vhost_memory_region *reg = dev->mem->regions + i;
208 vhost_dev_sync_region(dev, section, start_addr, end_addr,
209 reg->guest_phys_addr,
210 range_get_last(reg->guest_phys_addr,
211 reg->memory_size));
212 }
213 }
214 for (i = 0; i < dev->nvqs; ++i) {
215 struct vhost_virtqueue *vq = dev->vqs + i;
216
217 if (!vq->used_phys && !vq->used_size) {
218 continue;
219 }
220
221 if (vhost_dev_has_iommu(dev)) {
222 IOMMUTLBEntry iotlb;
223 hwaddr used_phys = vq->used_phys, used_size = vq->used_size;
224 hwaddr phys, s, offset;
225
226 while (used_size) {
227 rcu_read_lock();
228 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
229 used_phys,
230 true,
231 MEMTXATTRS_UNSPECIFIED);
232 rcu_read_unlock();
233
234 if (!iotlb.target_as) {
235 qemu_log_mask(LOG_GUEST_ERROR, "translation "
236 "failure for used_iova %"PRIx64"\n",
237 used_phys);
238 return -EINVAL;
239 }
240
241 offset = used_phys & iotlb.addr_mask;
242 phys = iotlb.translated_addr + offset;
243
244 /*
245 * Distance from start of used ring until last byte of
246 * IOMMU page.
247 */
248 s = iotlb.addr_mask - offset;
249 /*
250 * Size of used ring, or of the part of it until end
251 * of IOMMU page. To avoid zero result, do the adding
252 * outside of MIN().
253 */
254 s = MIN(s, used_size - 1) + 1;
255
256 vhost_dev_sync_region(dev, section, start_addr, end_addr, phys,
257 range_get_last(phys, s));
258 used_size -= s;
259 used_phys += s;
260 }
261 } else {
262 vhost_dev_sync_region(dev, section, start_addr,
263 end_addr, vq->used_phys,
264 range_get_last(vq->used_phys, vq->used_size));
265 }
266 }
267 return 0;
268 }
269
vhost_log_sync(MemoryListener * listener,MemoryRegionSection * section)270 static void vhost_log_sync(MemoryListener *listener,
271 MemoryRegionSection *section)
272 {
273 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
274 memory_listener);
275 vhost_sync_dirty_bitmap(dev, section, 0x0, ~0x0ULL);
276 }
277
vhost_log_sync_range(struct vhost_dev * dev,hwaddr first,hwaddr last)278 static void vhost_log_sync_range(struct vhost_dev *dev,
279 hwaddr first, hwaddr last)
280 {
281 int i;
282 /* FIXME: this is N^2 in number of sections */
283 for (i = 0; i < dev->n_mem_sections; ++i) {
284 MemoryRegionSection *section = &dev->mem_sections[i];
285 vhost_sync_dirty_bitmap(dev, section, first, last);
286 }
287 }
288
vhost_get_log_size(struct vhost_dev * dev)289 static uint64_t vhost_get_log_size(struct vhost_dev *dev)
290 {
291 uint64_t log_size = 0;
292 int i;
293 for (i = 0; i < dev->mem->nregions; ++i) {
294 struct vhost_memory_region *reg = dev->mem->regions + i;
295 uint64_t last = range_get_last(reg->guest_phys_addr,
296 reg->memory_size);
297 log_size = MAX(log_size, last / VHOST_LOG_CHUNK + 1);
298 }
299 return log_size;
300 }
301
vhost_set_backend_type(struct vhost_dev * dev,VhostBackendType backend_type)302 static int vhost_set_backend_type(struct vhost_dev *dev,
303 VhostBackendType backend_type)
304 {
305 int r = 0;
306
307 switch (backend_type) {
308 #ifdef CONFIG_VHOST_KERNEL
309 case VHOST_BACKEND_TYPE_KERNEL:
310 dev->vhost_ops = &kernel_ops;
311 break;
312 #endif
313 #ifdef CONFIG_VHOST_USER
314 case VHOST_BACKEND_TYPE_USER:
315 dev->vhost_ops = &user_ops;
316 break;
317 #endif
318 #ifdef CONFIG_VHOST_VDPA
319 case VHOST_BACKEND_TYPE_VDPA:
320 dev->vhost_ops = &vdpa_ops;
321 break;
322 #endif
323 default:
324 error_report("Unknown vhost backend type");
325 r = -1;
326 }
327
328 if (r == 0) {
329 assert(dev->vhost_ops->backend_type == backend_type);
330 }
331
332 return r;
333 }
334
vhost_log_alloc(uint64_t size,bool share)335 static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
336 {
337 Error *err = NULL;
338 struct vhost_log *log;
339 uint64_t logsize = size * sizeof(*(log->log));
340 int fd = -1;
341
342 log = g_new0(struct vhost_log, 1);
343 if (share) {
344 log->log = qemu_memfd_alloc("vhost-log", logsize,
345 F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
346 &fd, &err);
347 if (err) {
348 error_report_err(err);
349 g_free(log);
350 return NULL;
351 }
352 memset(log->log, 0, logsize);
353 } else {
354 log->log = g_malloc0(logsize);
355 }
356
357 log->size = size;
358 log->refcnt = 1;
359 log->fd = fd;
360
361 return log;
362 }
363
vhost_log_get(VhostBackendType backend_type,uint64_t size,bool share)364 static struct vhost_log *vhost_log_get(VhostBackendType backend_type,
365 uint64_t size, bool share)
366 {
367 struct vhost_log *log;
368
369 assert(backend_type > VHOST_BACKEND_TYPE_NONE);
370 assert(backend_type < VHOST_BACKEND_TYPE_MAX);
371
372 log = share ? vhost_log_shm[backend_type] : vhost_log[backend_type];
373
374 if (!log || log->size != size) {
375 log = vhost_log_alloc(size, share);
376 if (share) {
377 vhost_log_shm[backend_type] = log;
378 } else {
379 vhost_log[backend_type] = log;
380 }
381 } else {
382 ++log->refcnt;
383 }
384
385 return log;
386 }
387
vhost_log_put(struct vhost_dev * dev,bool sync)388 static void vhost_log_put(struct vhost_dev *dev, bool sync)
389 {
390 struct vhost_log *log = dev->log;
391 VhostBackendType backend_type;
392
393 if (!log) {
394 return;
395 }
396
397 assert(dev->vhost_ops);
398 backend_type = dev->vhost_ops->backend_type;
399
400 if (backend_type == VHOST_BACKEND_TYPE_NONE ||
401 backend_type >= VHOST_BACKEND_TYPE_MAX) {
402 return;
403 }
404
405 --log->refcnt;
406 if (log->refcnt == 0) {
407 /* Sync only the range covered by the old log */
408 if (dev->log_size && sync) {
409 vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
410 }
411
412 if (vhost_log[backend_type] == log) {
413 g_free(log->log);
414 vhost_log[backend_type] = NULL;
415 } else if (vhost_log_shm[backend_type] == log) {
416 qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
417 log->fd);
418 vhost_log_shm[backend_type] = NULL;
419 }
420
421 g_free(log);
422 }
423
424 vhost_dev_elect_mem_logger(dev, false);
425 dev->log = NULL;
426 dev->log_size = 0;
427 }
428
vhost_dev_log_is_shared(struct vhost_dev * dev)429 static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
430 {
431 return dev->vhost_ops->vhost_requires_shm_log &&
432 dev->vhost_ops->vhost_requires_shm_log(dev);
433 }
434
vhost_dev_log_resize(struct vhost_dev * dev,uint64_t size)435 static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
436 {
437 struct vhost_log *log = vhost_log_get(dev->vhost_ops->backend_type,
438 size, vhost_dev_log_is_shared(dev));
439 uint64_t log_base = (uintptr_t)log->log;
440 int r;
441
442 /* inform backend of log switching, this must be done before
443 releasing the current log, to ensure no logging is lost */
444 r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
445 if (r < 0) {
446 VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
447 }
448
449 vhost_log_put(dev, true);
450 dev->log = log;
451 dev->log_size = size;
452 }
453
vhost_memory_map(struct vhost_dev * dev,hwaddr addr,hwaddr * plen,bool is_write)454 static void *vhost_memory_map(struct vhost_dev *dev, hwaddr addr,
455 hwaddr *plen, bool is_write)
456 {
457 if (!vhost_dev_has_iommu(dev)) {
458 return cpu_physical_memory_map(addr, plen, is_write);
459 } else {
460 return (void *)(uintptr_t)addr;
461 }
462 }
463
vhost_memory_unmap(struct vhost_dev * dev,void * buffer,hwaddr len,int is_write,hwaddr access_len)464 static void vhost_memory_unmap(struct vhost_dev *dev, void *buffer,
465 hwaddr len, int is_write,
466 hwaddr access_len)
467 {
468 if (!vhost_dev_has_iommu(dev)) {
469 cpu_physical_memory_unmap(buffer, len, is_write, access_len);
470 }
471 }
472
vhost_verify_ring_part_mapping(void * ring_hva,uint64_t ring_gpa,uint64_t ring_size,void * reg_hva,uint64_t reg_gpa,uint64_t reg_size)473 static int vhost_verify_ring_part_mapping(void *ring_hva,
474 uint64_t ring_gpa,
475 uint64_t ring_size,
476 void *reg_hva,
477 uint64_t reg_gpa,
478 uint64_t reg_size)
479 {
480 uint64_t hva_ring_offset;
481 uint64_t ring_last = range_get_last(ring_gpa, ring_size);
482 uint64_t reg_last = range_get_last(reg_gpa, reg_size);
483
484 if (ring_last < reg_gpa || ring_gpa > reg_last) {
485 return 0;
486 }
487 /* check that whole ring's is mapped */
488 if (ring_last > reg_last) {
489 return -ENOMEM;
490 }
491 /* check that ring's MemoryRegion wasn't replaced */
492 hva_ring_offset = ring_gpa - reg_gpa;
493 if (ring_hva != reg_hva + hva_ring_offset) {
494 return -EBUSY;
495 }
496
497 return 0;
498 }
499
vhost_verify_ring_mappings(struct vhost_dev * dev,void * reg_hva,uint64_t reg_gpa,uint64_t reg_size)500 static int vhost_verify_ring_mappings(struct vhost_dev *dev,
501 void *reg_hva,
502 uint64_t reg_gpa,
503 uint64_t reg_size)
504 {
505 int i, j;
506 int r = 0;
507 const char *part_name[] = {
508 "descriptor table",
509 "available ring",
510 "used ring"
511 };
512
513 if (vhost_dev_has_iommu(dev)) {
514 return 0;
515 }
516
517 for (i = 0; i < dev->nvqs; ++i) {
518 struct vhost_virtqueue *vq = dev->vqs + i;
519
520 if (vq->desc_phys == 0) {
521 continue;
522 }
523
524 j = 0;
525 r = vhost_verify_ring_part_mapping(
526 vq->desc, vq->desc_phys, vq->desc_size,
527 reg_hva, reg_gpa, reg_size);
528 if (r) {
529 break;
530 }
531
532 j++;
533 r = vhost_verify_ring_part_mapping(
534 vq->avail, vq->avail_phys, vq->avail_size,
535 reg_hva, reg_gpa, reg_size);
536 if (r) {
537 break;
538 }
539
540 j++;
541 r = vhost_verify_ring_part_mapping(
542 vq->used, vq->used_phys, vq->used_size,
543 reg_hva, reg_gpa, reg_size);
544 if (r) {
545 break;
546 }
547 }
548
549 if (r == -ENOMEM) {
550 error_report("Unable to map %s for ring %d", part_name[j], i);
551 } else if (r == -EBUSY) {
552 error_report("%s relocated for ring %d", part_name[j], i);
553 }
554 return r;
555 }
556
557 /*
558 * vhost_section: identify sections needed for vhost access
559 *
560 * We only care about RAM sections here (where virtqueue and guest
561 * internals accessed by virtio might live).
562 */
vhost_section(struct vhost_dev * dev,MemoryRegionSection * section)563 static bool vhost_section(struct vhost_dev *dev, MemoryRegionSection *section)
564 {
565 MemoryRegion *mr = section->mr;
566
567 if (memory_region_is_ram(mr) && !memory_region_is_rom(mr)) {
568 uint8_t dirty_mask = memory_region_get_dirty_log_mask(mr);
569 uint8_t handled_dirty;
570
571 /*
572 * Kernel based vhost doesn't handle any block which is doing
573 * dirty-tracking other than migration for which it has
574 * specific logging support. However for TCG the kernel never
575 * gets involved anyway so we can also ignore it's
576 * self-modiying code detection flags. However a vhost-user
577 * client could still confuse a TCG guest if it re-writes
578 * executable memory that has already been translated.
579 */
580 handled_dirty = (1 << DIRTY_MEMORY_MIGRATION) |
581 (1 << DIRTY_MEMORY_CODE);
582
583 if (dirty_mask & ~handled_dirty) {
584 trace_vhost_reject_section(mr->name, 1);
585 return false;
586 }
587
588 /*
589 * Some backends (like vhost-user) can only handle memory regions
590 * that have an fd (can be mapped into a different process). Filter
591 * the ones without an fd out, if requested.
592 *
593 * TODO: we might have to limit to MAP_SHARED as well.
594 */
595 if (memory_region_get_fd(section->mr) < 0 &&
596 dev->vhost_ops->vhost_backend_no_private_memslots &&
597 dev->vhost_ops->vhost_backend_no_private_memslots(dev)) {
598 trace_vhost_reject_section(mr->name, 2);
599 return false;
600 }
601
602 trace_vhost_section(mr->name);
603 return true;
604 } else {
605 trace_vhost_reject_section(mr->name, 3);
606 return false;
607 }
608 }
609
vhost_begin(MemoryListener * listener)610 static void vhost_begin(MemoryListener *listener)
611 {
612 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
613 memory_listener);
614 dev->tmp_sections = NULL;
615 dev->n_tmp_sections = 0;
616 }
617
vhost_commit(MemoryListener * listener)618 static void vhost_commit(MemoryListener *listener)
619 {
620 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
621 memory_listener);
622 MemoryRegionSection *old_sections;
623 int n_old_sections;
624 uint64_t log_size;
625 size_t regions_size;
626 int r;
627 int i;
628 bool changed = false;
629
630 /* Note we can be called before the device is started, but then
631 * starting the device calls set_mem_table, so we need to have
632 * built the data structures.
633 */
634 old_sections = dev->mem_sections;
635 n_old_sections = dev->n_mem_sections;
636 dev->mem_sections = dev->tmp_sections;
637 dev->n_mem_sections = dev->n_tmp_sections;
638
639 if (dev->n_mem_sections != n_old_sections) {
640 changed = true;
641 } else {
642 /* Same size, lets check the contents */
643 for (i = 0; i < n_old_sections; i++) {
644 if (!MemoryRegionSection_eq(&old_sections[i],
645 &dev->mem_sections[i])) {
646 changed = true;
647 break;
648 }
649 }
650 }
651
652 trace_vhost_commit(dev->started, changed);
653 if (!changed) {
654 goto out;
655 }
656
657 /* Rebuild the regions list from the new sections list */
658 regions_size = offsetof(struct vhost_memory, regions) +
659 dev->n_mem_sections * sizeof dev->mem->regions[0];
660 dev->mem = g_realloc(dev->mem, regions_size);
661 dev->mem->nregions = dev->n_mem_sections;
662
663 for (i = 0; i < dev->n_mem_sections; i++) {
664 struct vhost_memory_region *cur_vmr = dev->mem->regions + i;
665 struct MemoryRegionSection *mrs = dev->mem_sections + i;
666
667 cur_vmr->guest_phys_addr = mrs->offset_within_address_space;
668 cur_vmr->memory_size = int128_get64(mrs->size);
669 cur_vmr->userspace_addr =
670 (uintptr_t)memory_region_get_ram_ptr(mrs->mr) +
671 mrs->offset_within_region;
672 cur_vmr->flags_padding = 0;
673 }
674
675 if (!dev->started) {
676 goto out;
677 }
678
679 for (i = 0; i < dev->mem->nregions; i++) {
680 if (vhost_verify_ring_mappings(dev,
681 (void *)(uintptr_t)dev->mem->regions[i].userspace_addr,
682 dev->mem->regions[i].guest_phys_addr,
683 dev->mem->regions[i].memory_size)) {
684 error_report("Verify ring failure on region %d", i);
685 abort();
686 }
687 }
688
689 if (!dev->log_enabled) {
690 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
691 if (r < 0) {
692 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
693 }
694 goto out;
695 }
696 log_size = vhost_get_log_size(dev);
697 /* We allocate an extra 4K bytes to log,
698 * to reduce the * number of reallocations. */
699 #define VHOST_LOG_BUFFER (0x1000 / sizeof *dev->log)
700 /* To log more, must increase log size before table update. */
701 if (dev->log_size < log_size) {
702 vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
703 }
704 r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
705 if (r < 0) {
706 VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
707 }
708 /* To log less, can only decrease log size after table update. */
709 if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
710 vhost_dev_log_resize(dev, log_size);
711 }
712
713 out:
714 /* Deref the old list of sections, this must happen _after_ the
715 * vhost_set_mem_table to ensure the client isn't still using the
716 * section we're about to unref.
717 */
718 while (n_old_sections--) {
719 memory_region_unref(old_sections[n_old_sections].mr);
720 }
721 g_free(old_sections);
722 }
723
724 /* Adds the section data to the tmp_section structure.
725 * It relies on the listener calling us in memory address order
726 * and for each region (via the _add and _nop methods) to
727 * join neighbours.
728 */
vhost_region_add_section(struct vhost_dev * dev,MemoryRegionSection * section)729 static void vhost_region_add_section(struct vhost_dev *dev,
730 MemoryRegionSection *section)
731 {
732 bool need_add = true;
733 uint64_t mrs_size = int128_get64(section->size);
734 uint64_t mrs_gpa = section->offset_within_address_space;
735 uintptr_t mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
736 section->offset_within_region;
737 RAMBlock *mrs_rb = section->mr->ram_block;
738
739 trace_vhost_region_add_section(section->mr->name, mrs_gpa, mrs_size,
740 mrs_host);
741
742 if (dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER) {
743 /* Round the section to it's page size */
744 /* First align the start down to a page boundary */
745 size_t mrs_page = qemu_ram_pagesize(mrs_rb);
746 uint64_t alignage = mrs_host & (mrs_page - 1);
747 if (alignage) {
748 mrs_host -= alignage;
749 mrs_size += alignage;
750 mrs_gpa -= alignage;
751 }
752 /* Now align the size up to a page boundary */
753 alignage = mrs_size & (mrs_page - 1);
754 if (alignage) {
755 mrs_size += mrs_page - alignage;
756 }
757 trace_vhost_region_add_section_aligned(section->mr->name, mrs_gpa,
758 mrs_size, mrs_host);
759 }
760
761 if (dev->n_tmp_sections && !section->unmergeable) {
762 /* Since we already have at least one section, lets see if
763 * this extends it; since we're scanning in order, we only
764 * have to look at the last one, and the FlatView that calls
765 * us shouldn't have overlaps.
766 */
767 MemoryRegionSection *prev_sec = dev->tmp_sections +
768 (dev->n_tmp_sections - 1);
769 uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
770 uint64_t prev_size = int128_get64(prev_sec->size);
771 uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
772 uint64_t prev_host_start =
773 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
774 prev_sec->offset_within_region;
775 uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
776
777 if (mrs_gpa <= (prev_gpa_end + 1)) {
778 /* OK, looks like overlapping/intersecting - it's possible that
779 * the rounding to page sizes has made them overlap, but they should
780 * match up in the same RAMBlock if they do.
781 */
782 if (mrs_gpa < prev_gpa_start) {
783 error_report("%s:Section '%s' rounded to %"PRIx64
784 " prior to previous '%s' %"PRIx64,
785 __func__, section->mr->name, mrs_gpa,
786 prev_sec->mr->name, prev_gpa_start);
787 /* A way to cleanly fail here would be better */
788 return;
789 }
790 /* Offset from the start of the previous GPA to this GPA */
791 size_t offset = mrs_gpa - prev_gpa_start;
792
793 if (prev_host_start + offset == mrs_host &&
794 section->mr == prev_sec->mr && !prev_sec->unmergeable) {
795 uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
796 need_add = false;
797 prev_sec->offset_within_address_space =
798 MIN(prev_gpa_start, mrs_gpa);
799 prev_sec->offset_within_region =
800 MIN(prev_host_start, mrs_host) -
801 (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
802 prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
803 mrs_host));
804 trace_vhost_region_add_section_merge(section->mr->name,
805 int128_get64(prev_sec->size),
806 prev_sec->offset_within_address_space,
807 prev_sec->offset_within_region);
808 } else {
809 /* adjoining regions are fine, but overlapping ones with
810 * different blocks/offsets shouldn't happen
811 */
812 if (mrs_gpa != prev_gpa_end + 1) {
813 error_report("%s: Overlapping but not coherent sections "
814 "at %"PRIx64,
815 __func__, mrs_gpa);
816 return;
817 }
818 }
819 }
820 }
821
822 if (need_add) {
823 ++dev->n_tmp_sections;
824 dev->tmp_sections = g_renew(MemoryRegionSection, dev->tmp_sections,
825 dev->n_tmp_sections);
826 dev->tmp_sections[dev->n_tmp_sections - 1] = *section;
827 /* The flatview isn't stable and we don't use it, making it NULL
828 * means we can memcmp the list.
829 */
830 dev->tmp_sections[dev->n_tmp_sections - 1].fv = NULL;
831 memory_region_ref(section->mr);
832 }
833 }
834
835 /* Used for both add and nop callbacks */
vhost_region_addnop(MemoryListener * listener,MemoryRegionSection * section)836 static void vhost_region_addnop(MemoryListener *listener,
837 MemoryRegionSection *section)
838 {
839 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
840 memory_listener);
841
842 if (!vhost_section(dev, section)) {
843 return;
844 }
845 vhost_region_add_section(dev, section);
846 }
847
vhost_iommu_unmap_notify(IOMMUNotifier * n,IOMMUTLBEntry * iotlb)848 static void vhost_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
849 {
850 struct vhost_iommu *iommu = container_of(n, struct vhost_iommu, n);
851 struct vhost_dev *hdev = iommu->hdev;
852 hwaddr iova = iotlb->iova + iommu->iommu_offset;
853
854 if (vhost_backend_invalidate_device_iotlb(hdev, iova,
855 iotlb->addr_mask + 1)) {
856 error_report("Fail to invalidate device iotlb");
857 }
858 }
859
vhost_iommu_region_add(MemoryListener * listener,MemoryRegionSection * section)860 static void vhost_iommu_region_add(MemoryListener *listener,
861 MemoryRegionSection *section)
862 {
863 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
864 iommu_listener);
865 struct vhost_iommu *iommu;
866 Int128 end;
867 int iommu_idx;
868 IOMMUMemoryRegion *iommu_mr;
869
870 if (!memory_region_is_iommu(section->mr)) {
871 return;
872 }
873
874 iommu_mr = IOMMU_MEMORY_REGION(section->mr);
875
876 iommu = g_malloc0(sizeof(*iommu));
877 end = int128_add(int128_make64(section->offset_within_region),
878 section->size);
879 end = int128_sub(end, int128_one());
880 iommu_idx = memory_region_iommu_attrs_to_index(iommu_mr,
881 MEMTXATTRS_UNSPECIFIED);
882 iommu_notifier_init(&iommu->n, vhost_iommu_unmap_notify,
883 dev->vdev->device_iotlb_enabled ?
884 IOMMU_NOTIFIER_DEVIOTLB_UNMAP :
885 IOMMU_NOTIFIER_UNMAP,
886 section->offset_within_region,
887 int128_get64(end),
888 iommu_idx);
889 iommu->mr = section->mr;
890 iommu->iommu_offset = section->offset_within_address_space -
891 section->offset_within_region;
892 iommu->hdev = dev;
893 memory_region_register_iommu_notifier(section->mr, &iommu->n,
894 &error_fatal);
895 QLIST_INSERT_HEAD(&dev->iommu_list, iommu, iommu_next);
896 /* TODO: can replay help performance here? */
897 }
898
vhost_iommu_region_del(MemoryListener * listener,MemoryRegionSection * section)899 static void vhost_iommu_region_del(MemoryListener *listener,
900 MemoryRegionSection *section)
901 {
902 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
903 iommu_listener);
904 struct vhost_iommu *iommu;
905
906 if (!memory_region_is_iommu(section->mr)) {
907 return;
908 }
909
910 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
911 if (iommu->mr == section->mr &&
912 iommu->n.start == section->offset_within_region) {
913 memory_region_unregister_iommu_notifier(iommu->mr,
914 &iommu->n);
915 QLIST_REMOVE(iommu, iommu_next);
916 g_free(iommu);
917 break;
918 }
919 }
920 }
921
vhost_toggle_device_iotlb(VirtIODevice * vdev)922 void vhost_toggle_device_iotlb(VirtIODevice *vdev)
923 {
924 VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
925 struct vhost_dev *dev;
926 struct vhost_iommu *iommu;
927
928 if (vdev->vhost_started) {
929 dev = vdc->get_vhost(vdev);
930 } else {
931 return;
932 }
933
934 QLIST_FOREACH(iommu, &dev->iommu_list, iommu_next) {
935 memory_region_unregister_iommu_notifier(iommu->mr, &iommu->n);
936 iommu->n.notifier_flags = vdev->device_iotlb_enabled ?
937 IOMMU_NOTIFIER_DEVIOTLB_UNMAP : IOMMU_NOTIFIER_UNMAP;
938 memory_region_register_iommu_notifier(iommu->mr, &iommu->n,
939 &error_fatal);
940 }
941 }
942
vhost_virtqueue_set_addr(struct vhost_dev * dev,struct vhost_virtqueue * vq,unsigned idx,bool enable_log)943 static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
944 struct vhost_virtqueue *vq,
945 unsigned idx, bool enable_log)
946 {
947 struct vhost_vring_addr addr;
948 int r;
949 memset(&addr, 0, sizeof(struct vhost_vring_addr));
950
951 if (dev->vhost_ops->vhost_vq_get_addr) {
952 r = dev->vhost_ops->vhost_vq_get_addr(dev, &addr, vq);
953 if (r < 0) {
954 VHOST_OPS_DEBUG(r, "vhost_vq_get_addr failed");
955 return r;
956 }
957 } else {
958 addr.desc_user_addr = (uint64_t)(unsigned long)vq->desc;
959 addr.avail_user_addr = (uint64_t)(unsigned long)vq->avail;
960 addr.used_user_addr = (uint64_t)(unsigned long)vq->used;
961 }
962 addr.index = idx;
963 addr.log_guest_addr = vq->used_phys;
964 addr.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0;
965 r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
966 if (r < 0) {
967 VHOST_OPS_DEBUG(r, "vhost_set_vring_addr failed");
968 }
969 return r;
970 }
971
vhost_dev_set_features(struct vhost_dev * dev,bool enable_log)972 static int vhost_dev_set_features(struct vhost_dev *dev,
973 bool enable_log)
974 {
975 uint64_t features = dev->acked_features;
976 int r;
977 if (enable_log) {
978 features |= 0x1ULL << VHOST_F_LOG_ALL;
979 }
980 if (!vhost_dev_has_iommu(dev)) {
981 features &= ~(0x1ULL << VIRTIO_F_IOMMU_PLATFORM);
982 }
983 if (dev->vhost_ops->vhost_force_iommu) {
984 if (dev->vhost_ops->vhost_force_iommu(dev) == true) {
985 features |= 0x1ULL << VIRTIO_F_IOMMU_PLATFORM;
986 }
987 }
988 r = dev->vhost_ops->vhost_set_features(dev, features);
989 if (r < 0) {
990 VHOST_OPS_DEBUG(r, "vhost_set_features failed");
991 goto out;
992 }
993 if (dev->vhost_ops->vhost_set_backend_cap) {
994 r = dev->vhost_ops->vhost_set_backend_cap(dev);
995 if (r < 0) {
996 VHOST_OPS_DEBUG(r, "vhost_set_backend_cap failed");
997 goto out;
998 }
999 }
1000
1001 out:
1002 return r;
1003 }
1004
vhost_dev_set_log(struct vhost_dev * dev,bool enable_log)1005 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
1006 {
1007 int r, i, idx;
1008 hwaddr addr;
1009
1010 r = vhost_dev_set_features(dev, enable_log);
1011 if (r < 0) {
1012 goto err_features;
1013 }
1014 for (i = 0; i < dev->nvqs; ++i) {
1015 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
1016 addr = virtio_queue_get_desc_addr(dev->vdev, idx);
1017 if (!addr) {
1018 /*
1019 * The queue might not be ready for start. If this
1020 * is the case there is no reason to continue the process.
1021 * The similar logic is used by the vhost_virtqueue_start()
1022 * routine.
1023 */
1024 continue;
1025 }
1026 r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
1027 enable_log);
1028 if (r < 0) {
1029 goto err_vq;
1030 }
1031 }
1032
1033 /*
1034 * At log start we select our vhost_device logger that will scan the
1035 * memory sections and skip for the others. This is possible because
1036 * the log is shared amongst all vhost devices for a given type of
1037 * backend.
1038 */
1039 vhost_dev_elect_mem_logger(dev, enable_log);
1040
1041 return 0;
1042 err_vq:
1043 for (; i >= 0; --i) {
1044 idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
1045 addr = virtio_queue_get_desc_addr(dev->vdev, idx);
1046 if (!addr) {
1047 continue;
1048 }
1049 vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
1050 dev->log_enabled);
1051 }
1052 vhost_dev_set_features(dev, dev->log_enabled);
1053 err_features:
1054 return r;
1055 }
1056
vhost_migration_log(MemoryListener * listener,bool enable)1057 static int vhost_migration_log(MemoryListener *listener, bool enable)
1058 {
1059 struct vhost_dev *dev = container_of(listener, struct vhost_dev,
1060 memory_listener);
1061 int r;
1062 if (enable == dev->log_enabled) {
1063 return 0;
1064 }
1065 if (!dev->started) {
1066 dev->log_enabled = enable;
1067 return 0;
1068 }
1069
1070 r = 0;
1071 if (!enable) {
1072 r = vhost_dev_set_log(dev, false);
1073 if (r < 0) {
1074 goto check_dev_state;
1075 }
1076 vhost_log_put(dev, false);
1077 } else {
1078 vhost_dev_log_resize(dev, vhost_get_log_size(dev));
1079 r = vhost_dev_set_log(dev, true);
1080 if (r < 0) {
1081 goto check_dev_state;
1082 }
1083 }
1084
1085 check_dev_state:
1086 dev->log_enabled = enable;
1087 /*
1088 * vhost-user-* devices could change their state during log
1089 * initialization due to disconnect. So check dev state after
1090 * vhost communication.
1091 */
1092 if (!dev->started) {
1093 /*
1094 * Since device is in the stopped state, it is okay for
1095 * migration. Return success.
1096 */
1097 r = 0;
1098 }
1099 if (r) {
1100 /* An error occurred. */
1101 dev->log_enabled = false;
1102 }
1103
1104 return r;
1105 }
1106
vhost_log_global_start(MemoryListener * listener,Error ** errp)1107 static bool vhost_log_global_start(MemoryListener *listener, Error **errp)
1108 {
1109 int r;
1110
1111 r = vhost_migration_log(listener, true);
1112 if (r < 0) {
1113 error_setg_errno(errp, -r, "vhost: Failed to start logging");
1114 return false;
1115 }
1116 return true;
1117 }
1118
vhost_log_global_stop(MemoryListener * listener)1119 static void vhost_log_global_stop(MemoryListener *listener)
1120 {
1121 int r;
1122
1123 r = vhost_migration_log(listener, false);
1124 if (r < 0) {
1125 /* Not fatal, so report it, but take no further action */
1126 warn_report("vhost: Failed to stop logging");
1127 }
1128 }
1129
vhost_log_start(MemoryListener * listener,MemoryRegionSection * section,int old,int new)1130 static void vhost_log_start(MemoryListener *listener,
1131 MemoryRegionSection *section,
1132 int old, int new)
1133 {
1134 /* FIXME: implement */
1135 }
1136
vhost_log_stop(MemoryListener * listener,MemoryRegionSection * section,int old,int new)1137 static void vhost_log_stop(MemoryListener *listener,
1138 MemoryRegionSection *section,
1139 int old, int new)
1140 {
1141 /* FIXME: implement */
1142 }
1143
1144 /* The vhost driver natively knows how to handle the vrings of non
1145 * cross-endian legacy devices and modern devices. Only legacy devices
1146 * exposed to a bi-endian guest may require the vhost driver to use a
1147 * specific endianness.
1148 */
vhost_needs_vring_endian(VirtIODevice * vdev)1149 static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
1150 {
1151 if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
1152 return false;
1153 }
1154 #if HOST_BIG_ENDIAN
1155 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
1156 #else
1157 return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
1158 #endif
1159 }
1160
vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev * dev,bool is_big_endian,int vhost_vq_index)1161 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
1162 bool is_big_endian,
1163 int vhost_vq_index)
1164 {
1165 int r;
1166 struct vhost_vring_state s = {
1167 .index = vhost_vq_index,
1168 .num = is_big_endian
1169 };
1170
1171 r = dev->vhost_ops->vhost_set_vring_endian(dev, &s);
1172 if (r < 0) {
1173 VHOST_OPS_DEBUG(r, "vhost_set_vring_endian failed");
1174 }
1175 return r;
1176 }
1177
vhost_memory_region_lookup(struct vhost_dev * hdev,uint64_t gpa,uint64_t * uaddr,uint64_t * len)1178 static int vhost_memory_region_lookup(struct vhost_dev *hdev,
1179 uint64_t gpa, uint64_t *uaddr,
1180 uint64_t *len)
1181 {
1182 int i;
1183
1184 for (i = 0; i < hdev->mem->nregions; i++) {
1185 struct vhost_memory_region *reg = hdev->mem->regions + i;
1186
1187 if (gpa >= reg->guest_phys_addr &&
1188 reg->guest_phys_addr + reg->memory_size > gpa) {
1189 *uaddr = reg->userspace_addr + gpa - reg->guest_phys_addr;
1190 *len = reg->guest_phys_addr + reg->memory_size - gpa;
1191 return 0;
1192 }
1193 }
1194
1195 return -EFAULT;
1196 }
1197
vhost_device_iotlb_miss(struct vhost_dev * dev,uint64_t iova,int write)1198 int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
1199 {
1200 IOMMUTLBEntry iotlb;
1201 uint64_t uaddr, len;
1202 int ret = -EFAULT;
1203
1204 RCU_READ_LOCK_GUARD();
1205
1206 trace_vhost_iotlb_miss(dev, 1);
1207
1208 iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
1209 iova, write,
1210 MEMTXATTRS_UNSPECIFIED);
1211 if (iotlb.target_as != NULL) {
1212 ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
1213 &uaddr, &len);
1214 if (ret) {
1215 trace_vhost_iotlb_miss(dev, 3);
1216 error_report("Fail to lookup the translated address "
1217 "%"PRIx64, iotlb.translated_addr);
1218 goto out;
1219 }
1220
1221 len = MIN(iotlb.addr_mask + 1, len);
1222 iova = iova & ~iotlb.addr_mask;
1223
1224 ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
1225 len, iotlb.perm);
1226 if (ret) {
1227 trace_vhost_iotlb_miss(dev, 4);
1228 error_report("Fail to update device iotlb");
1229 goto out;
1230 }
1231 }
1232
1233 trace_vhost_iotlb_miss(dev, 2);
1234
1235 out:
1236 return ret;
1237 }
1238
vhost_virtqueue_start(struct vhost_dev * dev,struct VirtIODevice * vdev,struct vhost_virtqueue * vq,unsigned idx)1239 int vhost_virtqueue_start(struct vhost_dev *dev,
1240 struct VirtIODevice *vdev,
1241 struct vhost_virtqueue *vq,
1242 unsigned idx)
1243 {
1244 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1245 VirtioBusState *vbus = VIRTIO_BUS(qbus);
1246 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
1247 hwaddr s, l, a;
1248 int r;
1249 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1250 struct vhost_vring_file file = {
1251 .index = vhost_vq_index
1252 };
1253 struct vhost_vring_state state = {
1254 .index = vhost_vq_index
1255 };
1256 struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
1257
1258 a = virtio_queue_get_desc_addr(vdev, idx);
1259 if (a == 0) {
1260 /* Queue might not be ready for start */
1261 return 0;
1262 }
1263
1264 vq->num = state.num = virtio_queue_get_num(vdev, idx);
1265 r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
1266 if (r) {
1267 VHOST_OPS_DEBUG(r, "vhost_set_vring_num failed");
1268 return r;
1269 }
1270
1271 state.num = virtio_queue_get_last_avail_idx(vdev, idx);
1272 r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
1273 if (r) {
1274 VHOST_OPS_DEBUG(r, "vhost_set_vring_base failed");
1275 return r;
1276 }
1277
1278 if (vhost_needs_vring_endian(vdev)) {
1279 r = vhost_virtqueue_set_vring_endian_legacy(dev,
1280 virtio_is_big_endian(vdev),
1281 vhost_vq_index);
1282 if (r) {
1283 return r;
1284 }
1285 }
1286
1287 vq->desc_size = s = l = virtio_queue_get_desc_size(vdev, idx);
1288 vq->desc_phys = a;
1289 vq->desc = vhost_memory_map(dev, a, &l, false);
1290 if (!vq->desc || l != s) {
1291 r = -ENOMEM;
1292 goto fail_alloc_desc;
1293 }
1294 vq->avail_size = s = l = virtio_queue_get_avail_size(vdev, idx);
1295 vq->avail_phys = a = virtio_queue_get_avail_addr(vdev, idx);
1296 vq->avail = vhost_memory_map(dev, a, &l, false);
1297 if (!vq->avail || l != s) {
1298 r = -ENOMEM;
1299 goto fail_alloc_avail;
1300 }
1301 vq->used_size = s = l = virtio_queue_get_used_size(vdev, idx);
1302 vq->used_phys = a = virtio_queue_get_used_addr(vdev, idx);
1303 vq->used = vhost_memory_map(dev, a, &l, true);
1304 if (!vq->used || l != s) {
1305 r = -ENOMEM;
1306 goto fail_alloc_used;
1307 }
1308
1309 r = vhost_virtqueue_set_addr(dev, vq, vhost_vq_index, dev->log_enabled);
1310 if (r < 0) {
1311 goto fail_alloc;
1312 }
1313
1314 file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
1315 r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
1316 if (r) {
1317 VHOST_OPS_DEBUG(r, "vhost_set_vring_kick failed");
1318 goto fail_kick;
1319 }
1320
1321 /* Clear and discard previous events if any. */
1322 event_notifier_test_and_clear(&vq->masked_notifier);
1323
1324 /* Init vring in unmasked state, unless guest_notifier_mask
1325 * will do it later.
1326 */
1327 if (!vdev->use_guest_notifier_mask) {
1328 /* TODO: check and handle errors. */
1329 vhost_virtqueue_mask(dev, vdev, idx, false);
1330 }
1331
1332 if (k->query_guest_notifiers &&
1333 k->query_guest_notifiers(qbus->parent) &&
1334 virtio_queue_vector(vdev, idx) == VIRTIO_NO_VECTOR) {
1335 file.fd = -1;
1336 r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1337 if (r) {
1338 goto fail_vector;
1339 }
1340 }
1341
1342 return 0;
1343
1344 fail_vector:
1345 fail_kick:
1346 fail_alloc:
1347 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1348 0, 0);
1349 fail_alloc_used:
1350 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1351 0, 0);
1352 fail_alloc_avail:
1353 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1354 0, 0);
1355 fail_alloc_desc:
1356 return r;
1357 }
1358
do_vhost_virtqueue_stop(struct vhost_dev * dev,struct VirtIODevice * vdev,struct vhost_virtqueue * vq,unsigned idx,bool force)1359 static int do_vhost_virtqueue_stop(struct vhost_dev *dev,
1360 struct VirtIODevice *vdev,
1361 struct vhost_virtqueue *vq,
1362 unsigned idx, bool force)
1363 {
1364 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
1365 struct vhost_vring_state state = {
1366 .index = vhost_vq_index,
1367 };
1368 int r = 0;
1369
1370 if (virtio_queue_get_desc_addr(vdev, idx) == 0) {
1371 /* Don't stop the virtqueue which might have not been started */
1372 return 0;
1373 }
1374
1375 if (!force) {
1376 r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
1377 if (r < 0) {
1378 VHOST_OPS_DEBUG(r, "vhost VQ %u ring restore failed: %d", idx, r);
1379 }
1380 }
1381
1382 if (r < 0 || force) {
1383 /* Connection to the backend is broken, so let's sync internal
1384 * last avail idx to the device used idx.
1385 */
1386 virtio_queue_restore_last_avail_idx(vdev, idx);
1387 } else {
1388 virtio_queue_set_last_avail_idx(vdev, idx, state.num);
1389 }
1390 virtio_queue_invalidate_signalled_used(vdev, idx);
1391 virtio_queue_update_used_idx(vdev, idx);
1392
1393 /* In the cross-endian case, we need to reset the vring endianness to
1394 * native as legacy devices expect so by default.
1395 */
1396 if (vhost_needs_vring_endian(vdev)) {
1397 vhost_virtqueue_set_vring_endian_legacy(dev,
1398 !virtio_is_big_endian(vdev),
1399 vhost_vq_index);
1400 }
1401
1402 vhost_memory_unmap(dev, vq->used, virtio_queue_get_used_size(vdev, idx),
1403 1, virtio_queue_get_used_size(vdev, idx));
1404 vhost_memory_unmap(dev, vq->avail, virtio_queue_get_avail_size(vdev, idx),
1405 0, virtio_queue_get_avail_size(vdev, idx));
1406 vhost_memory_unmap(dev, vq->desc, virtio_queue_get_desc_size(vdev, idx),
1407 0, virtio_queue_get_desc_size(vdev, idx));
1408 return r;
1409 }
1410
vhost_virtqueue_stop(struct vhost_dev * dev,struct VirtIODevice * vdev,struct vhost_virtqueue * vq,unsigned idx)1411 int vhost_virtqueue_stop(struct vhost_dev *dev,
1412 struct VirtIODevice *vdev,
1413 struct vhost_virtqueue *vq,
1414 unsigned idx)
1415 {
1416 return do_vhost_virtqueue_stop(dev, vdev, vq, idx, false);
1417 }
1418
vhost_virtqueue_set_busyloop_timeout(struct vhost_dev * dev,int n,uint32_t timeout)1419 static int vhost_virtqueue_set_busyloop_timeout(struct vhost_dev *dev,
1420 int n, uint32_t timeout)
1421 {
1422 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1423 struct vhost_vring_state state = {
1424 .index = vhost_vq_index,
1425 .num = timeout,
1426 };
1427 int r;
1428
1429 if (!dev->vhost_ops->vhost_set_vring_busyloop_timeout) {
1430 return -EINVAL;
1431 }
1432
1433 r = dev->vhost_ops->vhost_set_vring_busyloop_timeout(dev, &state);
1434 if (r) {
1435 VHOST_OPS_DEBUG(r, "vhost_set_vring_busyloop_timeout failed");
1436 return r;
1437 }
1438
1439 return 0;
1440 }
1441
vhost_virtqueue_error_notifier(EventNotifier * n)1442 static void vhost_virtqueue_error_notifier(EventNotifier *n)
1443 {
1444 struct vhost_virtqueue *vq = container_of(n, struct vhost_virtqueue,
1445 error_notifier);
1446 struct vhost_dev *dev = vq->dev;
1447 int index = vq - dev->vqs;
1448
1449 if (event_notifier_test_and_clear(n) && dev->vdev) {
1450 VHOST_OPS_DEBUG(-EINVAL, "vhost vring error in virtqueue %d",
1451 dev->vq_index + index);
1452 }
1453 }
1454
vhost_virtqueue_init(struct vhost_dev * dev,struct vhost_virtqueue * vq,int n)1455 static int vhost_virtqueue_init(struct vhost_dev *dev,
1456 struct vhost_virtqueue *vq, int n)
1457 {
1458 int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
1459 struct vhost_vring_file file = {
1460 .index = vhost_vq_index,
1461 };
1462 int r = event_notifier_init(&vq->masked_notifier, 0);
1463 if (r < 0) {
1464 return r;
1465 }
1466
1467 file.fd = event_notifier_get_wfd(&vq->masked_notifier);
1468 r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
1469 if (r) {
1470 VHOST_OPS_DEBUG(r, "vhost_set_vring_call failed");
1471 goto fail_call;
1472 }
1473
1474 vq->dev = dev;
1475
1476 if (dev->vhost_ops->vhost_set_vring_err) {
1477 r = event_notifier_init(&vq->error_notifier, 0);
1478 if (r < 0) {
1479 goto fail_call;
1480 }
1481
1482 file.fd = event_notifier_get_fd(&vq->error_notifier);
1483 r = dev->vhost_ops->vhost_set_vring_err(dev, &file);
1484 if (r) {
1485 VHOST_OPS_DEBUG(r, "vhost_set_vring_err failed");
1486 goto fail_err;
1487 }
1488
1489 event_notifier_set_handler(&vq->error_notifier,
1490 vhost_virtqueue_error_notifier);
1491 }
1492
1493 return 0;
1494
1495 fail_err:
1496 event_notifier_cleanup(&vq->error_notifier);
1497 fail_call:
1498 event_notifier_cleanup(&vq->masked_notifier);
1499 return r;
1500 }
1501
vhost_virtqueue_cleanup(struct vhost_virtqueue * vq)1502 static void vhost_virtqueue_cleanup(struct vhost_virtqueue *vq)
1503 {
1504 event_notifier_cleanup(&vq->masked_notifier);
1505 if (vq->dev->vhost_ops->vhost_set_vring_err) {
1506 event_notifier_set_handler(&vq->error_notifier, NULL);
1507 event_notifier_cleanup(&vq->error_notifier);
1508 }
1509 }
1510
vhost_dev_init(struct vhost_dev * hdev,void * opaque,VhostBackendType backend_type,uint32_t busyloop_timeout,Error ** errp)1511 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
1512 VhostBackendType backend_type, uint32_t busyloop_timeout,
1513 Error **errp)
1514 {
1515 unsigned int used, reserved, limit;
1516 uint64_t features;
1517 int i, r, n_initialized_vqs = 0;
1518
1519 hdev->vdev = NULL;
1520 hdev->migration_blocker = NULL;
1521
1522 r = vhost_set_backend_type(hdev, backend_type);
1523 assert(r >= 0);
1524
1525 r = hdev->vhost_ops->vhost_backend_init(hdev, opaque, errp);
1526 if (r < 0) {
1527 goto fail;
1528 }
1529
1530 r = hdev->vhost_ops->vhost_set_owner(hdev);
1531 if (r < 0) {
1532 error_setg_errno(errp, -r, "vhost_set_owner failed");
1533 goto fail;
1534 }
1535
1536 r = hdev->vhost_ops->vhost_get_features(hdev, &features);
1537 if (r < 0) {
1538 error_setg_errno(errp, -r, "vhost_get_features failed");
1539 goto fail;
1540 }
1541
1542 limit = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
1543 if (limit < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS &&
1544 memory_devices_memslot_auto_decision_active()) {
1545 error_setg(errp, "some memory device (like virtio-mem)"
1546 " decided how many memory slots to use based on the overall"
1547 " number of memory slots; this vhost backend would further"
1548 " restricts the overall number of memory slots");
1549 error_append_hint(errp, "Try plugging this vhost backend before"
1550 " plugging such memory devices.\n");
1551 r = -EINVAL;
1552 goto fail;
1553 }
1554
1555 for (i = 0; i < hdev->nvqs; ++i, ++n_initialized_vqs) {
1556 r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
1557 if (r < 0) {
1558 error_setg_errno(errp, -r, "Failed to initialize virtqueue %d", i);
1559 goto fail;
1560 }
1561 }
1562
1563 if (busyloop_timeout) {
1564 for (i = 0; i < hdev->nvqs; ++i) {
1565 r = vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i,
1566 busyloop_timeout);
1567 if (r < 0) {
1568 error_setg_errno(errp, -r, "Failed to set busyloop timeout");
1569 goto fail_busyloop;
1570 }
1571 }
1572 }
1573
1574 hdev->features = features;
1575
1576 hdev->memory_listener = (MemoryListener) {
1577 .name = "vhost",
1578 .begin = vhost_begin,
1579 .commit = vhost_commit,
1580 .region_add = vhost_region_addnop,
1581 .region_nop = vhost_region_addnop,
1582 .log_start = vhost_log_start,
1583 .log_stop = vhost_log_stop,
1584 .log_sync = vhost_log_sync,
1585 .log_global_start = vhost_log_global_start,
1586 .log_global_stop = vhost_log_global_stop,
1587 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND
1588 };
1589
1590 hdev->iommu_listener = (MemoryListener) {
1591 .name = "vhost-iommu",
1592 .region_add = vhost_iommu_region_add,
1593 .region_del = vhost_iommu_region_del,
1594 };
1595
1596 if (hdev->migration_blocker == NULL) {
1597 if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
1598 error_setg(&hdev->migration_blocker,
1599 "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
1600 } else if (vhost_dev_log_is_shared(hdev) && !qemu_memfd_alloc_check()) {
1601 error_setg(&hdev->migration_blocker,
1602 "Migration disabled: failed to allocate shared memory");
1603 }
1604 }
1605
1606 if (hdev->migration_blocker != NULL) {
1607 r = migrate_add_blocker_normal(&hdev->migration_blocker, errp);
1608 if (r < 0) {
1609 goto fail_busyloop;
1610 }
1611 }
1612
1613 hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
1614 hdev->n_mem_sections = 0;
1615 hdev->mem_sections = NULL;
1616 hdev->log = NULL;
1617 hdev->log_size = 0;
1618 hdev->log_enabled = false;
1619 hdev->started = false;
1620 memory_listener_register(&hdev->memory_listener, &address_space_memory);
1621 QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
1622
1623 /*
1624 * The listener we registered properly setup the number of required
1625 * memslots in vhost_commit().
1626 */
1627 used = hdev->mem->nregions;
1628
1629 /*
1630 * We assume that all reserved memslots actually require a real memslot
1631 * in our vhost backend. This might not be true, for example, if the
1632 * memslot would be ROM. If ever relevant, we can optimize for that --
1633 * but we'll need additional information about the reservations.
1634 */
1635 reserved = memory_devices_get_reserved_memslots();
1636 if (used + reserved > limit) {
1637 error_setg(errp, "vhost backend memory slots limit (%d) is less"
1638 " than current number of used (%d) and reserved (%d)"
1639 " memory slots for memory devices.", limit, used, reserved);
1640 r = -EINVAL;
1641 goto fail_busyloop;
1642 }
1643
1644 return 0;
1645
1646 fail_busyloop:
1647 if (busyloop_timeout) {
1648 while (--i >= 0) {
1649 vhost_virtqueue_set_busyloop_timeout(hdev, hdev->vq_index + i, 0);
1650 }
1651 }
1652 fail:
1653 hdev->nvqs = n_initialized_vqs;
1654 vhost_dev_cleanup(hdev);
1655 return r;
1656 }
1657
vhost_dev_cleanup(struct vhost_dev * hdev)1658 void vhost_dev_cleanup(struct vhost_dev *hdev)
1659 {
1660 int i;
1661
1662 trace_vhost_dev_cleanup(hdev);
1663
1664 for (i = 0; i < hdev->nvqs; ++i) {
1665 vhost_virtqueue_cleanup(hdev->vqs + i);
1666 }
1667 if (hdev->mem) {
1668 /* those are only safe after successful init */
1669 memory_listener_unregister(&hdev->memory_listener);
1670 QLIST_REMOVE(hdev, entry);
1671 }
1672 migrate_del_blocker(&hdev->migration_blocker);
1673 g_free(hdev->mem);
1674 g_free(hdev->mem_sections);
1675 if (hdev->vhost_ops) {
1676 hdev->vhost_ops->vhost_backend_cleanup(hdev);
1677 }
1678 assert(!hdev->log);
1679
1680 memset(hdev, 0, sizeof(struct vhost_dev));
1681 }
1682
vhost_dev_disable_notifiers_nvqs(struct vhost_dev * hdev,VirtIODevice * vdev,unsigned int nvqs)1683 void vhost_dev_disable_notifiers_nvqs(struct vhost_dev *hdev,
1684 VirtIODevice *vdev,
1685 unsigned int nvqs)
1686 {
1687 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1688 int i, r;
1689
1690 /*
1691 * Batch all the host notifiers in a single transaction to avoid
1692 * quadratic time complexity in address_space_update_ioeventfds().
1693 */
1694 memory_region_transaction_begin();
1695
1696 for (i = 0; i < nvqs; ++i) {
1697 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1698 false);
1699 if (r < 0) {
1700 error_report("vhost VQ %d notifier cleanup failed: %d", i, -r);
1701 }
1702 assert(r >= 0);
1703 }
1704
1705 /*
1706 * The transaction expects the ioeventfds to be open when it
1707 * commits. Do it now, before the cleanup loop.
1708 */
1709 memory_region_transaction_commit();
1710
1711 for (i = 0; i < nvqs; ++i) {
1712 virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i);
1713 }
1714 virtio_device_release_ioeventfd(vdev);
1715 }
1716
1717 /* Stop processing guest IO notifications in qemu.
1718 * Start processing them in vhost in kernel.
1719 */
vhost_dev_enable_notifiers(struct vhost_dev * hdev,VirtIODevice * vdev)1720 int vhost_dev_enable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
1721 {
1722 BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
1723 int i, r;
1724
1725 /* We will pass the notifiers to the kernel, make sure that QEMU
1726 * doesn't interfere.
1727 */
1728 r = virtio_device_grab_ioeventfd(vdev);
1729 if (r < 0) {
1730 error_report("binding does not support host notifiers");
1731 return r;
1732 }
1733
1734 /*
1735 * Batch all the host notifiers in a single transaction to avoid
1736 * quadratic time complexity in address_space_update_ioeventfds().
1737 */
1738 memory_region_transaction_begin();
1739
1740 for (i = 0; i < hdev->nvqs; ++i) {
1741 r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), hdev->vq_index + i,
1742 true);
1743 if (r < 0) {
1744 error_report("vhost VQ %d notifier binding failed: %d", i, -r);
1745 memory_region_transaction_commit();
1746 vhost_dev_disable_notifiers_nvqs(hdev, vdev, i);
1747 return r;
1748 }
1749 }
1750
1751 memory_region_transaction_commit();
1752
1753 return 0;
1754 }
1755
1756 /* Stop processing guest IO notifications in vhost.
1757 * Start processing them in qemu.
1758 * This might actually run the qemu handlers right away,
1759 * so virtio in qemu must be completely setup when this is called.
1760 */
void vhost_dev_disable_notifiers(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    vhost_dev_disable_notifiers_nvqs(hdev, vdev, hdev->nvqs);
}

/* Test and clear event pending status.
 * Should be called after unmask to avoid losing events.
 */
bool vhost_virtqueue_pending(struct vhost_dev *hdev, int n)
{
    struct vhost_virtqueue *vq = hdev->vqs + n - hdev->vq_index;
    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
    return event_notifier_test_and_clear(&vq->masked_notifier);
}

/* Mask/unmask events from this vq. */
void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
                          bool mask)
{
    struct VirtQueue *vvq = virtio_get_queue(vdev, n);
    int r, index = n - hdev->vq_index;
    struct vhost_vring_file file;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        file.fd = event_notifier_get_wfd(&hdev->vqs[index].masked_notifier);
    } else {
        file.fd = event_notifier_get_wfd(virtio_queue_get_guest_notifier(vvq));
    }

    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
    if (r < 0) {
        error_report("vhost_set_vring_call failed %d", -r);
    }
}

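/*
 * Test and clear the pending status of the masked config interrupt
 * notifier. Returns false when the device is not started or the backend
 * does not support config interrupts.
 */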
bool vhost_config_pending(struct vhost_dev *hdev)
{
    assert(hdev->vhost_ops);
    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return false;
    }

    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    return event_notifier_test_and_clear(notifier);
}

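/*
 * Mask/unmask config change interrupts from this device: point the
 * backend's config call at either the internal masked notifier or the
 * guest-visible config notifier.
 */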
void vhost_config_mask(struct vhost_dev *hdev, VirtIODevice *vdev, bool mask)
{
    int fd;
    int r;
    EventNotifier *notifier =
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier;
    EventNotifier *config_notifier = &vdev->config_notifier;
    assert(hdev->vhost_ops);

    if ((hdev->started == false) ||
        (hdev->vhost_ops->vhost_set_config_call == NULL)) {
        return;
    }
    if (mask) {
        assert(vdev->use_guest_notifier_mask);
        fd = event_notifier_get_fd(notifier);
    } else {
        fd = event_notifier_get_fd(config_notifier);
    }
    r = hdev->vhost_ops->vhost_set_config_call(hdev, fd);
    if (r < 0) {
        error_report("vhost_set_config_call failed %d", -r);
    }
}

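/* Stop the backend from signalling config interrupts by passing an invalid fd (-1). */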
static void vhost_stop_config_intr(struct vhost_dev *dev)
{
    int fd = -1;
    assert(dev->vhost_ops);
    if (dev->vhost_ops->vhost_set_config_call) {
        dev->vhost_ops->vhost_set_config_call(dev, fd);
    }
}

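/*
 * Route config interrupts to the device's config notifier again and kick
 * it once so that a change which happened while stopped is not lost.
 */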
static void vhost_start_config_intr(struct vhost_dev *dev)
{
    int r;

    assert(dev->vhost_ops);
    int fd = event_notifier_get_fd(&dev->vdev->config_notifier);
    if (dev->vhost_ops->vhost_set_config_call) {
        r = dev->vhost_ops->vhost_set_config_call(dev, fd);
        if (!r) {
            event_notifier_set(&dev->vdev->config_notifier);
        }
    }
}

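/*
 * Mask @features down to the bits listed in @feature_bits that the backend
 * actually supports (hdev->features).
 */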
uint64_t vhost_get_features(struct vhost_dev *hdev, const int *feature_bits,
                            uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (!(hdev->features & bit_mask)) {
            features &= ~bit_mask;
        }
        bit++;
    }
    return features;
}

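/*
 * Record the bits of @features listed in @feature_bits as acknowledged by
 * the guest, so they can later be handed to the backend.
 */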
void vhost_ack_features(struct vhost_dev *hdev, const int *feature_bits,
                        uint64_t features)
{
    const int *bit = feature_bits;
    while (*bit != VHOST_INVALID_FEATURE_BIT) {
        uint64_t bit_mask = (1ULL << *bit);
        if (features & bit_mask) {
            hdev->acked_features |= bit_mask;
        }
        bit++;
    }
}

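/*
 * Read the device config space from the backend. Returns -ENOSYS if the
 * backend does not implement vhost_get_config.
 */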
int vhost_dev_get_config(struct vhost_dev *hdev, uint8_t *config,
                         uint32_t config_len, Error **errp)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_get_config) {
        return hdev->vhost_ops->vhost_get_config(hdev, config, config_len,
                                                 errp);
    }

    error_setg(errp, "vhost_get_config not implemented");
    return -ENOSYS;
}

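/*
 * Write @size bytes at @offset into the device config space. Returns
 * -ENOSYS if the backend does not implement vhost_set_config.
 */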
int vhost_dev_set_config(struct vhost_dev *hdev, const uint8_t *data,
                         uint32_t offset, uint32_t size, uint32_t flags)
{
    assert(hdev->vhost_ops);

    if (hdev->vhost_ops->vhost_set_config) {
        return hdev->vhost_ops->vhost_set_config(hdev, data, offset,
                                                 size, flags);
    }

    return -ENOSYS;
}

void vhost_dev_set_config_notifier(struct vhost_dev *hdev,
                                   const VhostDevConfigOps *ops)
{
    hdev->config_ops = ops;
}

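/* Release the memfd-backed inflight buffer, if one was allocated. */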
void vhost_dev_free_inflight(struct vhost_inflight *inflight)
{
    if (inflight && inflight->addr) {
        qemu_memfd_free(inflight->addr, inflight->size, inflight->fd);
        inflight->addr = NULL;
        inflight->fd = -1;
    }
}

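/*
 * Prepare the device for inflight I/O tracking: re-negotiate features with
 * the backend before the inflight fd is exchanged. A no-op for backends
 * without inflight fd support.
 */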
int vhost_dev_prepare_inflight(struct vhost_dev *hdev, VirtIODevice *vdev)
{
    int r;

    if (hdev->vhost_ops->vhost_get_inflight_fd == NULL ||
        hdev->vhost_ops->vhost_set_inflight_fd == NULL) {
        return 0;
    }

    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_dev_prepare_inflight failed");
        return r;
    }

    return 0;
}

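/* Hand a previously saved inflight buffer back to the backend. */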
int vhost_dev_set_inflight(struct vhost_dev *dev,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_set_inflight_fd && inflight->addr) {
        r = dev->vhost_ops->vhost_set_inflight_fd(dev, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_set_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

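/* Ask the backend for an inflight buffer sized for @queue_size entries. */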
int vhost_dev_get_inflight(struct vhost_dev *dev, uint16_t queue_size,
                           struct vhost_inflight *inflight)
{
    int r;

    if (dev->vhost_ops->vhost_get_inflight_fd) {
        r = dev->vhost_ops->vhost_get_inflight_fd(dev, queue_size, inflight);
        if (r) {
            VHOST_OPS_DEBUG(r, "vhost_get_inflight_fd failed");
            return r;
        }
    }

    return 0;
}

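/*
 * Enable/disable all vrings through the backend, when the backend supports
 * it and the rings do not already start out enabled.
 */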
static int vhost_dev_set_vring_enable(struct vhost_dev *hdev, int enable)
{
    if (!hdev->vhost_ops->vhost_set_vring_enable) {
        return 0;
    }

    /*
     * For vhost-user devices, if VHOST_USER_F_PROTOCOL_FEATURES has not
     * been negotiated, the rings start directly in the enabled state, and
     * the .vhost_set_vring_enable callback will fail since
     * VHOST_USER_SET_VRING_ENABLE is not supported.
     */
    if (hdev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER &&
        !virtio_has_feature(hdev->backend_features,
                            VHOST_USER_F_PROTOCOL_FEATURES)) {
        return 0;
    }

    return hdev->vhost_ops->vhost_set_vring_enable(hdev, enable);
}

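/*
 * A sketch of the typical start/stop ordering as used by existing device
 * models (not a strict contract):
 *
 *     vhost_dev_enable_notifiers(hdev, vdev);
 *     vhost_dev_start(hdev, vdev, true);
 *     ...
 *     vhost_dev_stop(hdev, vdev, true);
 *     vhost_dev_disable_notifiers(hdev, vdev);
 */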
/*
 * Host notifiers must be enabled at this point.
 *
 * If @vrings is true, this function will enable all vrings before starting the
 * device. If it is false, the vring initialization is left to be done by the
 * caller.
 */
int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    int i, r;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);

    trace_vhost_dev_start(hdev, vdev->name, vrings);

    vdev->vhost_started = true;
    hdev->started = true;
    hdev->vdev = vdev;

    r = vhost_dev_set_features(hdev, hdev->log_enabled);
    if (r < 0) {
        goto fail_features;
    }

    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_register(&hdev->iommu_listener, vdev->dma_as);
    }

    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "vhost_set_mem_table failed");
        goto fail_mem;
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        r = vhost_virtqueue_start(hdev,
                                  vdev,
                                  hdev->vqs + i,
                                  hdev->vq_index + i);
        if (r < 0) {
            goto fail_vq;
        }
    }

    r = event_notifier_init(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier, 0);
    if (r < 0) {
        VHOST_OPS_DEBUG(r, "event_notifier_init failed");
        goto fail_vq;
    }
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    if (!vdev->use_guest_notifier_mask) {
        vhost_config_mask(hdev, vdev, true);
    }
    if (hdev->log_enabled) {
        uint64_t log_base;

        hdev->log_size = vhost_get_log_size(hdev);
        hdev->log = vhost_log_get(hdev->vhost_ops->backend_type,
                                  hdev->log_size,
                                  vhost_dev_log_is_shared(hdev));
        log_base = (uintptr_t)hdev->log->log;
        r = hdev->vhost_ops->vhost_set_log_base(hdev,
                                                hdev->log_size ? log_base : 0,
                                                hdev->log);
        if (r < 0) {
            VHOST_OPS_DEBUG(r, "vhost_set_log_base failed");
            goto fail_log;
        }
        vhost_dev_elect_mem_logger(hdev, true);
    }
    if (vrings) {
        r = vhost_dev_set_vring_enable(hdev, true);
        if (r) {
            goto fail_log;
        }
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        r = hdev->vhost_ops->vhost_dev_start(hdev, true);
        if (r) {
            goto fail_start;
        }
    }
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, true);

        /* Update used ring information for IOTLB to work correctly;
         * the vhost-kernel code requires this. */
        for (i = 0; i < hdev->nvqs; ++i) {
            struct vhost_virtqueue *vq = hdev->vqs + i;
            r = vhost_device_iotlb_miss(hdev, vq->used_phys, true);
            if (r) {
                goto fail_iotlb;
            }
        }
    }
    vhost_start_config_intr(hdev);
    return 0;
fail_iotlb:
    if (vhost_dev_has_iommu(hdev) &&
        hdev->vhost_ops->vhost_set_iotlb_callback) {
        hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
    }
    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
fail_start:
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
fail_log:
    vhost_log_put(hdev, false);
fail_vq:
    while (--i >= 0) {
        vhost_virtqueue_stop(hdev,
                             vdev,
                             hdev->vqs + i,
                             hdev->vq_index + i);
    }

fail_mem:
    if (vhost_dev_has_iommu(hdev)) {
        memory_listener_unregister(&hdev->iommu_listener);
    }
fail_features:
    vdev->vhost_started = false;
    hdev->started = false;
    return r;
}

/* Host notifiers must be enabled at this point. */
static int do_vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                             bool vrings, bool force)
{
    int i;
    int rc = 0;

    /* should only be called after backend is connected */
    assert(hdev->vhost_ops);
    event_notifier_test_and_clear(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);
    event_notifier_test_and_clear(&vdev->config_notifier);
    event_notifier_cleanup(
        &hdev->vqs[VHOST_QUEUE_NUM_CONFIG_INR].masked_config_notifier);

    trace_vhost_dev_stop(hdev, vdev->name, vrings);

    if (hdev->vhost_ops->vhost_dev_start) {
        hdev->vhost_ops->vhost_dev_start(hdev, false);
    }
    if (vrings) {
        vhost_dev_set_vring_enable(hdev, false);
    }
    for (i = 0; i < hdev->nvqs; ++i) {
        rc |= do_vhost_virtqueue_stop(hdev,
                                      vdev,
                                      hdev->vqs + i,
                                      hdev->vq_index + i,
                                      force);
    }
    if (hdev->vhost_ops->vhost_reset_status) {
        hdev->vhost_ops->vhost_reset_status(hdev);
    }

    if (vhost_dev_has_iommu(hdev)) {
        if (hdev->vhost_ops->vhost_set_iotlb_callback) {
            hdev->vhost_ops->vhost_set_iotlb_callback(hdev, false);
        }
        memory_listener_unregister(&hdev->iommu_listener);
    }
    vhost_stop_config_intr(hdev);
    vhost_log_put(hdev, true);
    hdev->started = false;
    vdev->vhost_started = false;
    hdev->vdev = NULL;
    return rc;
}

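/*
 * Stop the device. Host notifiers must still be enabled when this is
 * called.
 */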
int vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev, bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, false);
}

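/*
 * Variant of vhost_dev_stop() that tears the virtqueues down even if the
 * backend cannot stop them cleanly.
 */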
int vhost_dev_force_stop(struct vhost_dev *hdev, VirtIODevice *vdev,
                         bool vrings)
{
    return do_vhost_dev_stop(hdev, vdev, vrings, true);
}

int vhost_net_set_backend(struct vhost_dev *hdev,
                          struct vhost_vring_file *file)
{
    if (hdev->vhost_ops->vhost_net_set_backend) {
        return hdev->vhost_ops->vhost_net_set_backend(hdev, file);
    }

    return -ENOSYS;
}

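/* Reset the backend device. Returns -ENOSYS if the backend has no reset op. */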
int vhost_reset_device(struct vhost_dev *hdev)
{
    if (hdev->vhost_ops->vhost_reset_device) {
        return hdev->vhost_ops->vhost_reset_device(hdev);
    }

    return -ENOSYS;
}

bool vhost_supports_device_state(struct vhost_dev *dev)
{
    if (dev->vhost_ops->vhost_supports_device_state) {
        return dev->vhost_ops->vhost_supports_device_state(dev);
    }

    return false;
}

int vhost_set_device_state_fd(struct vhost_dev *dev,
                              VhostDeviceStateDirection direction,
                              VhostDeviceStatePhase phase,
                              int fd,
                              int *reply_fd,
                              Error **errp)
{
    if (dev->vhost_ops->vhost_set_device_state_fd) {
        return dev->vhost_ops->vhost_set_device_state_fd(dev, direction, phase,
                                                         fd, reply_fd, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

int vhost_check_device_state(struct vhost_dev *dev, Error **errp)
{
    if (dev->vhost_ops->vhost_check_device_state) {
        return dev->vhost_ops->vhost_check_device_state(dev, errp);
    }

    error_setg(errp,
               "vhost transport does not support migration state transfer");
    return -ENOSYS;
}

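/*
 * Read the back-end's internal state through a pipe and write it to the
 * migration stream @f as a sequence of length-prefixed chunks terminated
 * by a zero-length chunk.
 */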
int vhost_save_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    /* Maximum chunk size in which to transfer the state */
    const size_t chunk_size = 1 * 1024 * 1024;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (our end), [1] for writing (back-end's end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of write_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_SAVE,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    write_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(read_fd);
        read_fd = reply_fd;
    }

    transfer_buf = g_malloc(chunk_size);

    while (true) {
        ssize_t read_ret;

        read_ret = RETRY_ON_EINTR(read(read_fd, transfer_buf, chunk_size));
        if (read_ret < 0) {
            ret = -errno;
            error_setg_errno(errp, -ret, "Failed to receive state");
            goto fail;
        }

        assert(read_ret <= chunk_size);
        qemu_put_be32(f, read_ret);

        if (read_ret == 0) {
            /* EOF */
            break;
        }

        qemu_put_buffer(f, transfer_buf, read_ret);
    }

    /*
     * The back-end will not really care, but be clean and close our end of
     * the pipe before asking the back-end whether the transfer was
     * successful.
     */
    close(read_fd);
    read_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (read_fd >= 0) {
        close(read_fd);
    }

    return ret;
}

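/*
 * Counterpart of vhost_save_backend_state(): read the length-prefixed
 * chunks from the migration stream @f and feed them to the back-end
 * through a pipe until the zero-length terminator is reached.
 */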
int vhost_load_backend_state(struct vhost_dev *dev, QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    size_t transfer_buf_size = 0;
    g_autofree void *transfer_buf = NULL;
    g_autoptr(GError) g_err = NULL;
    int pipe_fds[2], read_fd = -1, write_fd = -1, reply_fd = -1;
    int ret;

    /* [0] for reading (back-end's end), [1] for writing (our end) */
    if (!g_unix_open_pipe(pipe_fds, FD_CLOEXEC, &g_err)) {
        error_setg(errp, "Failed to set up state transfer pipe: %s",
                   g_err->message);
        ret = -EINVAL;
        goto fail;
    }

    read_fd = pipe_fds[0];
    write_fd = pipe_fds[1];

    /*
     * VHOST_TRANSFER_STATE_PHASE_STOPPED means the device must be stopped.
     * Ideally, it is suspended, but SUSPEND/RESUME currently do not exist for
     * vhost-user, so just check that it is stopped at all.
     */
    assert(!dev->started);

    /* Transfer ownership of read_fd to the back-end */
    ret = vhost_set_device_state_fd(dev,
                                    VHOST_TRANSFER_STATE_DIRECTION_LOAD,
                                    VHOST_TRANSFER_STATE_PHASE_STOPPED,
                                    read_fd,
                                    &reply_fd,
                                    errp);
    if (ret < 0) {
        error_prepend(errp, "Failed to initiate state transfer: ");
        goto fail;
    }

    /* If the back-end wishes to use a different pipe, switch over */
    if (reply_fd >= 0) {
        close(write_fd);
        write_fd = reply_fd;
    }

    while (true) {
        size_t this_chunk_size = qemu_get_be32(f);
        ssize_t write_ret;
        const uint8_t *transfer_pointer;

        if (this_chunk_size == 0) {
            /* End of state */
            break;
        }

        if (transfer_buf_size < this_chunk_size) {
            transfer_buf = g_realloc(transfer_buf, this_chunk_size);
            transfer_buf_size = this_chunk_size;
        }

        if (qemu_get_buffer(f, transfer_buf, this_chunk_size) <
            this_chunk_size)
        {
            error_setg(errp, "Failed to read state");
            ret = -EINVAL;
            goto fail;
        }

        transfer_pointer = transfer_buf;
        while (this_chunk_size > 0) {
            write_ret = RETRY_ON_EINTR(
                write(write_fd, transfer_pointer, this_chunk_size)
            );
            if (write_ret < 0) {
                ret = -errno;
                error_setg_errno(errp, -ret, "Failed to send state");
                goto fail;
            } else if (write_ret == 0) {
                error_setg(errp, "Failed to send state: Connection is closed");
                ret = -ECONNRESET;
                goto fail;
            }

            assert(write_ret <= this_chunk_size);
            this_chunk_size -= write_ret;
            transfer_pointer += write_ret;
        }
    }

    /*
     * Close our end, thus ending the transfer, before asking the back-end
     * whether the transfer was successful.
     */
    close(write_fd);
    write_fd = -1;

    /* Also, verify that the device is still stopped */
    assert(!dev->started);

    ret = vhost_check_device_state(dev, errp);
    if (ret < 0) {
        goto fail;
    }

    ret = 0;
fail:
    if (write_fd >= 0) {
        close(write_fd);
    }

    return ret;
}
