xref: /openbmc/qemu/hw/virtio/vhost-vdpa.c (revision a6caeee8)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "qemu/cutils.h"
24 #include "qemu/main-loop.h"
25 #include "cpu.h"
26 #include "trace.h"
27 #include "qapi/error.h"
28 
29 /*
30  * Return one past the end of the end of section. Be careful with uint64_t
31  * conversions!
32  */
33 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
34 {
35     Int128 llend = int128_make64(section->offset_within_address_space);
36     llend = int128_add(llend, section->size);
37     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
38 
39     return llend;
40 }
41 
42 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
43                                                 uint64_t iova_min,
44                                                 uint64_t iova_max)
45 {
46     Int128 llend;
47 
48     if ((!memory_region_is_ram(section->mr) &&
49          !memory_region_is_iommu(section->mr)) ||
50         memory_region_is_protected(section->mr) ||
51         /* vhost-vDPA doesn't allow MMIO to be mapped  */
52         memory_region_is_ram_device(section->mr)) {
53         return true;
54     }
55 
56     if (section->offset_within_address_space < iova_min) {
57         error_report("RAM section out of device range (min=0x%" PRIx64
58                      ", addr=0x%" HWADDR_PRIx ")",
59                      iova_min, section->offset_within_address_space);
60         return true;
61     }
62 
63     llend = vhost_vdpa_section_end(section);
64     if (int128_gt(llend, int128_make64(iova_max))) {
65         error_report("RAM section out of device range (max=0x%" PRIx64
66                      ", end addr=0x%" PRIx64 ")",
67                      iova_max, int128_get64(llend));
68         return true;
69     }
70 
71     return false;
72 }
73 
74 static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
75                               void *vaddr, bool readonly)
76 {
77     struct vhost_msg_v2 msg = {};
78     int fd = v->device_fd;
79     int ret = 0;
80 
81     msg.type = v->msg_type;
82     msg.iotlb.iova = iova;
83     msg.iotlb.size = size;
84     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
85     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
86     msg.iotlb.type = VHOST_IOTLB_UPDATE;
87 
88    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
89                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
90 
91     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
92         error_report("failed to write, fd=%d, errno=%d (%s)",
93             fd, errno, strerror(errno));
94         return -EIO ;
95     }
96 
97     return ret;
98 }
99 
100 static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
101                                 hwaddr size)
102 {
103     struct vhost_msg_v2 msg = {};
104     int fd = v->device_fd;
105     int ret = 0;
106 
107     msg.type = v->msg_type;
108     msg.iotlb.iova = iova;
109     msg.iotlb.size = size;
110     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
111 
112     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
113                                msg.iotlb.size, msg.iotlb.type);
114 
115     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
116         error_report("failed to write, fd=%d, errno=%d (%s)",
117             fd, errno, strerror(errno));
118         return -EIO ;
119     }
120 
121     return ret;
122 }
123 
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
125 {
126     int fd = v->device_fd;
127     struct vhost_msg_v2 msg = {
128         .type = v->msg_type,
129         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
130     };
131 
132     trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
133     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
134         error_report("failed to write, fd=%d, errno=%d (%s)",
135                      fd, errno, strerror(errno));
136     }
137 }
138 
139 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
140 {
141     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
142         !v->iotlb_batch_begin_sent) {
143         vhost_vdpa_listener_begin_batch(v);
144     }
145 
146     v->iotlb_batch_begin_sent = true;
147 }
148 
149 static void vhost_vdpa_listener_commit(MemoryListener *listener)
150 {
151     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
152     struct vhost_dev *dev = v->dev;
153     struct vhost_msg_v2 msg = {};
154     int fd = v->device_fd;
155 
156     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
157         return;
158     }
159 
160     if (!v->iotlb_batch_begin_sent) {
161         return;
162     }
163 
164     msg.type = v->msg_type;
165     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
166 
167     trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
168     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
169         error_report("failed to write, fd=%d, errno=%d (%s)",
170                      fd, errno, strerror(errno));
171     }
172 
173     v->iotlb_batch_begin_sent = false;
174 }
175 
176 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
177                                            MemoryRegionSection *section)
178 {
179     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
180     hwaddr iova;
181     Int128 llend, llsize;
182     void *vaddr;
183     int ret;
184 
185     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
186                                             v->iova_range.last)) {
187         return;
188     }
189 
190     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
191                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
192         error_report("%s received unaligned region", __func__);
193         return;
194     }
195 
196     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
197     llend = vhost_vdpa_section_end(section);
198     if (int128_ge(int128_make64(iova), llend)) {
199         return;
200     }
201 
202     memory_region_ref(section->mr);
203 
204     /* Here we assume that memory_region_is_ram(section->mr)==true */
205 
206     vaddr = memory_region_get_ram_ptr(section->mr) +
207             section->offset_within_region +
208             (iova - section->offset_within_address_space);
209 
210     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
211                                          vaddr, section->readonly);
212 
213     llsize = int128_sub(llend, int128_make64(iova));
214     if (v->shadow_vqs_enabled) {
215         DMAMap mem_region = {
216             .translated_addr = (hwaddr)(uintptr_t)vaddr,
217             .size = int128_get64(llsize) - 1,
218             .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
219         };
220 
221         int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
222         if (unlikely(r != IOVA_OK)) {
223             error_report("Can't allocate a mapping (%d)", r);
224             goto fail;
225         }
226 
227         iova = mem_region.iova;
228     }
229 
230     vhost_vdpa_iotlb_batch_begin_once(v);
231     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
232                              vaddr, section->readonly);
233     if (ret) {
234         error_report("vhost vdpa map fail!");
235         goto fail;
236     }
237 
238     return;
239 
240 fail:
241     /*
242      * On the initfn path, store the first error in the container so we
243      * can gracefully fail.  Runtime, there's not much we can do other
244      * than throw a hardware error.
245      */
246     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
247     return;
248 
249 }
250 
251 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
252                                            MemoryRegionSection *section)
253 {
254     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
255     hwaddr iova;
256     Int128 llend, llsize;
257     int ret;
258 
259     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
260                                             v->iova_range.last)) {
261         return;
262     }
263 
264     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
265                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
266         error_report("%s received unaligned region", __func__);
267         return;
268     }
269 
270     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
271     llend = vhost_vdpa_section_end(section);
272 
273     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
274 
275     if (int128_ge(int128_make64(iova), llend)) {
276         return;
277     }
278 
279     llsize = int128_sub(llend, int128_make64(iova));
280 
281     if (v->shadow_vqs_enabled) {
282         const DMAMap *result;
283         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
284             section->offset_within_region +
285             (iova - section->offset_within_address_space);
286         DMAMap mem_region = {
287             .translated_addr = (hwaddr)(uintptr_t)vaddr,
288             .size = int128_get64(llsize) - 1,
289         };
290 
291         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
292         iova = result->iova;
293         vhost_iova_tree_remove(v->iova_tree, &mem_region);
294     }
295     vhost_vdpa_iotlb_batch_begin_once(v);
296     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
297     if (ret) {
298         error_report("vhost_vdpa dma unmap error!");
299     }
300 
301     memory_region_unref(section->mr);
302 }
303 /*
304  * IOTLB API is used by vhost-vdpa which requires incremental updating
305  * of the mapping. So we can not use generic vhost memory listener which
306  * depends on the addnop().
307  */
308 static const MemoryListener vhost_vdpa_memory_listener = {
309     .name = "vhost-vdpa",
310     .commit = vhost_vdpa_listener_commit,
311     .region_add = vhost_vdpa_listener_region_add,
312     .region_del = vhost_vdpa_listener_region_del,
313 };
314 
315 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
316                              void *arg)
317 {
318     struct vhost_vdpa *v = dev->opaque;
319     int fd = v->device_fd;
320     int ret;
321 
322     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
323 
324     ret = ioctl(fd, request, arg);
325     return ret < 0 ? -errno : ret;
326 }
327 
328 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
329 {
330     uint8_t s;
331     int ret;
332 
333     trace_vhost_vdpa_add_status(dev, status);
334     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
335     if (ret < 0) {
336         return ret;
337     }
338 
339     s |= status;
340 
341     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
342     if (ret < 0) {
343         return ret;
344     }
345 
346     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
347     if (ret < 0) {
348         return ret;
349     }
350 
351     if (!(s & status)) {
352         return -EIO;
353     }
354 
355     return 0;
356 }
357 
358 static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
359 {
360     int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
361                               &v->iova_range);
362     if (ret != 0) {
363         v->iova_range.first = 0;
364         v->iova_range.last = UINT64_MAX;
365     }
366 
367     trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
368                                     v->iova_range.last);
369 }
370 
371 /*
372  * The use of this function is for requests that only need to be
373  * applied once. Typically such request occurs at the beginning
374  * of operation, and before setting up queues. It should not be
375  * used for request that performs operation until all queues are
376  * set, which would need to check dev->vq_index_end instead.
377  */
378 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
379 {
380     struct vhost_vdpa *v = dev->opaque;
381 
382     return v->index == 0;
383 }
384 
385 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
386                                        uint64_t *features)
387 {
388     int ret;
389 
390     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
391     trace_vhost_vdpa_get_features(dev, *features);
392     return ret;
393 }
394 
395 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
396                                Error **errp)
397 {
398     g_autoptr(GPtrArray) shadow_vqs = NULL;
399     uint64_t dev_features, svq_features;
400     int r;
401     bool ok;
402 
403     if (!v->shadow_vqs_enabled) {
404         return 0;
405     }
406 
407     r = vhost_vdpa_get_dev_features(hdev, &dev_features);
408     if (r != 0) {
409         error_setg_errno(errp, -r, "Can't get vdpa device features");
410         return r;
411     }
412 
413     svq_features = dev_features;
414     ok = vhost_svq_valid_features(svq_features, errp);
415     if (unlikely(!ok)) {
416         return -1;
417     }
418 
419     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
420     for (unsigned n = 0; n < hdev->nvqs; ++n) {
421         g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
422 
423         if (unlikely(!svq)) {
424             error_setg(errp, "Cannot create svq %u", n);
425             return -1;
426         }
427         g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
428     }
429 
430     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
431     return 0;
432 }
433 
434 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
435 {
436     struct vhost_vdpa *v;
437     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
438     trace_vhost_vdpa_init(dev, opaque);
439     int ret;
440 
441     /*
442      * Similar to VFIO, we end up pinning all guest memory and have to
443      * disable discarding of RAM.
444      */
445     ret = ram_block_discard_disable(true);
446     if (ret) {
447         error_report("Cannot set discarding of RAM broken");
448         return ret;
449     }
450 
451     v = opaque;
452     v->dev = dev;
453     dev->opaque =  opaque ;
454     v->listener = vhost_vdpa_memory_listener;
455     v->msg_type = VHOST_IOTLB_MSG_V2;
456     ret = vhost_vdpa_init_svq(dev, v, errp);
457     if (ret) {
458         goto err;
459     }
460 
461     vhost_vdpa_get_iova_range(v);
462 
463     if (!vhost_vdpa_first_dev(dev)) {
464         return 0;
465     }
466 
467     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
468                                VIRTIO_CONFIG_S_DRIVER);
469 
470     return 0;
471 
472 err:
473     ram_block_discard_disable(false);
474     return ret;
475 }
476 
477 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
478                                             int queue_index)
479 {
480     size_t page_size = qemu_real_host_page_size();
481     struct vhost_vdpa *v = dev->opaque;
482     VirtIODevice *vdev = dev->vdev;
483     VhostVDPAHostNotifier *n;
484 
485     n = &v->notifier[queue_index];
486 
487     if (n->addr) {
488         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
489         object_unparent(OBJECT(&n->mr));
490         munmap(n->addr, page_size);
491         n->addr = NULL;
492     }
493 }
494 
495 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
496 {
497     size_t page_size = qemu_real_host_page_size();
498     struct vhost_vdpa *v = dev->opaque;
499     VirtIODevice *vdev = dev->vdev;
500     VhostVDPAHostNotifier *n;
501     int fd = v->device_fd;
502     void *addr;
503     char *name;
504 
505     vhost_vdpa_host_notifier_uninit(dev, queue_index);
506 
507     n = &v->notifier[queue_index];
508 
509     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
510                 queue_index * page_size);
511     if (addr == MAP_FAILED) {
512         goto err;
513     }
514 
515     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
516                            v, queue_index);
517     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
518                                       page_size, addr);
519     g_free(name);
520 
521     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
522         object_unparent(OBJECT(&n->mr));
523         munmap(addr, page_size);
524         goto err;
525     }
526     n->addr = addr;
527 
528     return 0;
529 
530 err:
531     return -1;
532 }
533 
534 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
535 {
536     int i;
537 
538     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
539         vhost_vdpa_host_notifier_uninit(dev, i);
540     }
541 }
542 
543 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
544 {
545     struct vhost_vdpa *v = dev->opaque;
546     int i;
547 
548     if (v->shadow_vqs_enabled) {
549         /* FIXME SVQ is not compatible with host notifiers mr */
550         return;
551     }
552 
553     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
554         if (vhost_vdpa_host_notifier_init(dev, i)) {
555             goto err;
556         }
557     }
558 
559     return;
560 
561 err:
562     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
563     return;
564 }
565 
566 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
567 {
568     struct vhost_vdpa *v = dev->opaque;
569     size_t idx;
570 
571     if (!v->shadow_vqs) {
572         return;
573     }
574 
575     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
576         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
577     }
578     g_ptr_array_free(v->shadow_vqs, true);
579 }
580 
581 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
582 {
583     struct vhost_vdpa *v;
584     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
585     v = dev->opaque;
586     trace_vhost_vdpa_cleanup(dev, v);
587     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
588     memory_listener_unregister(&v->listener);
589     vhost_vdpa_svq_cleanup(dev);
590 
591     dev->opaque = NULL;
592     ram_block_discard_disable(false);
593 
594     return 0;
595 }
596 
/* Report the memory slot limit of the backend, which is INT_MAX. */
static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
{
    const int limit = INT_MAX;

    trace_vhost_vdpa_memslots_limit(dev, limit);
    return limit;
}
602 
603 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
604                                     struct vhost_memory *mem)
605 {
606     if (!vhost_vdpa_first_dev(dev)) {
607         return 0;
608     }
609 
610     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
611     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
612         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
613         int i;
614         for (i = 0; i < mem->nregions; i++) {
615             trace_vhost_vdpa_dump_regions(dev, i,
616                                           mem->regions[i].guest_phys_addr,
617                                           mem->regions[i].memory_size,
618                                           mem->regions[i].userspace_addr,
619                                           mem->regions[i].flags_padding);
620         }
621     }
622     if (mem->padding) {
623         return -EINVAL;
624     }
625 
626     return 0;
627 }
628 
629 static int vhost_vdpa_set_features(struct vhost_dev *dev,
630                                    uint64_t features)
631 {
632     struct vhost_vdpa *v = dev->opaque;
633     int ret;
634 
635     if (!vhost_vdpa_first_dev(dev)) {
636         return 0;
637     }
638 
639     if (v->shadow_vqs_enabled) {
640         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
641             /*
642              * QEMU is just trying to enable or disable logging. SVQ handles
643              * this sepparately, so no need to forward this.
644              */
645             v->acked_features = features;
646             return 0;
647         }
648 
649         v->acked_features = features;
650 
651         /* We must not ack _F_LOG if SVQ is enabled */
652         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
653     }
654 
655     trace_vhost_vdpa_set_features(dev, features);
656     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
657     if (ret) {
658         return ret;
659     }
660 
661     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
662 }
663 
664 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
665 {
666     uint64_t features;
667     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
668         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
669     int r;
670 
671     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
672         return -EFAULT;
673     }
674 
675     features &= f;
676 
677     if (vhost_vdpa_first_dev(dev)) {
678         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
679         if (r) {
680             return -EFAULT;
681         }
682     }
683 
684     dev->backend_cap = features;
685 
686     return 0;
687 }
688 
689 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
690                                     uint32_t *device_id)
691 {
692     int ret;
693     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
694     trace_vhost_vdpa_get_device_id(dev, *device_id);
695     return ret;
696 }
697 
698 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
699 {
700     if (!v->shadow_vqs_enabled) {
701         return;
702     }
703 
704     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
705         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
706         vhost_svq_stop(svq);
707     }
708 }
709 
710 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
711 {
712     struct vhost_vdpa *v = dev->opaque;
713     int ret;
714     uint8_t status = 0;
715 
716     vhost_vdpa_reset_svq(v);
717 
718     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
719     trace_vhost_vdpa_reset_device(dev, status);
720     return ret;
721 }
722 
723 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
724 {
725     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
726 
727     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
728     return idx;
729 }
730 
731 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
732 {
733     int i;
734     trace_vhost_vdpa_set_vring_ready(dev);
735     for (i = 0; i < dev->nvqs; ++i) {
736         struct vhost_vring_state state = {
737             .index = dev->vq_index + i,
738             .num = 1,
739         };
740         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
741     }
742     return 0;
743 }
744 
745 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
746                                    uint32_t config_len)
747 {
748     int b, len;
749     char line[QEMU_HEXDUMP_LINE_LEN];
750 
751     for (b = 0; b < config_len; b += 16) {
752         len = config_len - b;
753         qemu_hexdump_line(line, b, config, len, false);
754         trace_vhost_vdpa_dump_config(dev, line);
755     }
756 }
757 
758 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
759                                    uint32_t offset, uint32_t size,
760                                    uint32_t flags)
761 {
762     struct vhost_vdpa_config *config;
763     int ret;
764     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
765 
766     trace_vhost_vdpa_set_config(dev, offset, size, flags);
767     config = g_malloc(size + config_size);
768     config->off = offset;
769     config->len = size;
770     memcpy(config->buf, data, size);
771     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
772         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
773         vhost_vdpa_dump_config(dev, data, size);
774     }
775     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
776     g_free(config);
777     return ret;
778 }
779 
780 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
781                                    uint32_t config_len, Error **errp)
782 {
783     struct vhost_vdpa_config *v_config;
784     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
785     int ret;
786 
787     trace_vhost_vdpa_get_config(dev, config, config_len);
788     v_config = g_malloc(config_len + config_size);
789     v_config->len = config_len;
790     v_config->off = 0;
791     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
792     memcpy(config, v_config->buf, config_len);
793     g_free(v_config);
794     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
795         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
796         vhost_vdpa_dump_config(dev, config, config_len);
797     }
798     return ret;
799  }
800 
801 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
802                                          struct vhost_vring_state *ring)
803 {
804     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
805     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
806 }
807 
808 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
809                                          struct vhost_vring_file *file)
810 {
811     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
812     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
813 }
814 
815 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
816                                          struct vhost_vring_file *file)
817 {
818     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
819     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
820 }
821 
822 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
823                                          struct vhost_vring_addr *addr)
824 {
825     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
826                                 addr->desc_user_addr, addr->used_user_addr,
827                                 addr->avail_user_addr,
828                                 addr->log_guest_addr);
829 
830     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
831 
832 }
833 
834 /**
835  * Set the shadow virtqueue descriptors to the device
836  *
837  * @dev: The vhost device model
838  * @svq: The shadow virtqueue
839  * @idx: The index of the virtqueue in the vhost device
840  * @errp: Error
841  *
842  * Note that this function does not rewind kick file descriptor if cannot set
843  * call one.
844  */
845 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
846                                   VhostShadowVirtqueue *svq, unsigned idx,
847                                   Error **errp)
848 {
849     struct vhost_vring_file file = {
850         .index = dev->vq_index + idx,
851     };
852     const EventNotifier *event_notifier = &svq->hdev_kick;
853     int r;
854 
855     file.fd = event_notifier_get_fd(event_notifier);
856     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
857     if (unlikely(r != 0)) {
858         error_setg_errno(errp, -r, "Can't set device kick fd");
859         return r;
860     }
861 
862     event_notifier = &svq->hdev_call;
863     file.fd = event_notifier_get_fd(event_notifier);
864     r = vhost_vdpa_set_vring_dev_call(dev, &file);
865     if (unlikely(r != 0)) {
866         error_setg_errno(errp, -r, "Can't set device call fd");
867     }
868 
869     return r;
870 }
871 
872 /**
873  * Unmap a SVQ area in the device
874  */
875 static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
876                                       const DMAMap *needle)
877 {
878     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
879     hwaddr size;
880     int r;
881 
882     if (unlikely(!result)) {
883         error_report("Unable to find SVQ address to unmap");
884         return false;
885     }
886 
887     size = ROUND_UP(result->size, qemu_real_host_page_size());
888     r = vhost_vdpa_dma_unmap(v, result->iova, size);
889     return r == 0;
890 }
891 
892 static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
893                                        const VhostShadowVirtqueue *svq)
894 {
895     DMAMap needle = {};
896     struct vhost_vdpa *v = dev->opaque;
897     struct vhost_vring_addr svq_addr;
898     bool ok;
899 
900     vhost_svq_get_vring_addr(svq, &svq_addr);
901 
902     needle.translated_addr = svq_addr.desc_user_addr;
903     ok = vhost_vdpa_svq_unmap_ring(v, &needle);
904     if (unlikely(!ok)) {
905         return false;
906     }
907 
908     needle.translated_addr = svq_addr.used_user_addr;
909     return vhost_vdpa_svq_unmap_ring(v, &needle);
910 }
911 
912 /**
913  * Map the SVQ area in the device
914  *
915  * @v: Vhost-vdpa device
916  * @needle: The area to search iova
917  * @errorp: Error pointer
918  */
919 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
920                                     Error **errp)
921 {
922     int r;
923 
924     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
925     if (unlikely(r != IOVA_OK)) {
926         error_setg(errp, "Cannot allocate iova (%d)", r);
927         return false;
928     }
929 
930     r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
931                            (void *)(uintptr_t)needle->translated_addr,
932                            needle->perm == IOMMU_RO);
933     if (unlikely(r != 0)) {
934         error_setg_errno(errp, -r, "Cannot map region to device");
935         vhost_iova_tree_remove(v->iova_tree, needle);
936     }
937 
938     return r == 0;
939 }
940 
941 /**
942  * Map the shadow virtqueue rings in the device
943  *
944  * @dev: The vhost device
945  * @svq: The shadow virtqueue
946  * @addr: Assigned IOVA addresses
947  * @errp: Error pointer
948  */
949 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
950                                      const VhostShadowVirtqueue *svq,
951                                      struct vhost_vring_addr *addr,
952                                      Error **errp)
953 {
954     DMAMap device_region, driver_region;
955     struct vhost_vring_addr svq_addr;
956     struct vhost_vdpa *v = dev->opaque;
957     size_t device_size = vhost_svq_device_area_size(svq);
958     size_t driver_size = vhost_svq_driver_area_size(svq);
959     size_t avail_offset;
960     bool ok;
961 
962     ERRP_GUARD();
963     vhost_svq_get_vring_addr(svq, &svq_addr);
964 
965     driver_region = (DMAMap) {
966         .translated_addr = svq_addr.desc_user_addr,
967         .size = driver_size - 1,
968         .perm = IOMMU_RO,
969     };
970     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
971     if (unlikely(!ok)) {
972         error_prepend(errp, "Cannot create vq driver region: ");
973         return false;
974     }
975     addr->desc_user_addr = driver_region.iova;
976     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
977     addr->avail_user_addr = driver_region.iova + avail_offset;
978 
979     device_region = (DMAMap) {
980         .translated_addr = svq_addr.used_user_addr,
981         .size = device_size - 1,
982         .perm = IOMMU_RW,
983     };
984     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
985     if (unlikely(!ok)) {
986         error_prepend(errp, "Cannot create vq device region: ");
987         vhost_vdpa_svq_unmap_ring(v, &driver_region);
988     }
989     addr->used_user_addr = device_region.iova;
990 
991     return ok;
992 }
993 
994 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
995                                  VhostShadowVirtqueue *svq, unsigned idx,
996                                  Error **errp)
997 {
998     uint16_t vq_index = dev->vq_index + idx;
999     struct vhost_vring_state s = {
1000         .index = vq_index,
1001     };
1002     int r;
1003 
1004     r = vhost_vdpa_set_dev_vring_base(dev, &s);
1005     if (unlikely(r)) {
1006         error_setg_errno(errp, -r, "Cannot set vring base");
1007         return false;
1008     }
1009 
1010     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1011     return r == 0;
1012 }
1013 
1014 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1015 {
1016     struct vhost_vdpa *v = dev->opaque;
1017     Error *err = NULL;
1018     unsigned i;
1019 
1020     if (!v->shadow_vqs) {
1021         return true;
1022     }
1023 
1024     for (i = 0; i < v->shadow_vqs->len; ++i) {
1025         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1026         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1027         struct vhost_vring_addr addr = {
1028             .index = dev->vq_index + i,
1029         };
1030         int r;
1031         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1032         if (unlikely(!ok)) {
1033             goto err;
1034         }
1035 
1036         vhost_svq_start(svq, dev->vdev, vq);
1037         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1038         if (unlikely(!ok)) {
1039             goto err_map;
1040         }
1041 
1042         /* Override vring GPA set by vhost subsystem */
1043         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1044         if (unlikely(r != 0)) {
1045             error_setg_errno(&err, -r, "Cannot set device address");
1046             goto err_set_addr;
1047         }
1048     }
1049 
1050     return true;
1051 
1052 err_set_addr:
1053     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1054 
1055 err_map:
1056     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1057 
1058 err:
1059     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1060     for (unsigned j = 0; j < i; ++j) {
1061         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1062         vhost_vdpa_svq_unmap_rings(dev, svq);
1063         vhost_svq_stop(svq);
1064     }
1065 
1066     return false;
1067 }
1068 
1069 static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1070 {
1071     struct vhost_vdpa *v = dev->opaque;
1072 
1073     if (!v->shadow_vqs) {
1074         return true;
1075     }
1076 
1077     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1078         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1079         bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
1080         if (unlikely(!ok)) {
1081             return false;
1082         }
1083     }
1084 
1085     return true;
1086 }
1087 
1088 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1089 {
1090     struct vhost_vdpa *v = dev->opaque;
1091     bool ok;
1092     trace_vhost_vdpa_dev_start(dev, started);
1093 
1094     if (started) {
1095         vhost_vdpa_host_notifiers_init(dev);
1096         ok = vhost_vdpa_svqs_start(dev);
1097         if (unlikely(!ok)) {
1098             return -1;
1099         }
1100         vhost_vdpa_set_vring_ready(dev);
1101     } else {
1102         ok = vhost_vdpa_svqs_stop(dev);
1103         if (unlikely(!ok)) {
1104             return -1;
1105         }
1106         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1107     }
1108 
1109     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1110         return 0;
1111     }
1112 
1113     if (started) {
1114         memory_listener_register(&v->listener, &address_space_memory);
1115         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1116     } else {
1117         vhost_vdpa_reset_device(dev);
1118         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1119                                    VIRTIO_CONFIG_S_DRIVER);
1120         memory_listener_unregister(&v->listener);
1121 
1122         return 0;
1123     }
1124 }
1125 
1126 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1127                                      struct vhost_log *log)
1128 {
1129     struct vhost_vdpa *v = dev->opaque;
1130     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1131         return 0;
1132     }
1133 
1134     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1135                                   log->log);
1136     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1137 }
1138 
1139 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1140                                        struct vhost_vring_addr *addr)
1141 {
1142     struct vhost_vdpa *v = dev->opaque;
1143 
1144     if (v->shadow_vqs_enabled) {
1145         /*
1146          * Device vring addr was set at device start. SVQ base is handled by
1147          * VirtQueue code.
1148          */
1149         return 0;
1150     }
1151 
1152     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1153 }
1154 
1155 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1156                                       struct vhost_vring_state *ring)
1157 {
1158     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1159     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1160 }
1161 
1162 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1163                                        struct vhost_vring_state *ring)
1164 {
1165     struct vhost_vdpa *v = dev->opaque;
1166 
1167     if (v->shadow_vqs_enabled) {
1168         /*
1169          * Device vring base was set at device start. SVQ base is handled by
1170          * VirtQueue code.
1171          */
1172         return 0;
1173     }
1174 
1175     return vhost_vdpa_set_dev_vring_base(dev, ring);
1176 }
1177 
1178 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1179                                        struct vhost_vring_state *ring)
1180 {
1181     struct vhost_vdpa *v = dev->opaque;
1182     int vdpa_idx = ring->index - dev->vq_index;
1183     int ret;
1184 
1185     if (v->shadow_vqs_enabled) {
1186         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1187 
1188         /*
1189          * Setting base as last used idx, so destination will see as available
1190          * all the entries that the device did not use, including the in-flight
1191          * processing ones.
1192          *
1193          * TODO: This is ok for networking, but other kinds of devices might
1194          * have problems with these retransmissions.
1195          */
1196         ring->num = svq->last_used_idx;
1197         return 0;
1198     }
1199 
1200     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1201     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1202     return ret;
1203 }
1204 
1205 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1206                                        struct vhost_vring_file *file)
1207 {
1208     struct vhost_vdpa *v = dev->opaque;
1209     int vdpa_idx = file->index - dev->vq_index;
1210 
1211     if (v->shadow_vqs_enabled) {
1212         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1213         vhost_svq_set_svq_kick_fd(svq, file->fd);
1214         return 0;
1215     } else {
1216         return vhost_vdpa_set_vring_dev_kick(dev, file);
1217     }
1218 }
1219 
1220 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1221                                        struct vhost_vring_file *file)
1222 {
1223     struct vhost_vdpa *v = dev->opaque;
1224 
1225     if (v->shadow_vqs_enabled) {
1226         int vdpa_idx = file->index - dev->vq_index;
1227         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1228 
1229         vhost_svq_set_svq_call_fd(svq, file->fd);
1230         return 0;
1231     } else {
1232         return vhost_vdpa_set_vring_dev_call(dev, file);
1233     }
1234 }
1235 
1236 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1237                                      uint64_t *features)
1238 {
1239     struct vhost_vdpa *v = dev->opaque;
1240     int ret = vhost_vdpa_get_dev_features(dev, features);
1241 
1242     if (ret == 0 && v->shadow_vqs_enabled) {
1243         /* Add SVQ logging capabilities */
1244         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1245     }
1246 
1247     return ret;
1248 }
1249 
1250 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1251 {
1252     if (!vhost_vdpa_first_dev(dev)) {
1253         return 0;
1254     }
1255 
1256     trace_vhost_vdpa_set_owner(dev);
1257     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1258 }
1259 
1260 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1261                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1262 {
1263     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1264     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1265     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1266     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1267     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1268                                  addr->avail_user_addr, addr->used_user_addr);
1269     return 0;
1270 }
1271 
/**
 * Tell the vhost subsystem whether the IOMMU feature must be forced
 *
 * @dev: The vhost device (not inspected)
 *
 * Unconditionally returns true for the vdpa backend.
 */
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    const bool force = true;

    return force;
}
1276 
1277 const VhostOps vdpa_ops = {
1278         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1279         .vhost_backend_init = vhost_vdpa_init,
1280         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1281         .vhost_set_log_base = vhost_vdpa_set_log_base,
1282         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1283         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1284         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1285         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1286         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1287         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1288         .vhost_get_features = vhost_vdpa_get_features,
1289         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1290         .vhost_set_owner = vhost_vdpa_set_owner,
1291         .vhost_set_vring_endian = NULL,
1292         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1293         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1294         .vhost_set_features = vhost_vdpa_set_features,
1295         .vhost_reset_device = vhost_vdpa_reset_device,
1296         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1297         .vhost_get_config  = vhost_vdpa_get_config,
1298         .vhost_set_config = vhost_vdpa_set_config,
1299         .vhost_requires_shm_log = NULL,
1300         .vhost_migration_done = NULL,
1301         .vhost_backend_can_merge = NULL,
1302         .vhost_net_set_mtu = NULL,
1303         .vhost_set_iotlb_callback = NULL,
1304         .vhost_send_device_iotlb_msg = NULL,
1305         .vhost_dev_start = vhost_vdpa_dev_start,
1306         .vhost_get_device_id = vhost_vdpa_get_device_id,
1307         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1308         .vhost_force_iommu = vhost_vdpa_force_iommu,
1309 };
1310