xref: /openbmc/qemu/hw/virtio/vhost-vdpa.c (revision d640b59e)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "qemu/cutils.h"
24 #include "qemu/main-loop.h"
25 #include "cpu.h"
26 #include "trace.h"
27 #include "qapi/error.h"
28 
29 /*
30  * Return one past the end of the end of section. Be careful with uint64_t
31  * conversions!
32  */
33 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
34 {
35     Int128 llend = int128_make64(section->offset_within_address_space);
36     llend = int128_add(llend, section->size);
37     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
38 
39     return llend;
40 }
41 
42 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
43                                                 uint64_t iova_min,
44                                                 uint64_t iova_max)
45 {
46     Int128 llend;
47 
48     if ((!memory_region_is_ram(section->mr) &&
49          !memory_region_is_iommu(section->mr)) ||
50         memory_region_is_protected(section->mr) ||
51         /* vhost-vDPA doesn't allow MMIO to be mapped  */
52         memory_region_is_ram_device(section->mr)) {
53         return true;
54     }
55 
56     if (section->offset_within_address_space < iova_min) {
57         error_report("RAM section out of device range (min=0x%" PRIx64
58                      ", addr=0x%" HWADDR_PRIx ")",
59                      iova_min, section->offset_within_address_space);
60         return true;
61     }
62 
63     llend = vhost_vdpa_section_end(section);
64     if (int128_gt(llend, int128_make64(iova_max))) {
65         error_report("RAM section out of device range (max=0x%" PRIx64
66                      ", end addr=0x%" PRIx64 ")",
67                      iova_max, int128_get64(llend));
68         return true;
69     }
70 
71     return false;
72 }
73 
74 static int vhost_vdpa_dma_map(struct vhost_vdpa *v, hwaddr iova, hwaddr size,
75                               void *vaddr, bool readonly)
76 {
77     struct vhost_msg_v2 msg = {};
78     int fd = v->device_fd;
79     int ret = 0;
80 
81     msg.type = v->msg_type;
82     msg.iotlb.iova = iova;
83     msg.iotlb.size = size;
84     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
85     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
86     msg.iotlb.type = VHOST_IOTLB_UPDATE;
87 
88    trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.iotlb.iova, msg.iotlb.size,
89                             msg.iotlb.uaddr, msg.iotlb.perm, msg.iotlb.type);
90 
91     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
92         error_report("failed to write, fd=%d, errno=%d (%s)",
93             fd, errno, strerror(errno));
94         return -EIO ;
95     }
96 
97     return ret;
98 }
99 
100 static int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, hwaddr iova,
101                                 hwaddr size)
102 {
103     struct vhost_msg_v2 msg = {};
104     int fd = v->device_fd;
105     int ret = 0;
106 
107     msg.type = v->msg_type;
108     msg.iotlb.iova = iova;
109     msg.iotlb.size = size;
110     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
111 
112     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.iotlb.iova,
113                                msg.iotlb.size, msg.iotlb.type);
114 
115     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
116         error_report("failed to write, fd=%d, errno=%d (%s)",
117             fd, errno, strerror(errno));
118         return -EIO ;
119     }
120 
121     return ret;
122 }
123 
124 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
125 {
126     int fd = v->device_fd;
127     struct vhost_msg_v2 msg = {
128         .type = v->msg_type,
129         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
130     };
131 
132     trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
133     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
134         error_report("failed to write, fd=%d, errno=%d (%s)",
135                      fd, errno, strerror(errno));
136     }
137 }
138 
139 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
140 {
141     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
142         !v->iotlb_batch_begin_sent) {
143         vhost_vdpa_listener_begin_batch(v);
144     }
145 
146     v->iotlb_batch_begin_sent = true;
147 }
148 
149 static void vhost_vdpa_listener_commit(MemoryListener *listener)
150 {
151     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
152     struct vhost_dev *dev = v->dev;
153     struct vhost_msg_v2 msg = {};
154     int fd = v->device_fd;
155 
156     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
157         return;
158     }
159 
160     if (!v->iotlb_batch_begin_sent) {
161         return;
162     }
163 
164     msg.type = v->msg_type;
165     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
166 
167     trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
168     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
169         error_report("failed to write, fd=%d, errno=%d (%s)",
170                      fd, errno, strerror(errno));
171     }
172 
173     v->iotlb_batch_begin_sent = false;
174 }
175 
176 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
177                                            MemoryRegionSection *section)
178 {
179     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
180     hwaddr iova;
181     Int128 llend, llsize;
182     void *vaddr;
183     int ret;
184 
185     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
186                                             v->iova_range.last)) {
187         return;
188     }
189 
190     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
191                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
192         error_report("%s received unaligned region", __func__);
193         return;
194     }
195 
196     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
197     llend = vhost_vdpa_section_end(section);
198     if (int128_ge(int128_make64(iova), llend)) {
199         return;
200     }
201 
202     memory_region_ref(section->mr);
203 
204     /* Here we assume that memory_region_is_ram(section->mr)==true */
205 
206     vaddr = memory_region_get_ram_ptr(section->mr) +
207             section->offset_within_region +
208             (iova - section->offset_within_address_space);
209 
210     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
211                                          vaddr, section->readonly);
212 
213     llsize = int128_sub(llend, int128_make64(iova));
214     if (v->shadow_vqs_enabled) {
215         DMAMap mem_region = {
216             .translated_addr = (hwaddr)(uintptr_t)vaddr,
217             .size = int128_get64(llsize) - 1,
218             .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
219         };
220 
221         int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
222         if (unlikely(r != IOVA_OK)) {
223             error_report("Can't allocate a mapping (%d)", r);
224             goto fail;
225         }
226 
227         iova = mem_region.iova;
228     }
229 
230     vhost_vdpa_iotlb_batch_begin_once(v);
231     ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
232                              vaddr, section->readonly);
233     if (ret) {
234         error_report("vhost vdpa map fail!");
235         goto fail;
236     }
237 
238     return;
239 
240 fail:
241     /*
242      * On the initfn path, store the first error in the container so we
243      * can gracefully fail.  Runtime, there's not much we can do other
244      * than throw a hardware error.
245      */
246     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
247     return;
248 
249 }
250 
251 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
252                                            MemoryRegionSection *section)
253 {
254     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
255     hwaddr iova;
256     Int128 llend, llsize;
257     int ret;
258 
259     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
260                                             v->iova_range.last)) {
261         return;
262     }
263 
264     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
265                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
266         error_report("%s received unaligned region", __func__);
267         return;
268     }
269 
270     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
271     llend = vhost_vdpa_section_end(section);
272 
273     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
274 
275     if (int128_ge(int128_make64(iova), llend)) {
276         return;
277     }
278 
279     llsize = int128_sub(llend, int128_make64(iova));
280 
281     if (v->shadow_vqs_enabled) {
282         const DMAMap *result;
283         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
284             section->offset_within_region +
285             (iova - section->offset_within_address_space);
286         DMAMap mem_region = {
287             .translated_addr = (hwaddr)(uintptr_t)vaddr,
288             .size = int128_get64(llsize) - 1,
289         };
290 
291         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
292         iova = result->iova;
293         vhost_iova_tree_remove(v->iova_tree, &mem_region);
294     }
295     vhost_vdpa_iotlb_batch_begin_once(v);
296     ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
297     if (ret) {
298         error_report("vhost_vdpa dma unmap error!");
299     }
300 
301     memory_region_unref(section->mr);
302 }
303 /*
304  * IOTLB API is used by vhost-vdpa which requires incremental updating
305  * of the mapping. So we can not use generic vhost memory listener which
306  * depends on the addnop().
307  */
308 static const MemoryListener vhost_vdpa_memory_listener = {
309     .name = "vhost-vdpa",
310     .commit = vhost_vdpa_listener_commit,
311     .region_add = vhost_vdpa_listener_region_add,
312     .region_del = vhost_vdpa_listener_region_del,
313 };
314 
315 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
316                              void *arg)
317 {
318     struct vhost_vdpa *v = dev->opaque;
319     int fd = v->device_fd;
320     int ret;
321 
322     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
323 
324     ret = ioctl(fd, request, arg);
325     return ret < 0 ? -errno : ret;
326 }
327 
328 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
329 {
330     uint8_t s;
331     int ret;
332 
333     trace_vhost_vdpa_add_status(dev, status);
334     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
335     if (ret < 0) {
336         return ret;
337     }
338 
339     s |= status;
340 
341     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
342     if (ret < 0) {
343         return ret;
344     }
345 
346     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
347     if (ret < 0) {
348         return ret;
349     }
350 
351     if (!(s & status)) {
352         return -EIO;
353     }
354 
355     return 0;
356 }
357 
358 static void vhost_vdpa_get_iova_range(struct vhost_vdpa *v)
359 {
360     int ret = vhost_vdpa_call(v->dev, VHOST_VDPA_GET_IOVA_RANGE,
361                               &v->iova_range);
362     if (ret != 0) {
363         v->iova_range.first = 0;
364         v->iova_range.last = UINT64_MAX;
365     }
366 
367     trace_vhost_vdpa_get_iova_range(v->dev, v->iova_range.first,
368                                     v->iova_range.last);
369 }
370 
371 static bool vhost_vdpa_one_time_request(struct vhost_dev *dev)
372 {
373     struct vhost_vdpa *v = dev->opaque;
374 
375     return v->index != 0;
376 }
377 
378 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
379                                        uint64_t *features)
380 {
381     int ret;
382 
383     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
384     trace_vhost_vdpa_get_features(dev, *features);
385     return ret;
386 }
387 
388 static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
389                                Error **errp)
390 {
391     g_autoptr(GPtrArray) shadow_vqs = NULL;
392     uint64_t dev_features, svq_features;
393     int r;
394     bool ok;
395 
396     if (!v->shadow_vqs_enabled) {
397         return 0;
398     }
399 
400     r = vhost_vdpa_get_dev_features(hdev, &dev_features);
401     if (r != 0) {
402         error_setg_errno(errp, -r, "Can't get vdpa device features");
403         return r;
404     }
405 
406     svq_features = dev_features;
407     ok = vhost_svq_valid_features(svq_features, errp);
408     if (unlikely(!ok)) {
409         return -1;
410     }
411 
412     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
413     for (unsigned n = 0; n < hdev->nvqs; ++n) {
414         g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree);
415 
416         if (unlikely(!svq)) {
417             error_setg(errp, "Cannot create svq %u", n);
418             return -1;
419         }
420         g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq));
421     }
422 
423     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
424     return 0;
425 }
426 
427 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
428 {
429     struct vhost_vdpa *v;
430     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
431     trace_vhost_vdpa_init(dev, opaque);
432     int ret;
433 
434     /*
435      * Similar to VFIO, we end up pinning all guest memory and have to
436      * disable discarding of RAM.
437      */
438     ret = ram_block_discard_disable(true);
439     if (ret) {
440         error_report("Cannot set discarding of RAM broken");
441         return ret;
442     }
443 
444     v = opaque;
445     v->dev = dev;
446     dev->opaque =  opaque ;
447     v->listener = vhost_vdpa_memory_listener;
448     v->msg_type = VHOST_IOTLB_MSG_V2;
449     ret = vhost_vdpa_init_svq(dev, v, errp);
450     if (ret) {
451         goto err;
452     }
453 
454     vhost_vdpa_get_iova_range(v);
455 
456     if (vhost_vdpa_one_time_request(dev)) {
457         return 0;
458     }
459 
460     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
461                                VIRTIO_CONFIG_S_DRIVER);
462 
463     return 0;
464 
465 err:
466     ram_block_discard_disable(false);
467     return ret;
468 }
469 
470 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
471                                             int queue_index)
472 {
473     size_t page_size = qemu_real_host_page_size();
474     struct vhost_vdpa *v = dev->opaque;
475     VirtIODevice *vdev = dev->vdev;
476     VhostVDPAHostNotifier *n;
477 
478     n = &v->notifier[queue_index];
479 
480     if (n->addr) {
481         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
482         object_unparent(OBJECT(&n->mr));
483         munmap(n->addr, page_size);
484         n->addr = NULL;
485     }
486 }
487 
488 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
489 {
490     size_t page_size = qemu_real_host_page_size();
491     struct vhost_vdpa *v = dev->opaque;
492     VirtIODevice *vdev = dev->vdev;
493     VhostVDPAHostNotifier *n;
494     int fd = v->device_fd;
495     void *addr;
496     char *name;
497 
498     vhost_vdpa_host_notifier_uninit(dev, queue_index);
499 
500     n = &v->notifier[queue_index];
501 
502     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
503                 queue_index * page_size);
504     if (addr == MAP_FAILED) {
505         goto err;
506     }
507 
508     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
509                            v, queue_index);
510     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
511                                       page_size, addr);
512     g_free(name);
513 
514     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
515         object_unparent(OBJECT(&n->mr));
516         munmap(addr, page_size);
517         goto err;
518     }
519     n->addr = addr;
520 
521     return 0;
522 
523 err:
524     return -1;
525 }
526 
527 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
528 {
529     int i;
530 
531     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
532         vhost_vdpa_host_notifier_uninit(dev, i);
533     }
534 }
535 
536 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
537 {
538     struct vhost_vdpa *v = dev->opaque;
539     int i;
540 
541     if (v->shadow_vqs_enabled) {
542         /* FIXME SVQ is not compatible with host notifiers mr */
543         return;
544     }
545 
546     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
547         if (vhost_vdpa_host_notifier_init(dev, i)) {
548             goto err;
549         }
550     }
551 
552     return;
553 
554 err:
555     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
556     return;
557 }
558 
559 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
560 {
561     struct vhost_vdpa *v = dev->opaque;
562     size_t idx;
563 
564     if (!v->shadow_vqs) {
565         return;
566     }
567 
568     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
569         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
570     }
571     g_ptr_array_free(v->shadow_vqs, true);
572 }
573 
574 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
575 {
576     struct vhost_vdpa *v;
577     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
578     v = dev->opaque;
579     trace_vhost_vdpa_cleanup(dev, v);
580     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
581     memory_listener_unregister(&v->listener);
582     vhost_vdpa_svq_cleanup(dev);
583 
584     dev->opaque = NULL;
585     ram_block_discard_disable(false);
586 
587     return 0;
588 }
589 
590 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
591 {
592     trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
593     return INT_MAX;
594 }
595 
596 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
597                                     struct vhost_memory *mem)
598 {
599     if (vhost_vdpa_one_time_request(dev)) {
600         return 0;
601     }
602 
603     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
604     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
605         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
606         int i;
607         for (i = 0; i < mem->nregions; i++) {
608             trace_vhost_vdpa_dump_regions(dev, i,
609                                           mem->regions[i].guest_phys_addr,
610                                           mem->regions[i].memory_size,
611                                           mem->regions[i].userspace_addr,
612                                           mem->regions[i].flags_padding);
613         }
614     }
615     if (mem->padding) {
616         return -EINVAL;
617     }
618 
619     return 0;
620 }
621 
622 static int vhost_vdpa_set_features(struct vhost_dev *dev,
623                                    uint64_t features)
624 {
625     struct vhost_vdpa *v = dev->opaque;
626     int ret;
627 
628     if (vhost_vdpa_one_time_request(dev)) {
629         return 0;
630     }
631 
632     if (v->shadow_vqs_enabled) {
633         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
634             /*
635              * QEMU is just trying to enable or disable logging. SVQ handles
636              * this sepparately, so no need to forward this.
637              */
638             v->acked_features = features;
639             return 0;
640         }
641 
642         v->acked_features = features;
643 
644         /* We must not ack _F_LOG if SVQ is enabled */
645         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
646     }
647 
648     trace_vhost_vdpa_set_features(dev, features);
649     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
650     if (ret) {
651         return ret;
652     }
653 
654     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
655 }
656 
657 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
658 {
659     uint64_t features;
660     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
661         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH;
662     int r;
663 
664     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
665         return -EFAULT;
666     }
667 
668     features &= f;
669 
670     if (vhost_vdpa_one_time_request(dev)) {
671         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
672         if (r) {
673             return -EFAULT;
674         }
675     }
676 
677     dev->backend_cap = features;
678 
679     return 0;
680 }
681 
682 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
683                                     uint32_t *device_id)
684 {
685     int ret;
686     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
687     trace_vhost_vdpa_get_device_id(dev, *device_id);
688     return ret;
689 }
690 
691 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
692 {
693     if (!v->shadow_vqs_enabled) {
694         return;
695     }
696 
697     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
698         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
699         vhost_svq_stop(svq);
700     }
701 }
702 
703 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
704 {
705     struct vhost_vdpa *v = dev->opaque;
706     int ret;
707     uint8_t status = 0;
708 
709     vhost_vdpa_reset_svq(v);
710 
711     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
712     trace_vhost_vdpa_reset_device(dev, status);
713     return ret;
714 }
715 
716 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
717 {
718     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
719 
720     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
721     return idx;
722 }
723 
724 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
725 {
726     int i;
727     trace_vhost_vdpa_set_vring_ready(dev);
728     for (i = 0; i < dev->nvqs; ++i) {
729         struct vhost_vring_state state = {
730             .index = dev->vq_index + i,
731             .num = 1,
732         };
733         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
734     }
735     return 0;
736 }
737 
738 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
739                                    uint32_t config_len)
740 {
741     int b, len;
742     char line[QEMU_HEXDUMP_LINE_LEN];
743 
744     for (b = 0; b < config_len; b += 16) {
745         len = config_len - b;
746         qemu_hexdump_line(line, b, config, len, false);
747         trace_vhost_vdpa_dump_config(dev, line);
748     }
749 }
750 
751 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
752                                    uint32_t offset, uint32_t size,
753                                    uint32_t flags)
754 {
755     struct vhost_vdpa_config *config;
756     int ret;
757     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
758 
759     trace_vhost_vdpa_set_config(dev, offset, size, flags);
760     config = g_malloc(size + config_size);
761     config->off = offset;
762     config->len = size;
763     memcpy(config->buf, data, size);
764     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
765         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
766         vhost_vdpa_dump_config(dev, data, size);
767     }
768     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
769     g_free(config);
770     return ret;
771 }
772 
773 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
774                                    uint32_t config_len, Error **errp)
775 {
776     struct vhost_vdpa_config *v_config;
777     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
778     int ret;
779 
780     trace_vhost_vdpa_get_config(dev, config, config_len);
781     v_config = g_malloc(config_len + config_size);
782     v_config->len = config_len;
783     v_config->off = 0;
784     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
785     memcpy(config, v_config->buf, config_len);
786     g_free(v_config);
787     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
788         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
789         vhost_vdpa_dump_config(dev, config, config_len);
790     }
791     return ret;
792  }
793 
794 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
795                                          struct vhost_vring_state *ring)
796 {
797     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
798     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
799 }
800 
801 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
802                                          struct vhost_vring_file *file)
803 {
804     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
805     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
806 }
807 
808 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
809                                          struct vhost_vring_file *file)
810 {
811     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
812     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
813 }
814 
815 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
816                                          struct vhost_vring_addr *addr)
817 {
818     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
819                                 addr->desc_user_addr, addr->used_user_addr,
820                                 addr->avail_user_addr,
821                                 addr->log_guest_addr);
822 
823     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
824 
825 }
826 
827 /**
828  * Set the shadow virtqueue descriptors to the device
829  *
830  * @dev: The vhost device model
831  * @svq: The shadow virtqueue
832  * @idx: The index of the virtqueue in the vhost device
833  * @errp: Error
834  *
835  * Note that this function does not rewind kick file descriptor if cannot set
836  * call one.
837  */
838 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
839                                   VhostShadowVirtqueue *svq, unsigned idx,
840                                   Error **errp)
841 {
842     struct vhost_vring_file file = {
843         .index = dev->vq_index + idx,
844     };
845     const EventNotifier *event_notifier = &svq->hdev_kick;
846     int r;
847 
848     file.fd = event_notifier_get_fd(event_notifier);
849     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
850     if (unlikely(r != 0)) {
851         error_setg_errno(errp, -r, "Can't set device kick fd");
852         return r;
853     }
854 
855     event_notifier = &svq->hdev_call;
856     file.fd = event_notifier_get_fd(event_notifier);
857     r = vhost_vdpa_set_vring_dev_call(dev, &file);
858     if (unlikely(r != 0)) {
859         error_setg_errno(errp, -r, "Can't set device call fd");
860     }
861 
862     return r;
863 }
864 
865 /**
866  * Unmap a SVQ area in the device
867  */
868 static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
869                                       const DMAMap *needle)
870 {
871     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
872     hwaddr size;
873     int r;
874 
875     if (unlikely(!result)) {
876         error_report("Unable to find SVQ address to unmap");
877         return false;
878     }
879 
880     size = ROUND_UP(result->size, qemu_real_host_page_size());
881     r = vhost_vdpa_dma_unmap(v, result->iova, size);
882     return r == 0;
883 }
884 
885 static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
886                                        const VhostShadowVirtqueue *svq)
887 {
888     DMAMap needle = {};
889     struct vhost_vdpa *v = dev->opaque;
890     struct vhost_vring_addr svq_addr;
891     bool ok;
892 
893     vhost_svq_get_vring_addr(svq, &svq_addr);
894 
895     needle.translated_addr = svq_addr.desc_user_addr;
896     ok = vhost_vdpa_svq_unmap_ring(v, &needle);
897     if (unlikely(!ok)) {
898         return false;
899     }
900 
901     needle.translated_addr = svq_addr.used_user_addr;
902     return vhost_vdpa_svq_unmap_ring(v, &needle);
903 }
904 
905 /**
906  * Map the SVQ area in the device
907  *
908  * @v: Vhost-vdpa device
909  * @needle: The area to search iova
910  * @errorp: Error pointer
911  */
912 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
913                                     Error **errp)
914 {
915     int r;
916 
917     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
918     if (unlikely(r != IOVA_OK)) {
919         error_setg(errp, "Cannot allocate iova (%d)", r);
920         return false;
921     }
922 
923     r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1,
924                            (void *)(uintptr_t)needle->translated_addr,
925                            needle->perm == IOMMU_RO);
926     if (unlikely(r != 0)) {
927         error_setg_errno(errp, -r, "Cannot map region to device");
928         vhost_iova_tree_remove(v->iova_tree, needle);
929     }
930 
931     return r == 0;
932 }
933 
934 /**
935  * Map the shadow virtqueue rings in the device
936  *
937  * @dev: The vhost device
938  * @svq: The shadow virtqueue
939  * @addr: Assigned IOVA addresses
940  * @errp: Error pointer
941  */
942 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
943                                      const VhostShadowVirtqueue *svq,
944                                      struct vhost_vring_addr *addr,
945                                      Error **errp)
946 {
947     DMAMap device_region, driver_region;
948     struct vhost_vring_addr svq_addr;
949     struct vhost_vdpa *v = dev->opaque;
950     size_t device_size = vhost_svq_device_area_size(svq);
951     size_t driver_size = vhost_svq_driver_area_size(svq);
952     size_t avail_offset;
953     bool ok;
954 
955     ERRP_GUARD();
956     vhost_svq_get_vring_addr(svq, &svq_addr);
957 
958     driver_region = (DMAMap) {
959         .translated_addr = svq_addr.desc_user_addr,
960         .size = driver_size - 1,
961         .perm = IOMMU_RO,
962     };
963     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
964     if (unlikely(!ok)) {
965         error_prepend(errp, "Cannot create vq driver region: ");
966         return false;
967     }
968     addr->desc_user_addr = driver_region.iova;
969     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
970     addr->avail_user_addr = driver_region.iova + avail_offset;
971 
972     device_region = (DMAMap) {
973         .translated_addr = svq_addr.used_user_addr,
974         .size = device_size - 1,
975         .perm = IOMMU_RW,
976     };
977     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
978     if (unlikely(!ok)) {
979         error_prepend(errp, "Cannot create vq device region: ");
980         vhost_vdpa_svq_unmap_ring(v, &driver_region);
981     }
982     addr->used_user_addr = device_region.iova;
983 
984     return ok;
985 }
986 
987 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
988                                  VhostShadowVirtqueue *svq, unsigned idx,
989                                  Error **errp)
990 {
991     uint16_t vq_index = dev->vq_index + idx;
992     struct vhost_vring_state s = {
993         .index = vq_index,
994     };
995     int r;
996 
997     r = vhost_vdpa_set_dev_vring_base(dev, &s);
998     if (unlikely(r)) {
999         error_setg_errno(errp, -r, "Cannot set vring base");
1000         return false;
1001     }
1002 
1003     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1004     return r == 0;
1005 }
1006 
1007 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1008 {
1009     struct vhost_vdpa *v = dev->opaque;
1010     Error *err = NULL;
1011     unsigned i;
1012 
1013     if (!v->shadow_vqs) {
1014         return true;
1015     }
1016 
1017     for (i = 0; i < v->shadow_vqs->len; ++i) {
1018         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1019         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1020         struct vhost_vring_addr addr = {
1021             .index = i,
1022         };
1023         int r;
1024         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1025         if (unlikely(!ok)) {
1026             goto err;
1027         }
1028 
1029         vhost_svq_start(svq, dev->vdev, vq);
1030         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1031         if (unlikely(!ok)) {
1032             goto err_map;
1033         }
1034 
1035         /* Override vring GPA set by vhost subsystem */
1036         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1037         if (unlikely(r != 0)) {
1038             error_setg_errno(&err, -r, "Cannot set device address");
1039             goto err_set_addr;
1040         }
1041     }
1042 
1043     return true;
1044 
1045 err_set_addr:
1046     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1047 
1048 err_map:
1049     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1050 
1051 err:
1052     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1053     for (unsigned j = 0; j < i; ++j) {
1054         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1055         vhost_vdpa_svq_unmap_rings(dev, svq);
1056         vhost_svq_stop(svq);
1057     }
1058 
1059     return false;
1060 }
1061 
1062 static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1063 {
1064     struct vhost_vdpa *v = dev->opaque;
1065 
1066     if (!v->shadow_vqs) {
1067         return true;
1068     }
1069 
1070     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1071         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1072         bool ok = vhost_vdpa_svq_unmap_rings(dev, svq);
1073         if (unlikely(!ok)) {
1074             return false;
1075         }
1076     }
1077 
1078     return true;
1079 }
1080 
1081 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1082 {
1083     struct vhost_vdpa *v = dev->opaque;
1084     bool ok;
1085     trace_vhost_vdpa_dev_start(dev, started);
1086 
1087     if (started) {
1088         vhost_vdpa_host_notifiers_init(dev);
1089         ok = vhost_vdpa_svqs_start(dev);
1090         if (unlikely(!ok)) {
1091             return -1;
1092         }
1093         vhost_vdpa_set_vring_ready(dev);
1094     } else {
1095         ok = vhost_vdpa_svqs_stop(dev);
1096         if (unlikely(!ok)) {
1097             return -1;
1098         }
1099         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1100     }
1101 
1102     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1103         return 0;
1104     }
1105 
1106     if (started) {
1107         memory_listener_register(&v->listener, &address_space_memory);
1108         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1109     } else {
1110         vhost_vdpa_reset_device(dev);
1111         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1112                                    VIRTIO_CONFIG_S_DRIVER);
1113         memory_listener_unregister(&v->listener);
1114 
1115         return 0;
1116     }
1117 }
1118 
1119 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1120                                      struct vhost_log *log)
1121 {
1122     struct vhost_vdpa *v = dev->opaque;
1123     if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) {
1124         return 0;
1125     }
1126 
1127     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1128                                   log->log);
1129     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1130 }
1131 
1132 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1133                                        struct vhost_vring_addr *addr)
1134 {
1135     struct vhost_vdpa *v = dev->opaque;
1136 
1137     if (v->shadow_vqs_enabled) {
1138         /*
1139          * Device vring addr was set at device start. SVQ base is handled by
1140          * VirtQueue code.
1141          */
1142         return 0;
1143     }
1144 
1145     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1146 }
1147 
1148 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1149                                       struct vhost_vring_state *ring)
1150 {
1151     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1152     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1153 }
1154 
1155 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1156                                        struct vhost_vring_state *ring)
1157 {
1158     struct vhost_vdpa *v = dev->opaque;
1159 
1160     if (v->shadow_vqs_enabled) {
1161         /*
1162          * Device vring base was set at device start. SVQ base is handled by
1163          * VirtQueue code.
1164          */
1165         return 0;
1166     }
1167 
1168     return vhost_vdpa_set_dev_vring_base(dev, ring);
1169 }
1170 
1171 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1172                                        struct vhost_vring_state *ring)
1173 {
1174     struct vhost_vdpa *v = dev->opaque;
1175     int ret;
1176 
1177     if (v->shadow_vqs_enabled) {
1178         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs,
1179                                                       ring->index);
1180 
1181         /*
1182          * Setting base as last used idx, so destination will see as available
1183          * all the entries that the device did not use, including the in-flight
1184          * processing ones.
1185          *
1186          * TODO: This is ok for networking, but other kinds of devices might
1187          * have problems with these retransmissions.
1188          */
1189         ring->num = svq->last_used_idx;
1190         return 0;
1191     }
1192 
1193     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1194     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1195     return ret;
1196 }
1197 
1198 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1199                                        struct vhost_vring_file *file)
1200 {
1201     struct vhost_vdpa *v = dev->opaque;
1202     int vdpa_idx = file->index - dev->vq_index;
1203 
1204     if (v->shadow_vqs_enabled) {
1205         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1206         vhost_svq_set_svq_kick_fd(svq, file->fd);
1207         return 0;
1208     } else {
1209         return vhost_vdpa_set_vring_dev_kick(dev, file);
1210     }
1211 }
1212 
1213 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1214                                        struct vhost_vring_file *file)
1215 {
1216     struct vhost_vdpa *v = dev->opaque;
1217 
1218     if (v->shadow_vqs_enabled) {
1219         int vdpa_idx = file->index - dev->vq_index;
1220         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1221 
1222         vhost_svq_set_svq_call_fd(svq, file->fd);
1223         return 0;
1224     } else {
1225         return vhost_vdpa_set_vring_dev_call(dev, file);
1226     }
1227 }
1228 
1229 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1230                                      uint64_t *features)
1231 {
1232     struct vhost_vdpa *v = dev->opaque;
1233     int ret = vhost_vdpa_get_dev_features(dev, features);
1234 
1235     if (ret == 0 && v->shadow_vqs_enabled) {
1236         /* Add SVQ logging capabilities */
1237         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1238     }
1239 
1240     return ret;
1241 }
1242 
1243 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1244 {
1245     if (vhost_vdpa_one_time_request(dev)) {
1246         return 0;
1247     }
1248 
1249     trace_vhost_vdpa_set_owner(dev);
1250     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1251 }
1252 
1253 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1254                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1255 {
1256     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1257     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1258     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1259     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1260     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1261                                  addr->avail_user_addr, addr->used_user_addr);
1262     return 0;
1263 }
1264 
/**
 * VhostOps hook: unconditionally answers true for vhost-vdpa.
 *
 * @dev: The vhost device (not inspected)
 */
static bool vhost_vdpa_force_iommu(struct vhost_dev *dev)
{
    return true;
}
1269 
1270 const VhostOps vdpa_ops = {
1271         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1272         .vhost_backend_init = vhost_vdpa_init,
1273         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1274         .vhost_set_log_base = vhost_vdpa_set_log_base,
1275         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1276         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1277         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1278         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1279         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1280         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1281         .vhost_get_features = vhost_vdpa_get_features,
1282         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1283         .vhost_set_owner = vhost_vdpa_set_owner,
1284         .vhost_set_vring_endian = NULL,
1285         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1286         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1287         .vhost_set_features = vhost_vdpa_set_features,
1288         .vhost_reset_device = vhost_vdpa_reset_device,
1289         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1290         .vhost_get_config  = vhost_vdpa_get_config,
1291         .vhost_set_config = vhost_vdpa_set_config,
1292         .vhost_requires_shm_log = NULL,
1293         .vhost_migration_done = NULL,
1294         .vhost_backend_can_merge = NULL,
1295         .vhost_net_set_mtu = NULL,
1296         .vhost_set_iotlb_callback = NULL,
1297         .vhost_send_device_iotlb_msg = NULL,
1298         .vhost_dev_start = vhost_vdpa_dev_start,
1299         .vhost_get_device_id = vhost_vdpa_get_device_id,
1300         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1301         .vhost_force_iommu = vhost_vdpa_force_iommu,
1302 };
1303