xref: /openbmc/qemu/hw/virtio/vhost-vdpa.c (revision b3eb5b86)
1 /*
2  * vhost-vdpa
3  *
4  *  Copyright(c) 2017-2018 Intel Corporation.
5  *  Copyright(c) 2020 Red Hat, Inc.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include <linux/vhost.h>
14 #include <linux/vfio.h>
15 #include <sys/eventfd.h>
16 #include <sys/ioctl.h>
17 #include "hw/virtio/vhost.h"
18 #include "hw/virtio/vhost-backend.h"
19 #include "hw/virtio/virtio-net.h"
20 #include "hw/virtio/vhost-shadow-virtqueue.h"
21 #include "hw/virtio/vhost-vdpa.h"
22 #include "exec/address-spaces.h"
23 #include "migration/blocker.h"
24 #include "qemu/cutils.h"
25 #include "qemu/main-loop.h"
26 #include "cpu.h"
27 #include "trace.h"
28 #include "qapi/error.h"
29 
30 /*
31  * Return one past the end of the end of section. Be careful with uint64_t
32  * conversions!
33  */
34 static Int128 vhost_vdpa_section_end(const MemoryRegionSection *section)
35 {
36     Int128 llend = int128_make64(section->offset_within_address_space);
37     llend = int128_add(llend, section->size);
38     llend = int128_and(llend, int128_exts64(TARGET_PAGE_MASK));
39 
40     return llend;
41 }
42 
43 static bool vhost_vdpa_listener_skipped_section(MemoryRegionSection *section,
44                                                 uint64_t iova_min,
45                                                 uint64_t iova_max)
46 {
47     Int128 llend;
48 
49     if ((!memory_region_is_ram(section->mr) &&
50          !memory_region_is_iommu(section->mr)) ||
51         memory_region_is_protected(section->mr) ||
52         /* vhost-vDPA doesn't allow MMIO to be mapped  */
53         memory_region_is_ram_device(section->mr)) {
54         return true;
55     }
56 
57     if (section->offset_within_address_space < iova_min) {
58         error_report("RAM section out of device range (min=0x%" PRIx64
59                      ", addr=0x%" HWADDR_PRIx ")",
60                      iova_min, section->offset_within_address_space);
61         return true;
62     }
63 
64     llend = vhost_vdpa_section_end(section);
65     if (int128_gt(llend, int128_make64(iova_max))) {
66         error_report("RAM section out of device range (max=0x%" PRIx64
67                      ", end addr=0x%" PRIx64 ")",
68                      iova_max, int128_get64(llend));
69         return true;
70     }
71 
72     return false;
73 }
74 
75 /*
76  * The caller must set asid = 0 if the device does not support asid.
77  * This is not an ABI break since it is set to 0 by the initializer anyway.
78  */
79 int vhost_vdpa_dma_map(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
80                        hwaddr size, void *vaddr, bool readonly)
81 {
82     struct vhost_msg_v2 msg = {};
83     int fd = v->device_fd;
84     int ret = 0;
85 
86     msg.type = v->msg_type;
87     msg.asid = asid;
88     msg.iotlb.iova = iova;
89     msg.iotlb.size = size;
90     msg.iotlb.uaddr = (uint64_t)(uintptr_t)vaddr;
91     msg.iotlb.perm = readonly ? VHOST_ACCESS_RO : VHOST_ACCESS_RW;
92     msg.iotlb.type = VHOST_IOTLB_UPDATE;
93 
94     trace_vhost_vdpa_dma_map(v, fd, msg.type, msg.asid, msg.iotlb.iova,
95                              msg.iotlb.size, msg.iotlb.uaddr, msg.iotlb.perm,
96                              msg.iotlb.type);
97 
98     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
99         error_report("failed to write, fd=%d, errno=%d (%s)",
100             fd, errno, strerror(errno));
101         return -EIO ;
102     }
103 
104     return ret;
105 }
106 
107 /*
108  * The caller must set asid = 0 if the device does not support asid.
109  * This is not an ABI break since it is set to 0 by the initializer anyway.
110  */
111 int vhost_vdpa_dma_unmap(struct vhost_vdpa *v, uint32_t asid, hwaddr iova,
112                          hwaddr size)
113 {
114     struct vhost_msg_v2 msg = {};
115     int fd = v->device_fd;
116     int ret = 0;
117 
118     msg.type = v->msg_type;
119     msg.asid = asid;
120     msg.iotlb.iova = iova;
121     msg.iotlb.size = size;
122     msg.iotlb.type = VHOST_IOTLB_INVALIDATE;
123 
124     trace_vhost_vdpa_dma_unmap(v, fd, msg.type, msg.asid, msg.iotlb.iova,
125                                msg.iotlb.size, msg.iotlb.type);
126 
127     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
128         error_report("failed to write, fd=%d, errno=%d (%s)",
129             fd, errno, strerror(errno));
130         return -EIO ;
131     }
132 
133     return ret;
134 }
135 
136 static void vhost_vdpa_listener_begin_batch(struct vhost_vdpa *v)
137 {
138     int fd = v->device_fd;
139     struct vhost_msg_v2 msg = {
140         .type = v->msg_type,
141         .iotlb.type = VHOST_IOTLB_BATCH_BEGIN,
142     };
143 
144     trace_vhost_vdpa_listener_begin_batch(v, fd, msg.type, msg.iotlb.type);
145     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
146         error_report("failed to write, fd=%d, errno=%d (%s)",
147                      fd, errno, strerror(errno));
148     }
149 }
150 
151 static void vhost_vdpa_iotlb_batch_begin_once(struct vhost_vdpa *v)
152 {
153     if (v->dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH) &&
154         !v->iotlb_batch_begin_sent) {
155         vhost_vdpa_listener_begin_batch(v);
156     }
157 
158     v->iotlb_batch_begin_sent = true;
159 }
160 
161 static void vhost_vdpa_listener_commit(MemoryListener *listener)
162 {
163     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
164     struct vhost_dev *dev = v->dev;
165     struct vhost_msg_v2 msg = {};
166     int fd = v->device_fd;
167 
168     if (!(dev->backend_cap & (0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH))) {
169         return;
170     }
171 
172     if (!v->iotlb_batch_begin_sent) {
173         return;
174     }
175 
176     msg.type = v->msg_type;
177     msg.iotlb.type = VHOST_IOTLB_BATCH_END;
178 
179     trace_vhost_vdpa_listener_commit(v, fd, msg.type, msg.iotlb.type);
180     if (write(fd, &msg, sizeof(msg)) != sizeof(msg)) {
181         error_report("failed to write, fd=%d, errno=%d (%s)",
182                      fd, errno, strerror(errno));
183     }
184 
185     v->iotlb_batch_begin_sent = false;
186 }
187 
188 static void vhost_vdpa_listener_region_add(MemoryListener *listener,
189                                            MemoryRegionSection *section)
190 {
191     DMAMap mem_region = {};
192     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
193     hwaddr iova;
194     Int128 llend, llsize;
195     void *vaddr;
196     int ret;
197 
198     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
199                                             v->iova_range.last)) {
200         return;
201     }
202 
203     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
204                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
205         error_report("%s received unaligned region", __func__);
206         return;
207     }
208 
209     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
210     llend = vhost_vdpa_section_end(section);
211     if (int128_ge(int128_make64(iova), llend)) {
212         return;
213     }
214 
215     memory_region_ref(section->mr);
216 
217     /* Here we assume that memory_region_is_ram(section->mr)==true */
218 
219     vaddr = memory_region_get_ram_ptr(section->mr) +
220             section->offset_within_region +
221             (iova - section->offset_within_address_space);
222 
223     trace_vhost_vdpa_listener_region_add(v, iova, int128_get64(llend),
224                                          vaddr, section->readonly);
225 
226     llsize = int128_sub(llend, int128_make64(iova));
227     if (v->shadow_data) {
228         int r;
229 
230         mem_region.translated_addr = (hwaddr)(uintptr_t)vaddr,
231         mem_region.size = int128_get64(llsize) - 1,
232         mem_region.perm = IOMMU_ACCESS_FLAG(true, section->readonly),
233 
234         r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
235         if (unlikely(r != IOVA_OK)) {
236             error_report("Can't allocate a mapping (%d)", r);
237             goto fail;
238         }
239 
240         iova = mem_region.iova;
241     }
242 
243     vhost_vdpa_iotlb_batch_begin_once(v);
244     ret = vhost_vdpa_dma_map(v, VHOST_VDPA_GUEST_PA_ASID, iova,
245                              int128_get64(llsize), vaddr, section->readonly);
246     if (ret) {
247         error_report("vhost vdpa map fail!");
248         goto fail_map;
249     }
250 
251     return;
252 
253 fail_map:
254     if (v->shadow_data) {
255         vhost_iova_tree_remove(v->iova_tree, mem_region);
256     }
257 
258 fail:
259     /*
260      * On the initfn path, store the first error in the container so we
261      * can gracefully fail.  Runtime, there's not much we can do other
262      * than throw a hardware error.
263      */
264     error_report("vhost-vdpa: DMA mapping failed, unable to continue");
265     return;
266 
267 }
268 
269 static void vhost_vdpa_listener_region_del(MemoryListener *listener,
270                                            MemoryRegionSection *section)
271 {
272     struct vhost_vdpa *v = container_of(listener, struct vhost_vdpa, listener);
273     hwaddr iova;
274     Int128 llend, llsize;
275     int ret;
276 
277     if (vhost_vdpa_listener_skipped_section(section, v->iova_range.first,
278                                             v->iova_range.last)) {
279         return;
280     }
281 
282     if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
283                  (section->offset_within_region & ~TARGET_PAGE_MASK))) {
284         error_report("%s received unaligned region", __func__);
285         return;
286     }
287 
288     iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
289     llend = vhost_vdpa_section_end(section);
290 
291     trace_vhost_vdpa_listener_region_del(v, iova, int128_get64(llend));
292 
293     if (int128_ge(int128_make64(iova), llend)) {
294         return;
295     }
296 
297     llsize = int128_sub(llend, int128_make64(iova));
298 
299     if (v->shadow_data) {
300         const DMAMap *result;
301         const void *vaddr = memory_region_get_ram_ptr(section->mr) +
302             section->offset_within_region +
303             (iova - section->offset_within_address_space);
304         DMAMap mem_region = {
305             .translated_addr = (hwaddr)(uintptr_t)vaddr,
306             .size = int128_get64(llsize) - 1,
307         };
308 
309         result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
310         if (!result) {
311             /* The memory listener map wasn't mapped */
312             return;
313         }
314         iova = result->iova;
315         vhost_iova_tree_remove(v->iova_tree, *result);
316     }
317     vhost_vdpa_iotlb_batch_begin_once(v);
318     ret = vhost_vdpa_dma_unmap(v, VHOST_VDPA_GUEST_PA_ASID, iova,
319                                int128_get64(llsize));
320     if (ret) {
321         error_report("vhost_vdpa dma unmap error!");
322     }
323 
324     memory_region_unref(section->mr);
325 }
326 /*
327  * IOTLB API is used by vhost-vdpa which requires incremental updating
328  * of the mapping. So we can not use generic vhost memory listener which
329  * depends on the addnop().
330  */
331 static const MemoryListener vhost_vdpa_memory_listener = {
332     .name = "vhost-vdpa",
333     .commit = vhost_vdpa_listener_commit,
334     .region_add = vhost_vdpa_listener_region_add,
335     .region_del = vhost_vdpa_listener_region_del,
336 };
337 
338 static int vhost_vdpa_call(struct vhost_dev *dev, unsigned long int request,
339                              void *arg)
340 {
341     struct vhost_vdpa *v = dev->opaque;
342     int fd = v->device_fd;
343     int ret;
344 
345     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
346 
347     ret = ioctl(fd, request, arg);
348     return ret < 0 ? -errno : ret;
349 }
350 
351 static int vhost_vdpa_add_status(struct vhost_dev *dev, uint8_t status)
352 {
353     uint8_t s;
354     int ret;
355 
356     trace_vhost_vdpa_add_status(dev, status);
357     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
358     if (ret < 0) {
359         return ret;
360     }
361 
362     s |= status;
363 
364     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &s);
365     if (ret < 0) {
366         return ret;
367     }
368 
369     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_STATUS, &s);
370     if (ret < 0) {
371         return ret;
372     }
373 
374     if (!(s & status)) {
375         return -EIO;
376     }
377 
378     return 0;
379 }
380 
381 /*
382  * The use of this function is for requests that only need to be
383  * applied once. Typically such request occurs at the beginning
384  * of operation, and before setting up queues. It should not be
385  * used for request that performs operation until all queues are
386  * set, which would need to check dev->vq_index_end instead.
387  */
388 static bool vhost_vdpa_first_dev(struct vhost_dev *dev)
389 {
390     struct vhost_vdpa *v = dev->opaque;
391 
392     return v->index == 0;
393 }
394 
395 static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
396                                        uint64_t *features)
397 {
398     int ret;
399 
400     ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features);
401     trace_vhost_vdpa_get_features(dev, *features);
402     return ret;
403 }
404 
405 static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
406 {
407     g_autoptr(GPtrArray) shadow_vqs = NULL;
408 
409     shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free);
410     for (unsigned n = 0; n < hdev->nvqs; ++n) {
411         VhostShadowVirtqueue *svq;
412 
413         svq = vhost_svq_new(v->shadow_vq_ops, v->shadow_vq_ops_opaque);
414         g_ptr_array_add(shadow_vqs, svq);
415     }
416 
417     v->shadow_vqs = g_steal_pointer(&shadow_vqs);
418 }
419 
420 static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp)
421 {
422     struct vhost_vdpa *v;
423     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
424     trace_vhost_vdpa_init(dev, opaque);
425     int ret;
426 
427     /*
428      * Similar to VFIO, we end up pinning all guest memory and have to
429      * disable discarding of RAM.
430      */
431     ret = ram_block_discard_disable(true);
432     if (ret) {
433         error_report("Cannot set discarding of RAM broken");
434         return ret;
435     }
436 
437     v = opaque;
438     v->dev = dev;
439     dev->opaque =  opaque ;
440     v->listener = vhost_vdpa_memory_listener;
441     v->msg_type = VHOST_IOTLB_MSG_V2;
442     vhost_vdpa_init_svq(dev, v);
443 
444     if (!vhost_vdpa_first_dev(dev)) {
445         return 0;
446     }
447 
448     vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
449                                VIRTIO_CONFIG_S_DRIVER);
450 
451     return 0;
452 }
453 
454 static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev,
455                                             int queue_index)
456 {
457     size_t page_size = qemu_real_host_page_size();
458     struct vhost_vdpa *v = dev->opaque;
459     VirtIODevice *vdev = dev->vdev;
460     VhostVDPAHostNotifier *n;
461 
462     n = &v->notifier[queue_index];
463 
464     if (n->addr) {
465         virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, false);
466         object_unparent(OBJECT(&n->mr));
467         munmap(n->addr, page_size);
468         n->addr = NULL;
469     }
470 }
471 
472 static int vhost_vdpa_host_notifier_init(struct vhost_dev *dev, int queue_index)
473 {
474     size_t page_size = qemu_real_host_page_size();
475     struct vhost_vdpa *v = dev->opaque;
476     VirtIODevice *vdev = dev->vdev;
477     VhostVDPAHostNotifier *n;
478     int fd = v->device_fd;
479     void *addr;
480     char *name;
481 
482     vhost_vdpa_host_notifier_uninit(dev, queue_index);
483 
484     n = &v->notifier[queue_index];
485 
486     addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, fd,
487                 queue_index * page_size);
488     if (addr == MAP_FAILED) {
489         goto err;
490     }
491 
492     name = g_strdup_printf("vhost-vdpa/host-notifier@%p mmaps[%d]",
493                            v, queue_index);
494     memory_region_init_ram_device_ptr(&n->mr, OBJECT(vdev), name,
495                                       page_size, addr);
496     g_free(name);
497 
498     if (virtio_queue_set_host_notifier_mr(vdev, queue_index, &n->mr, true)) {
499         object_unparent(OBJECT(&n->mr));
500         munmap(addr, page_size);
501         goto err;
502     }
503     n->addr = addr;
504 
505     return 0;
506 
507 err:
508     return -1;
509 }
510 
511 static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n)
512 {
513     int i;
514 
515     for (i = dev->vq_index; i < dev->vq_index + n; i++) {
516         vhost_vdpa_host_notifier_uninit(dev, i);
517     }
518 }
519 
520 static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev)
521 {
522     struct vhost_vdpa *v = dev->opaque;
523     int i;
524 
525     if (v->shadow_vqs_enabled) {
526         /* FIXME SVQ is not compatible with host notifiers mr */
527         return;
528     }
529 
530     for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) {
531         if (vhost_vdpa_host_notifier_init(dev, i)) {
532             goto err;
533         }
534     }
535 
536     return;
537 
538 err:
539     vhost_vdpa_host_notifiers_uninit(dev, i - dev->vq_index);
540     return;
541 }
542 
543 static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
544 {
545     struct vhost_vdpa *v = dev->opaque;
546     size_t idx;
547 
548     for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
549         vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
550     }
551     g_ptr_array_free(v->shadow_vqs, true);
552 }
553 
554 static int vhost_vdpa_cleanup(struct vhost_dev *dev)
555 {
556     struct vhost_vdpa *v;
557     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
558     v = dev->opaque;
559     trace_vhost_vdpa_cleanup(dev, v);
560     vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
561     memory_listener_unregister(&v->listener);
562     vhost_vdpa_svq_cleanup(dev);
563 
564     dev->opaque = NULL;
565     ram_block_discard_disable(false);
566 
567     return 0;
568 }
569 
570 static int vhost_vdpa_memslots_limit(struct vhost_dev *dev)
571 {
572     trace_vhost_vdpa_memslots_limit(dev, INT_MAX);
573     return INT_MAX;
574 }
575 
576 static int vhost_vdpa_set_mem_table(struct vhost_dev *dev,
577                                     struct vhost_memory *mem)
578 {
579     if (!vhost_vdpa_first_dev(dev)) {
580         return 0;
581     }
582 
583     trace_vhost_vdpa_set_mem_table(dev, mem->nregions, mem->padding);
584     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_MEM_TABLE) &&
585         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_REGIONS)) {
586         int i;
587         for (i = 0; i < mem->nregions; i++) {
588             trace_vhost_vdpa_dump_regions(dev, i,
589                                           mem->regions[i].guest_phys_addr,
590                                           mem->regions[i].memory_size,
591                                           mem->regions[i].userspace_addr,
592                                           mem->regions[i].flags_padding);
593         }
594     }
595     if (mem->padding) {
596         return -EINVAL;
597     }
598 
599     return 0;
600 }
601 
602 static int vhost_vdpa_set_features(struct vhost_dev *dev,
603                                    uint64_t features)
604 {
605     struct vhost_vdpa *v = dev->opaque;
606     int ret;
607 
608     if (!vhost_vdpa_first_dev(dev)) {
609         return 0;
610     }
611 
612     if (v->shadow_vqs_enabled) {
613         if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
614             /*
615              * QEMU is just trying to enable or disable logging. SVQ handles
616              * this sepparately, so no need to forward this.
617              */
618             v->acked_features = features;
619             return 0;
620         }
621 
622         v->acked_features = features;
623 
624         /* We must not ack _F_LOG if SVQ is enabled */
625         features &= ~BIT_ULL(VHOST_F_LOG_ALL);
626     }
627 
628     trace_vhost_vdpa_set_features(dev, features);
629     ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features);
630     if (ret) {
631         return ret;
632     }
633 
634     return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_FEATURES_OK);
635 }
636 
637 static int vhost_vdpa_set_backend_cap(struct vhost_dev *dev)
638 {
639     uint64_t features;
640     uint64_t f = 0x1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2 |
641         0x1ULL << VHOST_BACKEND_F_IOTLB_BATCH |
642         0x1ULL << VHOST_BACKEND_F_IOTLB_ASID;
643     int r;
644 
645     if (vhost_vdpa_call(dev, VHOST_GET_BACKEND_FEATURES, &features)) {
646         return -EFAULT;
647     }
648 
649     features &= f;
650 
651     if (vhost_vdpa_first_dev(dev)) {
652         r = vhost_vdpa_call(dev, VHOST_SET_BACKEND_FEATURES, &features);
653         if (r) {
654             return -EFAULT;
655         }
656     }
657 
658     dev->backend_cap = features;
659 
660     return 0;
661 }
662 
663 static int vhost_vdpa_get_device_id(struct vhost_dev *dev,
664                                     uint32_t *device_id)
665 {
666     int ret;
667     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_DEVICE_ID, device_id);
668     trace_vhost_vdpa_get_device_id(dev, *device_id);
669     return ret;
670 }
671 
672 static void vhost_vdpa_reset_svq(struct vhost_vdpa *v)
673 {
674     if (!v->shadow_vqs_enabled) {
675         return;
676     }
677 
678     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
679         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
680         vhost_svq_stop(svq);
681     }
682 }
683 
684 static int vhost_vdpa_reset_device(struct vhost_dev *dev)
685 {
686     struct vhost_vdpa *v = dev->opaque;
687     int ret;
688     uint8_t status = 0;
689 
690     vhost_vdpa_reset_svq(v);
691 
692     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status);
693     trace_vhost_vdpa_reset_device(dev, status);
694     return ret;
695 }
696 
697 static int vhost_vdpa_get_vq_index(struct vhost_dev *dev, int idx)
698 {
699     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
700 
701     trace_vhost_vdpa_get_vq_index(dev, idx, idx);
702     return idx;
703 }
704 
705 static int vhost_vdpa_set_vring_ready(struct vhost_dev *dev)
706 {
707     int i;
708     trace_vhost_vdpa_set_vring_ready(dev);
709     for (i = 0; i < dev->nvqs; ++i) {
710         struct vhost_vring_state state = {
711             .index = dev->vq_index + i,
712             .num = 1,
713         };
714         vhost_vdpa_call(dev, VHOST_VDPA_SET_VRING_ENABLE, &state);
715     }
716     return 0;
717 }
718 
719 static void vhost_vdpa_dump_config(struct vhost_dev *dev, const uint8_t *config,
720                                    uint32_t config_len)
721 {
722     int b, len;
723     char line[QEMU_HEXDUMP_LINE_LEN];
724 
725     for (b = 0; b < config_len; b += 16) {
726         len = config_len - b;
727         qemu_hexdump_line(line, b, config, len, false);
728         trace_vhost_vdpa_dump_config(dev, line);
729     }
730 }
731 
732 static int vhost_vdpa_set_config(struct vhost_dev *dev, const uint8_t *data,
733                                    uint32_t offset, uint32_t size,
734                                    uint32_t flags)
735 {
736     struct vhost_vdpa_config *config;
737     int ret;
738     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
739 
740     trace_vhost_vdpa_set_config(dev, offset, size, flags);
741     config = g_malloc(size + config_size);
742     config->off = offset;
743     config->len = size;
744     memcpy(config->buf, data, size);
745     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_SET_CONFIG) &&
746         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
747         vhost_vdpa_dump_config(dev, data, size);
748     }
749     ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_CONFIG, config);
750     g_free(config);
751     return ret;
752 }
753 
754 static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config,
755                                    uint32_t config_len, Error **errp)
756 {
757     struct vhost_vdpa_config *v_config;
758     unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
759     int ret;
760 
761     trace_vhost_vdpa_get_config(dev, config, config_len);
762     v_config = g_malloc(config_len + config_size);
763     v_config->len = config_len;
764     v_config->off = 0;
765     ret = vhost_vdpa_call(dev, VHOST_VDPA_GET_CONFIG, v_config);
766     memcpy(config, v_config->buf, config_len);
767     g_free(v_config);
768     if (trace_event_get_state_backends(TRACE_VHOST_VDPA_GET_CONFIG) &&
769         trace_event_get_state_backends(TRACE_VHOST_VDPA_DUMP_CONFIG)) {
770         vhost_vdpa_dump_config(dev, config, config_len);
771     }
772     return ret;
773  }
774 
775 static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev,
776                                          struct vhost_vring_state *ring)
777 {
778     trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num);
779     return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring);
780 }
781 
782 static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev,
783                                          struct vhost_vring_file *file)
784 {
785     trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd);
786     return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file);
787 }
788 
789 static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev,
790                                          struct vhost_vring_file *file)
791 {
792     trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd);
793     return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file);
794 }
795 
796 static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev,
797                                          struct vhost_vring_addr *addr)
798 {
799     trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags,
800                                 addr->desc_user_addr, addr->used_user_addr,
801                                 addr->avail_user_addr,
802                                 addr->log_guest_addr);
803 
804     return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr);
805 
806 }
807 
808 /**
809  * Set the shadow virtqueue descriptors to the device
810  *
811  * @dev: The vhost device model
812  * @svq: The shadow virtqueue
813  * @idx: The index of the virtqueue in the vhost device
814  * @errp: Error
815  *
816  * Note that this function does not rewind kick file descriptor if cannot set
817  * call one.
818  */
819 static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
820                                   VhostShadowVirtqueue *svq, unsigned idx,
821                                   Error **errp)
822 {
823     struct vhost_vring_file file = {
824         .index = dev->vq_index + idx,
825     };
826     const EventNotifier *event_notifier = &svq->hdev_kick;
827     int r;
828 
829     r = event_notifier_init(&svq->hdev_kick, 0);
830     if (r != 0) {
831         error_setg_errno(errp, -r, "Couldn't create kick event notifier");
832         goto err_init_hdev_kick;
833     }
834 
835     r = event_notifier_init(&svq->hdev_call, 0);
836     if (r != 0) {
837         error_setg_errno(errp, -r, "Couldn't create call event notifier");
838         goto err_init_hdev_call;
839     }
840 
841     file.fd = event_notifier_get_fd(event_notifier);
842     r = vhost_vdpa_set_vring_dev_kick(dev, &file);
843     if (unlikely(r != 0)) {
844         error_setg_errno(errp, -r, "Can't set device kick fd");
845         goto err_init_set_dev_fd;
846     }
847 
848     event_notifier = &svq->hdev_call;
849     file.fd = event_notifier_get_fd(event_notifier);
850     r = vhost_vdpa_set_vring_dev_call(dev, &file);
851     if (unlikely(r != 0)) {
852         error_setg_errno(errp, -r, "Can't set device call fd");
853         goto err_init_set_dev_fd;
854     }
855 
856     return 0;
857 
858 err_init_set_dev_fd:
859     event_notifier_set_handler(&svq->hdev_call, NULL);
860 
861 err_init_hdev_call:
862     event_notifier_cleanup(&svq->hdev_kick);
863 
864 err_init_hdev_kick:
865     return r;
866 }
867 
868 /**
869  * Unmap a SVQ area in the device
870  */
871 static void vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr addr)
872 {
873     const DMAMap needle = {
874         .translated_addr = addr,
875     };
876     const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, &needle);
877     hwaddr size;
878     int r;
879 
880     if (unlikely(!result)) {
881         error_report("Unable to find SVQ address to unmap");
882         return;
883     }
884 
885     size = ROUND_UP(result->size, qemu_real_host_page_size());
886     r = vhost_vdpa_dma_unmap(v, v->address_space_id, result->iova, size);
887     if (unlikely(r < 0)) {
888         error_report("Unable to unmap SVQ vring: %s (%d)", g_strerror(-r), -r);
889         return;
890     }
891 
892     vhost_iova_tree_remove(v->iova_tree, *result);
893 }
894 
895 static void vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
896                                        const VhostShadowVirtqueue *svq)
897 {
898     struct vhost_vdpa *v = dev->opaque;
899     struct vhost_vring_addr svq_addr;
900 
901     vhost_svq_get_vring_addr(svq, &svq_addr);
902 
903     vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr);
904 
905     vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr);
906 }
907 
908 /**
909  * Map the SVQ area in the device
910  *
911  * @v: Vhost-vdpa device
912  * @needle: The area to search iova
913  * @errorp: Error pointer
914  */
915 static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle,
916                                     Error **errp)
917 {
918     int r;
919 
920     r = vhost_iova_tree_map_alloc(v->iova_tree, needle);
921     if (unlikely(r != IOVA_OK)) {
922         error_setg(errp, "Cannot allocate iova (%d)", r);
923         return false;
924     }
925 
926     r = vhost_vdpa_dma_map(v, v->address_space_id, needle->iova,
927                            needle->size + 1,
928                            (void *)(uintptr_t)needle->translated_addr,
929                            needle->perm == IOMMU_RO);
930     if (unlikely(r != 0)) {
931         error_setg_errno(errp, -r, "Cannot map region to device");
932         vhost_iova_tree_remove(v->iova_tree, *needle);
933     }
934 
935     return r == 0;
936 }
937 
938 /**
939  * Map the shadow virtqueue rings in the device
940  *
941  * @dev: The vhost device
942  * @svq: The shadow virtqueue
943  * @addr: Assigned IOVA addresses
944  * @errp: Error pointer
945  */
946 static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
947                                      const VhostShadowVirtqueue *svq,
948                                      struct vhost_vring_addr *addr,
949                                      Error **errp)
950 {
951     ERRP_GUARD();
952     DMAMap device_region, driver_region;
953     struct vhost_vring_addr svq_addr;
954     struct vhost_vdpa *v = dev->opaque;
955     size_t device_size = vhost_svq_device_area_size(svq);
956     size_t driver_size = vhost_svq_driver_area_size(svq);
957     size_t avail_offset;
958     bool ok;
959 
960     vhost_svq_get_vring_addr(svq, &svq_addr);
961 
962     driver_region = (DMAMap) {
963         .translated_addr = svq_addr.desc_user_addr,
964         .size = driver_size - 1,
965         .perm = IOMMU_RO,
966     };
967     ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp);
968     if (unlikely(!ok)) {
969         error_prepend(errp, "Cannot create vq driver region: ");
970         return false;
971     }
972     addr->desc_user_addr = driver_region.iova;
973     avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr;
974     addr->avail_user_addr = driver_region.iova + avail_offset;
975 
976     device_region = (DMAMap) {
977         .translated_addr = svq_addr.used_user_addr,
978         .size = device_size - 1,
979         .perm = IOMMU_RW,
980     };
981     ok = vhost_vdpa_svq_map_ring(v, &device_region, errp);
982     if (unlikely(!ok)) {
983         error_prepend(errp, "Cannot create vq device region: ");
984         vhost_vdpa_svq_unmap_ring(v, driver_region.translated_addr);
985     }
986     addr->used_user_addr = device_region.iova;
987 
988     return ok;
989 }
990 
991 static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
992                                  VhostShadowVirtqueue *svq, unsigned idx,
993                                  Error **errp)
994 {
995     uint16_t vq_index = dev->vq_index + idx;
996     struct vhost_vring_state s = {
997         .index = vq_index,
998     };
999     int r;
1000 
1001     r = vhost_vdpa_set_dev_vring_base(dev, &s);
1002     if (unlikely(r)) {
1003         error_setg_errno(errp, -r, "Cannot set vring base");
1004         return false;
1005     }
1006 
1007     r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp);
1008     return r == 0;
1009 }
1010 
1011 static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
1012 {
1013     struct vhost_vdpa *v = dev->opaque;
1014     Error *err = NULL;
1015     unsigned i;
1016 
1017     if (!v->shadow_vqs_enabled) {
1018         return true;
1019     }
1020 
1021     for (i = 0; i < v->shadow_vqs->len; ++i) {
1022         VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i);
1023         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1024         struct vhost_vring_addr addr = {
1025             .index = dev->vq_index + i,
1026         };
1027         int r;
1028         bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err);
1029         if (unlikely(!ok)) {
1030             goto err;
1031         }
1032 
1033         vhost_svq_start(svq, dev->vdev, vq, v->iova_tree);
1034         ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err);
1035         if (unlikely(!ok)) {
1036             goto err_map;
1037         }
1038 
1039         /* Override vring GPA set by vhost subsystem */
1040         r = vhost_vdpa_set_vring_dev_addr(dev, &addr);
1041         if (unlikely(r != 0)) {
1042             error_setg_errno(&err, -r, "Cannot set device address");
1043             goto err_set_addr;
1044         }
1045     }
1046 
1047     return true;
1048 
1049 err_set_addr:
1050     vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i));
1051 
1052 err_map:
1053     vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i));
1054 
1055 err:
1056     error_reportf_err(err, "Cannot setup SVQ %u: ", i);
1057     for (unsigned j = 0; j < i; ++j) {
1058         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
1059         vhost_vdpa_svq_unmap_rings(dev, svq);
1060         vhost_svq_stop(svq);
1061     }
1062 
1063     return false;
1064 }
1065 
1066 static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
1067 {
1068     struct vhost_vdpa *v = dev->opaque;
1069 
1070     if (!v->shadow_vqs_enabled) {
1071         return;
1072     }
1073 
1074     for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
1075         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
1076         vhost_vdpa_svq_unmap_rings(dev, svq);
1077 
1078         event_notifier_cleanup(&svq->hdev_kick);
1079         event_notifier_cleanup(&svq->hdev_call);
1080     }
1081 }
1082 
1083 static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started)
1084 {
1085     struct vhost_vdpa *v = dev->opaque;
1086     bool ok;
1087     trace_vhost_vdpa_dev_start(dev, started);
1088 
1089     if (started) {
1090         vhost_vdpa_host_notifiers_init(dev);
1091         ok = vhost_vdpa_svqs_start(dev);
1092         if (unlikely(!ok)) {
1093             return -1;
1094         }
1095         vhost_vdpa_set_vring_ready(dev);
1096     } else {
1097         vhost_vdpa_svqs_stop(dev);
1098         vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs);
1099     }
1100 
1101     if (dev->vq_index + dev->nvqs != dev->vq_index_end) {
1102         return 0;
1103     }
1104 
1105     if (started) {
1106         memory_listener_register(&v->listener, &address_space_memory);
1107         return vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
1108     } else {
1109         vhost_vdpa_reset_device(dev);
1110         vhost_vdpa_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE |
1111                                    VIRTIO_CONFIG_S_DRIVER);
1112         memory_listener_unregister(&v->listener);
1113 
1114         return 0;
1115     }
1116 }
1117 
1118 static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base,
1119                                      struct vhost_log *log)
1120 {
1121     struct vhost_vdpa *v = dev->opaque;
1122     if (v->shadow_vqs_enabled || !vhost_vdpa_first_dev(dev)) {
1123         return 0;
1124     }
1125 
1126     trace_vhost_vdpa_set_log_base(dev, base, log->size, log->refcnt, log->fd,
1127                                   log->log);
1128     return vhost_vdpa_call(dev, VHOST_SET_LOG_BASE, &base);
1129 }
1130 
1131 static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev,
1132                                        struct vhost_vring_addr *addr)
1133 {
1134     struct vhost_vdpa *v = dev->opaque;
1135 
1136     if (v->shadow_vqs_enabled) {
1137         /*
1138          * Device vring addr was set at device start. SVQ base is handled by
1139          * VirtQueue code.
1140          */
1141         return 0;
1142     }
1143 
1144     return vhost_vdpa_set_vring_dev_addr(dev, addr);
1145 }
1146 
1147 static int vhost_vdpa_set_vring_num(struct vhost_dev *dev,
1148                                       struct vhost_vring_state *ring)
1149 {
1150     trace_vhost_vdpa_set_vring_num(dev, ring->index, ring->num);
1151     return vhost_vdpa_call(dev, VHOST_SET_VRING_NUM, ring);
1152 }
1153 
1154 static int vhost_vdpa_set_vring_base(struct vhost_dev *dev,
1155                                        struct vhost_vring_state *ring)
1156 {
1157     struct vhost_vdpa *v = dev->opaque;
1158     VirtQueue *vq = virtio_get_queue(dev->vdev, ring->index);
1159 
1160     /*
1161      * vhost-vdpa devices does not support in-flight requests. Set all of them
1162      * as available.
1163      *
1164      * TODO: This is ok for networking, but other kinds of devices might
1165      * have problems with these retransmissions.
1166      */
1167     while (virtqueue_rewind(vq, 1)) {
1168         continue;
1169     }
1170     if (v->shadow_vqs_enabled) {
1171         /*
1172          * Device vring base was set at device start. SVQ base is handled by
1173          * VirtQueue code.
1174          */
1175         return 0;
1176     }
1177 
1178     return vhost_vdpa_set_dev_vring_base(dev, ring);
1179 }
1180 
1181 static int vhost_vdpa_get_vring_base(struct vhost_dev *dev,
1182                                        struct vhost_vring_state *ring)
1183 {
1184     struct vhost_vdpa *v = dev->opaque;
1185     int ret;
1186 
1187     if (v->shadow_vqs_enabled) {
1188         ring->num = virtio_queue_get_last_avail_idx(dev->vdev, ring->index);
1189         return 0;
1190     }
1191 
1192     ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring);
1193     trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num);
1194     return ret;
1195 }
1196 
1197 static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev,
1198                                        struct vhost_vring_file *file)
1199 {
1200     struct vhost_vdpa *v = dev->opaque;
1201     int vdpa_idx = file->index - dev->vq_index;
1202 
1203     if (v->shadow_vqs_enabled) {
1204         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1205         vhost_svq_set_svq_kick_fd(svq, file->fd);
1206         return 0;
1207     } else {
1208         return vhost_vdpa_set_vring_dev_kick(dev, file);
1209     }
1210 }
1211 
1212 static int vhost_vdpa_set_vring_call(struct vhost_dev *dev,
1213                                        struct vhost_vring_file *file)
1214 {
1215     struct vhost_vdpa *v = dev->opaque;
1216 
1217     if (v->shadow_vqs_enabled) {
1218         int vdpa_idx = file->index - dev->vq_index;
1219         VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx);
1220 
1221         vhost_svq_set_svq_call_fd(svq, file->fd);
1222         return 0;
1223     } else {
1224         return vhost_vdpa_set_vring_dev_call(dev, file);
1225     }
1226 }
1227 
1228 static int vhost_vdpa_get_features(struct vhost_dev *dev,
1229                                      uint64_t *features)
1230 {
1231     struct vhost_vdpa *v = dev->opaque;
1232     int ret = vhost_vdpa_get_dev_features(dev, features);
1233 
1234     if (ret == 0 && v->shadow_vqs_enabled) {
1235         /* Add SVQ logging capabilities */
1236         *features |= BIT_ULL(VHOST_F_LOG_ALL);
1237     }
1238 
1239     return ret;
1240 }
1241 
1242 static int vhost_vdpa_set_owner(struct vhost_dev *dev)
1243 {
1244     if (!vhost_vdpa_first_dev(dev)) {
1245         return 0;
1246     }
1247 
1248     trace_vhost_vdpa_set_owner(dev);
1249     return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
1250 }
1251 
1252 static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
1253                     struct vhost_vring_addr *addr, struct vhost_virtqueue *vq)
1254 {
1255     assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
1256     addr->desc_user_addr = (uint64_t)(unsigned long)vq->desc_phys;
1257     addr->avail_user_addr = (uint64_t)(unsigned long)vq->avail_phys;
1258     addr->used_user_addr = (uint64_t)(unsigned long)vq->used_phys;
1259     trace_vhost_vdpa_vq_get_addr(dev, vq, addr->desc_user_addr,
1260                                  addr->avail_user_addr, addr->used_user_addr);
1261     return 0;
1262 }
1263 
1264 static bool  vhost_vdpa_force_iommu(struct vhost_dev *dev)
1265 {
1266     return true;
1267 }
1268 
1269 const VhostOps vdpa_ops = {
1270         .backend_type = VHOST_BACKEND_TYPE_VDPA,
1271         .vhost_backend_init = vhost_vdpa_init,
1272         .vhost_backend_cleanup = vhost_vdpa_cleanup,
1273         .vhost_set_log_base = vhost_vdpa_set_log_base,
1274         .vhost_set_vring_addr = vhost_vdpa_set_vring_addr,
1275         .vhost_set_vring_num = vhost_vdpa_set_vring_num,
1276         .vhost_set_vring_base = vhost_vdpa_set_vring_base,
1277         .vhost_get_vring_base = vhost_vdpa_get_vring_base,
1278         .vhost_set_vring_kick = vhost_vdpa_set_vring_kick,
1279         .vhost_set_vring_call = vhost_vdpa_set_vring_call,
1280         .vhost_get_features = vhost_vdpa_get_features,
1281         .vhost_set_backend_cap = vhost_vdpa_set_backend_cap,
1282         .vhost_set_owner = vhost_vdpa_set_owner,
1283         .vhost_set_vring_endian = NULL,
1284         .vhost_backend_memslots_limit = vhost_vdpa_memslots_limit,
1285         .vhost_set_mem_table = vhost_vdpa_set_mem_table,
1286         .vhost_set_features = vhost_vdpa_set_features,
1287         .vhost_reset_device = vhost_vdpa_reset_device,
1288         .vhost_get_vq_index = vhost_vdpa_get_vq_index,
1289         .vhost_get_config  = vhost_vdpa_get_config,
1290         .vhost_set_config = vhost_vdpa_set_config,
1291         .vhost_requires_shm_log = NULL,
1292         .vhost_migration_done = NULL,
1293         .vhost_backend_can_merge = NULL,
1294         .vhost_net_set_mtu = NULL,
1295         .vhost_set_iotlb_callback = NULL,
1296         .vhost_send_device_iotlb_msg = NULL,
1297         .vhost_dev_start = vhost_vdpa_dev_start,
1298         .vhost_get_device_id = vhost_vdpa_get_device_id,
1299         .vhost_vq_get_addr = vhost_vdpa_vq_get_addr,
1300         .vhost_force_iommu = vhost_vdpa_force_iommu,
1301 };
1302