xref: /openbmc/qemu/hw/remote/vfio-user-obj.c (revision c79aa350)
1 /**
2  * QEMU vfio-user-server server object
3  *
4  * Copyright © 2022 Oracle and/or its affiliates.
5  *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  *
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 /**
13  * Usage: add options:
14  *     -machine x-remote,vfio-user=on,auto-shutdown=on
15  *     -device <PCI-device>,id=<pci-dev-id>
16  *     -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
17  *             device=<pci-dev-id>
18  *
19  * Note that x-vfio-user-server object must be used with x-remote machine only.
 * The server currently supports only PCI devices.
21  *
22  * type - SocketAddress type - presently "unix" alone is supported. Required
23  *        option
24  *
25  * path - named unix socket, it will be created by the server. It is
26  *        a required option
27  *
28  * device - id of a device on the server, a required option. PCI devices
29  *          alone are supported presently.
30  *
31  * notes - x-vfio-user-server could block IO and monitor during the
32  *         initialization phase.
33  */
34 
35 #include "qemu/osdep.h"
36 
37 #include "qom/object.h"
38 #include "qom/object_interfaces.h"
39 #include "qemu/error-report.h"
40 #include "trace.h"
41 #include "sysemu/runstate.h"
42 #include "hw/boards.h"
43 #include "hw/remote/machine.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-visit-sockets.h"
46 #include "qapi/qapi-events-misc.h"
47 #include "qemu/notify.h"
48 #include "qemu/thread.h"
49 #include "qemu/main-loop.h"
50 #include "sysemu/sysemu.h"
51 #include "libvfio-user.h"
52 #include "hw/qdev-core.h"
53 #include "hw/pci/pci.h"
54 #include "qemu/timer.h"
55 #include "exec/memory.h"
56 #include "hw/pci/msi.h"
57 #include "hw/pci/msix.h"
58 #include "hw/remote/vfio-user-obj.h"
59 
60 #define TYPE_VFU_OBJECT "x-vfio-user-server"
61 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
62 
/**
 * VFU_OBJECT_ERROR - reports an error message.
 *
 * If auto-shutdown is enabled, the error is fatal: it is raised via
 * &error_abort and the machine is aborted. Otherwise the error is only
 * logged with error_report(). The @o parameter is kept for call-site
 * context even though the current implementation does not use it.
 *
 * Wrapped in do { } while (0) so the macro behaves like a single
 * statement; the previous bare-brace form broke if/else call sites.
 */
#define VFU_OBJECT_ERROR(o, fmt, ...)                                     \
    do {                                                                  \
        if (vfu_object_auto_shutdown()) {                                 \
            error_setg(&error_abort, (fmt), ## __VA_ARGS__);              \
        } else {                                                          \
            error_report((fmt), ## __VA_ARGS__);                          \
        }                                                                 \
    } while (0)
struct VfuObjectClass {
    ObjectClass parent_class;

    /*
     * Number of VfuObject instances currently alive. Finalizing the last
     * instance may request a system shutdown (see vfu_object_finalize()).
     */
    unsigned int nr_devs;
};
82 
struct VfuObject {
    /* private */
    Object parent;

    /* listen address for the server; only UNIX sockets are supported */
    SocketAddress *socket;

    /* QOM id of the PCI device exported by this server */
    char *device;

    /* error deferred from instance_init, reported on first property set */
    Error *err;

    /* completes server setup once the machine is fully initialized */
    Notifier machine_done;

    /* libvfio-user server context; non-NULL once the server is set up */
    vfu_ctx_t *vfu_ctx;

    /* device resolved from the 'device' property (holds a reference) */
    PCIDevice *pci_dev;

    /* blocks hot-unplug of pci_dev while this server exists */
    Error *unplug_blocker;

    /* fd watched by the main loop for vfio-user messages; -1 if unused */
    int vfu_poll_fd;

    /* QEMU's original MSI/MSI-X callbacks, restored on teardown */
    MSITriggerFunc *default_msi_trigger;
    MSIPrepareMessageFunc *default_msi_prepare_message;
    MSIxPrepareMessageFunc *default_msix_prepare_message;
};
107 
108 static void vfu_object_init_ctx(VfuObject *o, Error **errp);
109 
110 static bool vfu_object_auto_shutdown(void)
111 {
112     bool auto_shutdown = true;
113     Error *local_err = NULL;
114 
115     if (!current_machine) {
116         return auto_shutdown;
117     }
118 
119     auto_shutdown = object_property_get_bool(OBJECT(current_machine),
120                                              "auto-shutdown",
121                                              &local_err);
122 
123     /*
124      * local_err would be set if no such property exists - safe to ignore.
125      * Unlikely scenario as auto-shutdown is always defined for
126      * TYPE_REMOTE_MACHINE, and  TYPE_VFU_OBJECT only works with
127      * TYPE_REMOTE_MACHINE
128      */
129     if (local_err) {
130         auto_shutdown = true;
131         error_free(local_err);
132     }
133 
134     return auto_shutdown;
135 }
136 
137 static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
138                                   void *opaque, Error **errp)
139 {
140     VfuObject *o = VFU_OBJECT(obj);
141 
142     if (o->vfu_ctx) {
143         error_setg(errp, "vfu: Unable to set socket property - server busy");
144         return;
145     }
146 
147     qapi_free_SocketAddress(o->socket);
148 
149     o->socket = NULL;
150 
151     visit_type_SocketAddress(v, name, &o->socket, errp);
152 
153     if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
154         error_setg(errp, "vfu: Unsupported socket type - %s",
155                    SocketAddressType_str(o->socket->type));
156         qapi_free_SocketAddress(o->socket);
157         o->socket = NULL;
158         return;
159     }
160 
161     trace_vfu_prop("socket", o->socket->u.q_unix.path);
162 
163     vfu_object_init_ctx(o, errp);
164 }
165 
166 static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
167 {
168     VfuObject *o = VFU_OBJECT(obj);
169 
170     if (o->vfu_ctx) {
171         error_setg(errp, "vfu: Unable to set device property - server busy");
172         return;
173     }
174 
175     g_free(o->device);
176 
177     o->device = g_strdup(str);
178 
179     trace_vfu_prop("device", str);
180 
181     vfu_object_init_ctx(o, errp);
182 }
183 
/**
 * vfu_object_ctx_run - main-loop fd handler that services pending
 *     vfio-user messages via vfu_run_ctx().
 *
 * Loops until vfu_run_ctx() returns 0. EINTR is retried. ENOTCONN
 * (client hangup) emits a VFU_CLIENT_HANGUP QAPI event, removes the fd
 * handler and unparents this object, which tears the server down. Any
 * other failure is reported through VFU_OBJECT_ERROR.
 */
static void vfu_object_ctx_run(void *opaque)
{
    VfuObject *o = opaque;
    const char *vfu_id;
    char *vfu_path, *pci_dev_path;
    int ret = -1;

    while (ret != 0) {
        ret = vfu_run_ctx(o->vfu_ctx);
        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            } else if (errno == ENOTCONN) {
                vfu_id = object_get_canonical_path_component(OBJECT(o));
                vfu_path = object_get_canonical_path(OBJECT(o));
                g_assert(o->pci_dev);
                pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
                /* o->device is a required property and is non-NULL here */
                g_assert(o->device);
                qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
                                                  o->device, pci_dev_path);
                /* stop polling before destroying the object */
                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
                o->vfu_poll_fd = -1;
                object_unparent(OBJECT(o));
                g_free(vfu_path);
                g_free(pci_dev_path);
                break;
            } else {
                VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
                                 o->device, strerror(errno));
                break;
            }
        }
    }
}
219 
220 static void vfu_object_attach_ctx(void *opaque)
221 {
222     VfuObject *o = opaque;
223     GPollFD pfds[1];
224     int ret;
225 
226     qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
227 
228     pfds[0].fd = o->vfu_poll_fd;
229     pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
230 
231 retry_attach:
232     ret = vfu_attach_ctx(o->vfu_ctx);
233     if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
234         /**
235          * vfu_object_attach_ctx can block QEMU's main loop
236          * during attach - the monitor and other IO
237          * could be unresponsive during this time.
238          */
239         (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
240         goto retry_attach;
241     } else if (ret < 0) {
242         VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s",
243                          o->device, strerror(errno));
244         return;
245     }
246 
247     o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
248     if (o->vfu_poll_fd < 0) {
249         VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
250         return;
251     }
252 
253     qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
254 }
255 
256 static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
257                                      size_t count, loff_t offset,
258                                      const bool is_write)
259 {
260     VfuObject *o = vfu_get_private(vfu_ctx);
261     uint32_t pci_access_width = sizeof(uint32_t);
262     size_t bytes = count;
263     uint32_t val = 0;
264     char *ptr = buf;
265     int len;
266 
267     /*
268      * Writes to the BAR registers would trigger an update to the
269      * global Memory and IO AddressSpaces. But the remote device
270      * never uses the global AddressSpaces, therefore overlapping
271      * memory regions are not a problem
272      */
273     while (bytes > 0) {
274         len = (bytes > pci_access_width) ? pci_access_width : bytes;
275         if (is_write) {
276             memcpy(&val, ptr, len);
277             pci_host_config_write_common(o->pci_dev, offset,
278                                          pci_config_size(o->pci_dev),
279                                          val, len);
280             trace_vfu_cfg_write(offset, val);
281         } else {
282             val = pci_host_config_read_common(o->pci_dev, offset,
283                                               pci_config_size(o->pci_dev), len);
284             memcpy(ptr, &val, len);
285             trace_vfu_cfg_read(offset, val);
286         }
287         offset += len;
288         ptr += len;
289         bytes -= len;
290     }
291 
292     return count;
293 }
294 
/**
 * dma_register - libvfio-user callback for a newly shared DMA region.
 *
 * Wraps the client memory mapped at info->vaddr in a RAM MemoryRegion
 * and inserts it into the device's IOMMU address space at the guest
 * IOVA, so device DMA reaches client memory. Regions without a local
 * mapping (vaddr == NULL) are ignored.
 */
static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    AddressSpace *dma_as = NULL;
    MemoryRegion *subregion = NULL;
    g_autofree char *name = NULL;
    struct iovec *iov = &info->iova;

    if (!info->vaddr) {
        return;
    }

    /* Region name is unique per mapping: device id + host vaddr */
    name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
                           (uint64_t)info->vaddr);

    subregion = g_new0(MemoryRegion, 1);

    memory_region_init_ram_ptr(subregion, NULL, name,
                               iov->iov_len, info->vaddr);

    dma_as = pci_device_iommu_address_space(o->pci_dev);

    memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion);

    trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
}
321 
322 static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
323 {
324     VfuObject *o = vfu_get_private(vfu_ctx);
325     AddressSpace *dma_as = NULL;
326     MemoryRegion *mr = NULL;
327     ram_addr_t offset;
328 
329     mr = memory_region_from_host(info->vaddr, &offset);
330     if (!mr) {
331         return;
332     }
333 
334     dma_as = pci_device_iommu_address_space(o->pci_dev);
335 
336     memory_region_del_subregion(dma_as->root, mr);
337 
338     object_unparent((OBJECT(mr)));
339 
340     trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
341 }
342 
/**
 * vfu_object_mr_rw - access a MemoryRegion on behalf of the client.
 * @mr: target region
 * @buf: source (write) or destination (read) buffer
 * @offset: offset within @mr
 * @size: number of bytes to transfer
 * @is_write: access direction
 *
 * RAM-backed regions are accessed directly with memcpy; MMIO regions are
 * dispatched in naturally sized chunks, mirroring the logic in
 * flatview_read/write_continue().
 *
 * Returns 0 on success, -1 if any MMIO dispatch fails.
 */
static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
                            hwaddr size, const bool is_write)
{
    uint8_t *ptr = buf;
    bool release_lock = false;
    uint8_t *ram_ptr = NULL;
    MemTxResult result;
    int access_size;
    uint64_t val;

    if (memory_access_is_direct(mr, is_write)) {
        /**
         * Some devices expose a PCI expansion ROM, which could be buffer
         * based as compared to other regions which are primarily based on
         * MemoryRegionOps. memory_region_find() would already check
         * for buffer overflow, we don't need to repeat it here.
         */
        ram_ptr = memory_region_get_ram_ptr(mr);

        if (is_write) {
            memcpy((ram_ptr + offset), buf, size);
        } else {
            memcpy(buf, (ram_ptr + offset), size);
        }

        return 0;
    }

    while (size) {
        /**
         * The read/write logic used below is similar to the ones in
         * flatview_read/write_continue()
         */
        release_lock = prepare_mmio_access(mr);

        /* split to the widest access the region supports at this offset */
        access_size = memory_access_size(mr, size, offset);

        if (is_write) {
            val = ldn_he_p(ptr, access_size);

            result = memory_region_dispatch_write(mr, offset, val,
                                                  size_memop(access_size),
                                                  MEMTXATTRS_UNSPECIFIED);
        } else {
            result = memory_region_dispatch_read(mr, offset, &val,
                                                 size_memop(access_size),
                                                 MEMTXATTRS_UNSPECIFIED);

            stn_he_p(ptr, access_size, val);
        }

        /* drop the BQL taken by prepare_mmio_access(), if any */
        if (release_lock) {
            qemu_mutex_unlock_iothread();
            release_lock = false;
        }

        if (result != MEMTX_OK) {
            return -1;
        }

        size -= access_size;
        ptr += access_size;
        offset += access_size;
    }

    return 0;
}
410 
/**
 * vfu_object_bar_rw - perform a client access to a BAR, walking the
 *     sections of the BAR's memory region.
 *
 * Returns the number of bytes successfully transferred; stops early
 * (with a warning) on an unmapped address, a write to a read-only
 * section, or a failed access. Each memory_region_find() reference is
 * dropped before returning.
 */
static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
                                hwaddr bar_offset, char * const buf,
                                hwaddr len, const bool is_write)
{
    MemoryRegionSection section = { 0 };
    uint8_t *ptr = (uint8_t *)buf;
    MemoryRegion *section_mr = NULL;
    uint64_t section_size;
    hwaddr section_offset;
    hwaddr size = 0;

    while (len) {
        /* takes a reference on section.mr; released below */
        section = memory_region_find(pci_dev->io_regions[pci_bar].memory,
                                     bar_offset, len);

        if (!section.mr) {
            warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
            return size;
        }

        section_mr = section.mr;
        section_offset = section.offset_within_region;
        section_size = int128_get64(section.size);

        if (is_write && section_mr->readonly) {
            warn_report("vfu: attempting to write to readonly region in "
                        "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
                        pci_bar, bar_offset,
                        (bar_offset + section_size));
            memory_region_unref(section_mr);
            return size;
        }

        if (vfu_object_mr_rw(section_mr, ptr, section_offset,
                             section_size, is_write)) {
            warn_report("vfu: failed to %s "
                        "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
                        is_write ? "write to" : "read from", bar_offset,
                        (bar_offset + section_size), pci_bar);
            memory_region_unref(section_mr);
            return size;
        }

        size += section_size;
        bar_offset += section_size;
        ptr += section_size;
        len -= section_size;

        memory_region_unref(section_mr);
    }

    return size;
}
464 
/**
 * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs.
 *
 * To create handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would
 * define vfu_object_bar2_handler. Each generated handler simply forwards
 * the access to vfu_object_bar_rw() with its BAR number baked in.
 */
#define VFU_OBJECT_BAR_HANDLER(BAR_NO)                                         \
    static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx,        \
                                        char * const buf, size_t count,        \
                                        loff_t offset, const bool is_write)    \
    {                                                                          \
        VfuObject *o = vfu_get_private(vfu_ctx);                               \
        PCIDevice *pci_dev = o->pci_dev;                                       \
                                                                               \
        return vfu_object_bar_rw(pci_dev, BAR_NO, offset,                      \
                                 buf, count, is_write);                        \
    }                                                                          \

/* One handler per PCI region: BARs 0-5 plus the expansion ROM (6) */
VFU_OBJECT_BAR_HANDLER(0)
VFU_OBJECT_BAR_HANDLER(1)
VFU_OBJECT_BAR_HANDLER(2)
VFU_OBJECT_BAR_HANDLER(3)
VFU_OBJECT_BAR_HANDLER(4)
VFU_OBJECT_BAR_HANDLER(5)
VFU_OBJECT_BAR_HANDLER(6)
490 
/* Dispatch table indexed by PCI region number, consumed at registration */
static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
    &vfu_object_bar0_handler,
    &vfu_object_bar1_handler,
    &vfu_object_bar2_handler,
    &vfu_object_bar3_handler,
    &vfu_object_bar4_handler,
    &vfu_object_bar5_handler,
    &vfu_object_bar6_handler,
};
500 
501 /**
502  * vfu_object_register_bars - Identify active BAR regions of pdev and setup
503  *                            callbacks to handle read/write accesses
504  */
505 static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
506 {
507     int flags = VFU_REGION_FLAG_RW;
508     int i;
509 
510     for (i = 0; i < PCI_NUM_REGIONS; i++) {
511         if (!pdev->io_regions[i].size) {
512             continue;
513         }
514 
515         if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
516             pdev->io_regions[i].memory->readonly) {
517             flags &= ~VFU_REGION_FLAG_WRITE;
518         }
519 
520         vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
521                          (size_t)pdev->io_regions[i].size,
522                          vfu_object_bar_handlers[i],
523                          flags, NULL, 0, -1, 0);
524 
525         trace_vfu_bar_register(i, pdev->io_regions[i].addr,
526                                pdev->io_regions[i].size);
527     }
528 }
529 
530 static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
531 {
532     int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
533                                 pci_dev->devfn);
534 
535     return pci_bdf;
536 }
537 
538 static void vfu_object_set_irq(void *opaque, int pirq, int level)
539 {
540     PCIBus *pci_bus = opaque;
541     PCIDevice *pci_dev = NULL;
542     vfu_ctx_t *vfu_ctx = NULL;
543     int pci_bus_num, devfn;
544 
545     if (level) {
546         pci_bus_num = PCI_BUS_NUM(pirq);
547         devfn = PCI_BDF_TO_DEVFN(pirq);
548 
549         /*
550          * pci_find_device() performs at O(1) if the device is attached
551          * to the root PCI bus. Whereas, if the device is attached to a
552          * secondary PCI bus (such as when a root port is involved),
553          * finding the parent PCI bus could take O(n)
554          */
555         pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);
556 
557         vfu_ctx = pci_dev->irq_opaque;
558 
559         g_assert(vfu_ctx);
560 
561         vfu_irq_trigger(vfu_ctx, 0);
562     }
563 }
564 
565 static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
566                                              unsigned int vector)
567 {
568     MSIMessage msg;
569 
570     msg.address = 0;
571     msg.data = vector;
572 
573     return msg;
574 }
575 
576 static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
577 {
578     vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque;
579 
580     vfu_irq_trigger(vfu_ctx, msg.data);
581 }
582 
583 static void vfu_object_setup_msi_cbs(VfuObject *o)
584 {
585     o->default_msi_trigger = o->pci_dev->msi_trigger;
586     o->default_msi_prepare_message = o->pci_dev->msi_prepare_message;
587     o->default_msix_prepare_message = o->pci_dev->msix_prepare_message;
588 
589     o->pci_dev->msi_trigger = vfu_object_msi_trigger;
590     o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg;
591     o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg;
592 }
593 
594 static void vfu_object_restore_msi_cbs(VfuObject *o)
595 {
596     o->pci_dev->msi_trigger = o->default_msi_trigger;
597     o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
598     o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
599 }
600 
601 static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
602                                uint32_t count, bool mask)
603 {
604     VfuObject *o = vfu_get_private(vfu_ctx);
605     uint32_t vector;
606 
607     for (vector = start; vector < count; vector++) {
608         msix_set_mask(o->pci_dev, vector, mask);
609     }
610 }
611 
612 static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
613                               uint32_t count, bool mask)
614 {
615     VfuObject *o = vfu_get_private(vfu_ctx);
616     Error *err = NULL;
617     uint32_t vector;
618 
619     for (vector = start; vector < count; vector++) {
620         msi_set_mask(o->pci_dev, vector, mask, &err);
621         if (err) {
622             VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
623                              error_get_pretty(err));
624             error_free(err);
625             err = NULL;
626         }
627     }
628 }
629 
/**
 * vfu_object_setup_irqs - export the device's interrupts to the client.
 *
 * Always advertises one INTx IRQ. If the device has MSI-X vectors they
 * are advertised (MSI-X takes precedence over MSI); otherwise MSI
 * vectors are advertised if present, each with its mask/unmask state
 * callback. Finally redirects the device's MSI callbacks to this server
 * and stores the server context in the device's irq_opaque.
 *
 * Returns 0 on success, or the negative libvfio-user error.
 */
static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
{
    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
    int ret;

    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
    if (ret < 0) {
        return ret;
    }

    if (msix_nr_vectors_allocated(pci_dev)) {
        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
                                       msix_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
                                     &vfu_msix_irq_state);
    } else if (msi_nr_vectors_allocated(pci_dev)) {
        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
                                       msi_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
                                     &vfu_msi_irq_state);
    }

    /* ret carries the result of the last vfu_setup_device_nr_irqs() */
    if (ret < 0) {
        return ret;
    }

    vfu_object_setup_msi_cbs(o);

    pci_dev->irq_opaque = vfu_ctx;

    return 0;
}
662 
663 void vfu_object_set_bus_irq(PCIBus *pci_bus)
664 {
665     int bus_num = pci_bus_num(pci_bus);
666     int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1);
667 
668     pci_bus_irqs(pci_bus, vfu_object_set_irq, vfu_object_map_irq, pci_bus,
669                  max_bdf);
670 }
671 
672 static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
673 {
674     VfuObject *o = vfu_get_private(vfu_ctx);
675 
676     /* vfu_object_ctx_run() handles lost connection */
677     if (type == VFU_RESET_LOST_CONN) {
678         return 0;
679     }
680 
681     device_cold_reset(DEVICE(o->pci_dev));
682 
683     return 0;
684 }
685 
/*
 * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
 * properties. It also depends on devices instantiated in QEMU. These
 * dependencies are not available during the instance_init phase of this
 * object's life-cycle. As such, the server is initialized after the
 * machine is setup. machine_init_done_notifier notifies TYPE_VFU_OBJECT
 * when the machine is setup, and the dependencies are available.
 */
static void vfu_object_machine_done(Notifier *notifier, void *data)
{
    VfuObject *o = container_of(notifier, VfuObject, machine_done);
    Error *err = NULL;

    vfu_object_init_ctx(o, &err);

    /* an initialization failure at this point is fatal */
    if (err) {
        error_propagate(&error_abort, err);
    }
}
705 
/**
 * vfu_object_init_ctx: Create and initialize libvfio-user context. Add
 *     an unplug blocker for the associated PCI device. Setup a FD handler
 *     to process incoming messages in the context's socket.
 *
 *     The socket and device properties are mandatory, and this function
 *     will not create the context without them - the setters for these
 *     properties should call this function when the property is set. The
 *     machine should also be ready when this function is invoked - it is
 *     because QEMU objects are initialized before devices, and the
 *     associated PCI device wouldn't be available at the object
 *     initialization time. Until these conditions are satisfied, this
 *     function would return early without performing any task.
 */
static void vfu_object_init_ctx(VfuObject *o, Error **errp)
{
    DeviceState *dev = NULL;
    vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
    int ret;

    /* wait until all preconditions are met; also guards re-entry */
    if (o->vfu_ctx || !o->socket || !o->device ||
            !phase_check(PHASE_MACHINE_READY)) {
        return;
    }

    /* report an error deferred from instance_init, if any */
    if (o->err) {
        error_propagate(errp, o->err);
        o->err = NULL;
        return;
    }

    /* non-blocking attach: a client connection is picked up via poll fd */
    o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
                                LIBVFIO_USER_FLAG_ATTACH_NB,
                                o, VFU_DEV_TYPE_PCI);
    if (o->vfu_ctx == NULL) {
        error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
        return;
    }

    dev = qdev_find_recursive(sysbus_get_default(), o->device);
    if (dev == NULL) {
        error_setg(errp, "vfu: Device %s not found", o->device);
        goto fail;
    }

    if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        error_setg(errp, "vfu: %s not a PCI device", o->device);
        goto fail;
    }

    o->pci_dev = PCI_DEVICE(dev);

    /* hold a reference for the lifetime of this server object */
    object_ref(OBJECT(o->pci_dev));

    if (pci_is_express(o->pci_dev)) {
        pci_type = VFU_PCI_TYPE_EXPRESS;
    }

    ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
    if (ret < 0) {
        error_setg(errp,
                   "vfu: Failed to attach PCI device %s to context - %s",
                   o->device, strerror(errno));
        goto fail;
    }

    /* prevent the device from being unplugged while it is being served */
    error_setg(&o->unplug_blocker,
               "vfu: %s for %s must be deleted before unplugging",
               TYPE_VFU_OBJECT, o->device);
    qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);

    ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
                           pci_config_size(o->pci_dev), &vfu_object_cfg_access,
                           VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
                           NULL, 0, -1, 0);
    if (ret < 0) {
        error_setg(errp,
                   "vfu: Failed to setup config space handlers for %s- %s",
                   o->device, strerror(errno));
        goto fail;
    }

    ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
                   o->device);
        goto fail;
    }

    vfu_object_register_bars(o->vfu_ctx, o->pci_dev);

    ret = vfu_object_setup_irqs(o, o->pci_dev);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup interrupts for %s",
                   o->device);
        goto fail;
    }

    ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup reset callback");
        goto fail;
    }

    ret = vfu_realize_ctx(o->vfu_ctx);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to realize device %s- %s",
                   o->device, strerror(errno));
        goto fail;
    }

    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
    if (o->vfu_poll_fd < 0) {
        error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
        goto fail;
    }

    /* wait for a client; vfu_object_attach_ctx completes the handshake */
    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);

    return;

fail:
    /* unwind partial setup in reverse order of acquisition */
    vfu_destroy_ctx(o->vfu_ctx);
    if (o->unplug_blocker && o->pci_dev) {
        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
        error_free(o->unplug_blocker);
        o->unplug_blocker = NULL;
    }
    if (o->pci_dev) {
        vfu_object_restore_msi_cbs(o);
        o->pci_dev->irq_opaque = NULL;
        object_unref(OBJECT(o->pci_dev));
        o->pci_dev = NULL;
    }
    o->vfu_ctx = NULL;
}
842 
/**
 * vfu_object_init - instance_init for TYPE_VFU_OBJECT.
 *
 * Validates the machine type (errors are deferred into o->err and
 * reported when a property is set or at machine-done) and, if the
 * machine isn't ready yet, registers a machine-init-done notifier to
 * finish server setup later.
 */
static void vfu_object_init(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs++;

    if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
        /* deferred: instance_init has no Error **errp to report through */
        error_setg(&o->err, "vfu: %s only compatible with %s machine",
                   TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
        return;
    }

    if (!phase_check(PHASE_MACHINE_READY)) {
        o->machine_done.notify = vfu_object_machine_done;
        qemu_add_machine_init_done_notifier(&o->machine_done);
    }

    /* no fd is being polled yet */
    o->vfu_poll_fd = -1;
}
863 
/**
 * vfu_object_finalize - instance_finalize for TYPE_VFU_OBJECT.
 *
 * Releases everything vfu_object_init_ctx() set up: socket address, fd
 * handler, libvfio-user context, device id, unplug blocker, and the
 * reference on the PCI device (after restoring its MSI callbacks). If
 * this was the last server object and auto-shutdown is enabled, a
 * system shutdown is requested.
 */
static void vfu_object_finalize(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs--;

    qapi_free_SocketAddress(o->socket);

    o->socket = NULL;

    /* stop watching the fd before destroying the context behind it */
    if (o->vfu_poll_fd != -1) {
        qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
        o->vfu_poll_fd = -1;
    }

    if (o->vfu_ctx) {
        vfu_destroy_ctx(o->vfu_ctx);
        o->vfu_ctx = NULL;
    }

    g_free(o->device);

    o->device = NULL;

    if (o->unplug_blocker && o->pci_dev) {
        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
        error_free(o->unplug_blocker);
        o->unplug_blocker = NULL;
    }

    if (o->pci_dev) {
        vfu_object_restore_msi_cbs(o);
        o->pci_dev->irq_opaque = NULL;
        object_unref(OBJECT(o->pci_dev));
        o->pci_dev = NULL;
    }

    /* last server gone - shut QEMU down if the machine asks for it */
    if (!k->nr_devs && vfu_object_auto_shutdown()) {
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
    }

    if (o->machine_done.notify) {
        qemu_remove_machine_init_done_notifier(&o->machine_done);
        o->machine_done.notify = NULL;
    }
}
911 
912 static void vfu_object_class_init(ObjectClass *klass, void *data)
913 {
914     VfuObjectClass *k = VFU_OBJECT_CLASS(klass);
915 
916     k->nr_devs = 0;
917 
918     object_class_property_add(klass, "socket", "SocketAddress", NULL,
919                               vfu_object_set_socket, NULL, NULL);
920     object_class_property_set_description(klass, "socket",
921                                           "SocketAddress "
922                                           "(ex: type=unix,path=/tmp/sock). "
923                                           "Only UNIX is presently supported");
924     object_class_property_add_str(klass, "device", NULL,
925                                   vfu_object_set_device);
926     object_class_property_set_description(klass, "device",
927                                           "device ID - only PCI devices "
928                                           "are presently supported");
929 }
930 
/* QOM registration for the x-vfio-user-server object type */
static const TypeInfo vfu_object_info = {
    .name = TYPE_VFU_OBJECT,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(VfuObject),
    .instance_init = vfu_object_init,
    .instance_finalize = vfu_object_finalize,
    .class_size = sizeof(VfuObjectClass),
    .class_init = vfu_object_class_init,
    .interfaces = (InterfaceInfo[]) {
        /* allows creation via -object / object-add */
        { TYPE_USER_CREATABLE },
        { }
    }
};

static void vfu_register_types(void)
{
    type_register_static(&vfu_object_info);
}

type_init(vfu_register_types);
951