xref: /openbmc/qemu/hw/remote/vfio-user-obj.c (revision 1a648f7ae4d3ac97ef0855baec46047ea21a400a)
1 /**
 * QEMU vfio-user server object (x-vfio-user-server)
3  *
4  * Copyright © 2022 Oracle and/or its affiliates.
5  *
6  * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
7  *
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 /**
13  * Usage: add options:
14  *     -machine x-remote,vfio-user=on,auto-shutdown=on
15  *     -device <PCI-device>,id=<pci-dev-id>
16  *     -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
17  *             device=<pci-dev-id>
18  *
 * Note that the x-vfio-user-server object must be used with the x-remote
 * machine only. This server currently supports only PCI devices.
21  *
22  * type - SocketAddress type - presently "unix" alone is supported. Required
23  *        option
24  *
25  * path - named unix socket, it will be created by the server. It is
26  *        a required option
27  *
28  * device - id of a device on the server, a required option. PCI devices
29  *          alone are supported presently.
30  *
31  * notes - x-vfio-user-server could block IO and monitor during the
32  *         initialization phase.
33  *
34  *         When x-remote machine has the auto-shutdown property
35  *         enabled (default), x-vfio-user-server terminates after the last
36  *         client disconnects. Otherwise, it will continue running until
37  *         explicitly killed.
38  */
39 
40 #include "qemu/osdep.h"
41 
42 #include "qom/object.h"
43 #include "qom/object_interfaces.h"
44 #include "qemu/error-report.h"
45 #include "trace.h"
46 #include "sysemu/runstate.h"
47 #include "hw/boards.h"
48 #include "hw/remote/machine.h"
49 #include "qapi/error.h"
50 #include "qapi/qapi-visit-sockets.h"
51 #include "qapi/qapi-events-misc.h"
52 #include "qemu/notify.h"
53 #include "qemu/thread.h"
54 #include "qemu/main-loop.h"
55 #include "sysemu/sysemu.h"
56 #include "libvfio-user.h"
57 #include "hw/qdev-core.h"
58 #include "hw/pci/pci.h"
59 #include "qemu/timer.h"
60 #include "exec/memory.h"
61 #include "hw/pci/msi.h"
62 #include "hw/pci/msix.h"
63 #include "hw/remote/vfio-user-obj.h"
64 
65 #define TYPE_VFU_OBJECT "x-vfio-user-server"
66 OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)
67 
/**
 * VFU_OBJECT_ERROR - reports an error message.
 *
 * If auto_shutdown is set, it aborts the machine on error. Otherwise,
 * it logs an error message without aborting. auto_shutdown is disabled
 * when the server serves clients from multiple VMs; as such, an error
 * from one VM shouldn't be able to disrupt other VM's services.
 *
 * The expansion is wrapped in do { } while (0) so that
 * "VFU_OBJECT_ERROR(...);" is exactly one statement and stays correct
 * when used as the unbraced body of an if/else.
 *
 * @o is not referenced by the expansion; @fmt and the remaining
 * arguments are a printf-style format string and its values.
 */
#define VFU_OBJECT_ERROR(o, fmt, ...)                                     \
    do {                                                                  \
        if (vfu_object_auto_shutdown()) {                                 \
            error_setg(&error_abort, (fmt), ## __VA_ARGS__);              \
        } else {                                                          \
            error_report((fmt), ## __VA_ARGS__);                          \
        }                                                                 \
    } while (0)
84 
85 struct VfuObjectClass {
86     ObjectClass parent_class;
87 
88     unsigned int nr_devs;
89 };
90 
91 struct VfuObject {
92     /* private */
93     Object parent;
94 
95     SocketAddress *socket;
96 
97     char *device;
98 
99     Error *err;
100 
101     Notifier machine_done;
102 
103     vfu_ctx_t *vfu_ctx;
104 
105     PCIDevice *pci_dev;
106 
107     Error *unplug_blocker;
108 
109     int vfu_poll_fd;
110 
111     MSITriggerFunc *default_msi_trigger;
112     MSIPrepareMessageFunc *default_msi_prepare_message;
113     MSIxPrepareMessageFunc *default_msix_prepare_message;
114 };
115 
116 static void vfu_object_init_ctx(VfuObject *o, Error **errp);
117 
vfu_object_auto_shutdown(void)118 static bool vfu_object_auto_shutdown(void)
119 {
120     bool auto_shutdown = true;
121     Error *local_err = NULL;
122 
123     if (!current_machine) {
124         return auto_shutdown;
125     }
126 
127     auto_shutdown = object_property_get_bool(OBJECT(current_machine),
128                                              "auto-shutdown",
129                                              &local_err);
130 
131     /*
132      * local_err would be set if no such property exists - safe to ignore.
133      * Unlikely scenario as auto-shutdown is always defined for
134      * TYPE_REMOTE_MACHINE, and  TYPE_VFU_OBJECT only works with
135      * TYPE_REMOTE_MACHINE
136      */
137     if (local_err) {
138         auto_shutdown = true;
139         error_free(local_err);
140     }
141 
142     return auto_shutdown;
143 }
144 
/*
 * Setter for the "socket" property.
 *
 * Rejects the change while a libvfio-user context exists. Otherwise the
 * previous address is released, the new one is read through the visitor
 * and, if it is a UNIX socket address, vfu_object_init_ctx() is given a
 * chance to bring the server up.
 *
 * @errp is set when the server is busy, when the visitor fails, when the
 * address is not a UNIX socket, or when vfu_object_init_ctx() fails.
 */
static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
                                  void *opaque, Error **errp)
{
    VfuObject *o = VFU_OBJECT(obj);

    if (o->vfu_ctx) {
        error_setg(errp, "vfu: Unable to set socket property - server busy");
        return;
    }

    qapi_free_SocketAddress(o->socket);

    o->socket = NULL;

    /*
     * Stop on a failed visit: @errp already carries the visitor's error
     * and o->socket must not be dereferenced (nor @errp set again).
     */
    if (!visit_type_SocketAddress(v, name, &o->socket, errp)) {
        return;
    }

    if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
        error_setg(errp, "vfu: Unsupported socket type - %s",
                   SocketAddressType_str(o->socket->type));
        qapi_free_SocketAddress(o->socket);
        o->socket = NULL;
        return;
    }

    trace_vfu_prop("socket", o->socket->u.q_unix.path);

    vfu_object_init_ctx(o, errp);
}
173 
vfu_object_set_device(Object * obj,const char * str,Error ** errp)174 static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
175 {
176     VfuObject *o = VFU_OBJECT(obj);
177 
178     if (o->vfu_ctx) {
179         error_setg(errp, "vfu: Unable to set device property - server busy");
180         return;
181     }
182 
183     g_free(o->device);
184 
185     o->device = g_strdup(str);
186 
187     trace_vfu_prop("device", str);
188 
189     vfu_object_init_ctx(o, errp);
190 }
191 
vfu_object_ctx_run(void * opaque)192 static void vfu_object_ctx_run(void *opaque)
193 {
194     VfuObject *o = opaque;
195     const char *vfu_id;
196     char *vfu_path, *pci_dev_path;
197     int ret = -1;
198 
199     while (ret != 0) {
200         ret = vfu_run_ctx(o->vfu_ctx);
201         if (ret < 0) {
202             if (errno == EINTR) {
203                 continue;
204             } else if (errno == ENOTCONN) {
205                 vfu_id = object_get_canonical_path_component(OBJECT(o));
206                 vfu_path = object_get_canonical_path(OBJECT(o));
207                 g_assert(o->pci_dev);
208                 pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
209                  /* o->device is a required property and is non-NULL here */
210                 g_assert(o->device);
211                 qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
212                                                   o->device, pci_dev_path);
213                 qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
214                 o->vfu_poll_fd = -1;
215                 object_unparent(OBJECT(o));
216                 g_free(vfu_path);
217                 g_free(pci_dev_path);
218                 break;
219             } else {
220                 VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
221                                  o->device, strerror(errno));
222                 break;
223             }
224         }
225     }
226 }
227 
vfu_object_attach_ctx(void * opaque)228 static void vfu_object_attach_ctx(void *opaque)
229 {
230     VfuObject *o = opaque;
231     GPollFD pfds[1];
232     int ret;
233 
234     qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
235 
236     pfds[0].fd = o->vfu_poll_fd;
237     pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
238 
239 retry_attach:
240     ret = vfu_attach_ctx(o->vfu_ctx);
241     if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
242         /**
243          * vfu_object_attach_ctx can block QEMU's main loop
244          * during attach - the monitor and other IO
245          * could be unresponsive during this time.
246          */
247         (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
248         goto retry_attach;
249     } else if (ret < 0) {
250         VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s",
251                          o->device, strerror(errno));
252         return;
253     }
254 
255     o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
256     if (o->vfu_poll_fd < 0) {
257         VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
258         return;
259     }
260 
261     qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
262 }
263 
/*
 * Region callback for PCI config space.
 *
 * Transfers @count bytes between @buf (little-endian) and the config
 * space of the served PCI device, starting at @offset, in pieces of at
 * most 4 bytes. @is_write selects the direction.
 *
 * Returns @count.
 */
static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
                                     size_t count, loff_t offset,
                                     const bool is_write)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    uint32_t pci_access_width = sizeof(uint32_t);
    size_t bytes = count;
    uint32_t val = 0;
    char *ptr = buf;
    int len;

    /*
     * Writes to the BAR registers would trigger an update to the
     * global Memory and IO AddressSpaces. But the remote device
     * never uses the global AddressSpaces, therefore overlapping
     * memory regions are not a problem
     */
    while (bytes > 0) {
        len = (bytes > pci_access_width) ? pci_access_width : bytes;

        /*
         * ldn_le_p()/stn_le_p() only handle sizes of 1, 2, 4 or 8 bytes.
         * A 3-byte remainder (e.g. count == 3 or 7) is therefore split
         * into a 2-byte access followed by a 1-byte access.
         */
        if (len == 3) {
            len = 2;
        }

        if (is_write) {
            val = ldn_le_p(ptr, len);
            pci_host_config_write_common(o->pci_dev, offset,
                                         pci_config_size(o->pci_dev),
                                         val, len);
            trace_vfu_cfg_write(offset, val);
        } else {
            val = pci_host_config_read_common(o->pci_dev, offset,
                                              pci_config_size(o->pci_dev), len);
            stn_le_p(ptr, len, val);
            trace_vfu_cfg_read(offset, val);
        }
        offset += len;
        ptr += len;
        bytes -= len;
    }

    return count;
}
302 
/*
 * DMA-map callback.
 *
 * When @info carries a host mapping (info->vaddr), wraps it in a newly
 * allocated RAM MemoryRegion and adds that region to the root of the
 * device's DMA address space at the IOVA given by info->iova. Does
 * nothing when there is no host mapping.
 */
static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    struct iovec *iova = &info->iova;
    g_autofree char *mr_name = NULL;
    MemoryRegion *ram_mr;
    AddressSpace *as;

    if (info->vaddr == NULL) {
        return;
    }

    mr_name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
                              (uint64_t)info->vaddr);

    ram_mr = g_new0(MemoryRegion, 1);
    memory_region_init_ram_ptr(ram_mr, NULL, mr_name,
                               iova->iov_len, info->vaddr);

    as = pci_device_iommu_address_space(o->pci_dev);
    memory_region_add_subregion(as->root, (hwaddr)iova->iov_base, ram_mr);

    trace_vfu_dma_register((uint64_t)iova->iov_base, iova->iov_len);
}
329 
/*
 * DMA-unmap callback.
 *
 * Looks up the MemoryRegion backing info->vaddr, removes it from the
 * root of the device's DMA address space, unparents it and releases the
 * memory that dma_register() allocated for it. Does nothing when no
 * region is found for the address.
 */
static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    AddressSpace *dma_as = NULL;
    MemoryRegion *mr = NULL;
    ram_addr_t offset;

    mr = memory_region_from_host(info->vaddr, &offset);
    if (!mr) {
        return;
    }

    dma_as = pci_device_iommu_address_space(o->pci_dev);

    memory_region_del_subregion(dma_as->root, mr);

    object_unparent((OBJECT(mr)));

    /*
     * The region was allocated with g_new0() in dma_register(), so it
     * has to be freed here or every map/unmap cycle leaks one
     * MemoryRegion.
     */
    g_free(mr);

    trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
}
350 
/*
 * Read from or write to @mr.
 *
 * Transfers @size bytes between @buf and @mr starting at @offset within
 * the region; @is_write selects the direction. Directly accessible
 * regions are copied with memcpy(); everything else is dispatched piece
 * by piece through the region's read/write handlers.
 *
 * Returns 0 on success, -1 as soon as one dispatch does not return
 * MEMTX_OK.
 */
static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
                            hwaddr size, const bool is_write)
{
    uint8_t *cursor = buf;

    if (memory_access_is_direct(mr, is_write)) {
        /**
         * Some devices expose a PCI expansion ROM, which could be buffer
         * based as compared to other regions which are primarily based on
         * MemoryRegionOps. memory_region_find() would already check
         * for buffer overflow, we don't need to repeat it here.
         */
        uint8_t *ram = memory_region_get_ram_ptr(mr);

        if (is_write) {
            memcpy(ram + offset, buf, size);
        } else {
            memcpy(buf, ram + offset, size);
        }

        return 0;
    }

    while (size != 0) {
        /**
         * The read/write logic used below is similar to the ones in
         * flatview_read/write_continue()
         */
        bool locked = prepare_mmio_access(mr);
        int chunk = memory_access_size(mr, size, offset);
        MemTxResult res;
        uint64_t val;

        if (is_write) {
            val = ldn_he_p(cursor, chunk);
            res = memory_region_dispatch_write(mr, offset, val,
                                               size_memop(chunk),
                                               MEMTXATTRS_UNSPECIFIED);
        } else {
            res = memory_region_dispatch_read(mr, offset, &val,
                                              size_memop(chunk),
                                              MEMTXATTRS_UNSPECIFIED);
            stn_he_p(cursor, chunk, val);
        }

        /* drop the lock again if prepare_mmio_access() asked for it */
        if (locked) {
            bql_unlock();
        }

        if (res != MEMTX_OK) {
            return -1;
        }

        size -= chunk;
        cursor += chunk;
        offset += chunk;
    }

    return 0;
}
418 
/*
 * Access BAR @pci_bar of @pci_dev.
 *
 * Walks the BAR's memory region section by section, starting at
 * @bar_offset, and transfers up to @len bytes between the sections and
 * @buf; @is_write selects the direction. Stops with a warning at the
 * first address without a section, the first write that hits a
 * read-only section, or the first failed transfer.
 *
 * Returns the number of bytes transferred before stopping.
 */
static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
                                hwaddr bar_offset, char * const buf,
                                hwaddr len, const bool is_write)
{
    uint8_t *cursor = (uint8_t *)buf;
    hwaddr done = 0;

    while (len) {
        MemoryRegionSection sec;
        uint64_t chunk;

        sec = memory_region_find(pci_dev->io_regions[pci_bar].memory,
                                 bar_offset, len);
        if (sec.mr == NULL) {
            warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
            break;
        }

        chunk = int128_get64(sec.size);

        if (is_write && sec.mr->readonly) {
            warn_report("vfu: attempting to write to readonly region in "
                        "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
                        pci_bar, bar_offset,
                        (bar_offset + chunk));
            memory_region_unref(sec.mr);
            break;
        }

        if (vfu_object_mr_rw(sec.mr, cursor, sec.offset_within_region,
                             chunk, is_write) != 0) {
            warn_report("vfu: failed to %s "
                        "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
                        is_write ? "write to" : "read from", bar_offset,
                        (bar_offset + chunk), pci_bar);
            memory_region_unref(sec.mr);
            break;
        }

        /* memory_region_find() took a reference on the section's region */
        memory_region_unref(sec.mr);

        done += chunk;
        bar_offset += chunk;
        cursor += chunk;
        len -= chunk;
    }

    return done;
}
472 
/**
 * VFU_OBJECT_BAR_HANDLER - defines the region callback for one PCI BAR.
 *
 * VFU_OBJECT_BAR_HANDLER(2), for instance, defines
 * vfu_object_bar2_handler(), which forwards the access to
 * vfu_object_bar_rw() with BAR number 2 and the PCI device of the
 * VfuObject stored as the context's private data.
 */
#define VFU_OBJECT_BAR_HANDLER(BAR_NO)                                         \
    static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx,        \
                                        char * const buf, size_t count,        \
                                        loff_t offset, const bool is_write)    \
    {                                                                          \
        VfuObject *vfu = vfu_get_private(vfu_ctx);                             \
                                                                               \
        return vfu_object_bar_rw(vfu->pci_dev, BAR_NO, offset,                 \
                                 buf, count, is_write);                        \
    }
490 
491 VFU_OBJECT_BAR_HANDLER(0)
492 VFU_OBJECT_BAR_HANDLER(1)
493 VFU_OBJECT_BAR_HANDLER(2)
494 VFU_OBJECT_BAR_HANDLER(3)
495 VFU_OBJECT_BAR_HANDLER(4)
496 VFU_OBJECT_BAR_HANDLER(5)
497 VFU_OBJECT_BAR_HANDLER(6)
498 
499 static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
500     &vfu_object_bar0_handler,
501     &vfu_object_bar1_handler,
502     &vfu_object_bar2_handler,
503     &vfu_object_bar3_handler,
504     &vfu_object_bar4_handler,
505     &vfu_object_bar5_handler,
506     &vfu_object_bar6_handler,
507 };
508 
509 /**
510  * vfu_object_register_bars - Identify active BAR regions of pdev and setup
511  *                            callbacks to handle read/write accesses
512  */
vfu_object_register_bars(vfu_ctx_t * vfu_ctx,PCIDevice * pdev)513 static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
514 {
515     int flags = VFU_REGION_FLAG_RW;
516     int i;
517 
518     for (i = 0; i < PCI_NUM_REGIONS; i++) {
519         if (!pdev->io_regions[i].size) {
520             continue;
521         }
522 
523         if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
524             pdev->io_regions[i].memory->readonly) {
525             flags &= ~VFU_REGION_FLAG_WRITE;
526         }
527 
528         vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
529                          (size_t)pdev->io_regions[i].size,
530                          vfu_object_bar_handlers[i],
531                          flags, NULL, 0, -1, 0);
532 
533         trace_vfu_bar_register(i, pdev->io_regions[i].addr,
534                                pdev->io_regions[i].size);
535     }
536 }
537 
/*
 * IRQ-mapping callback: the IRQ number of a device is its BDF, built
 * from the number of the bus it sits on and its devfn. @intx is unused.
 */
static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
{
    int bus_num = pci_bus_num(pci_get_bus(pci_dev));

    return PCI_BUILD_BDF(bus_num, pci_dev->devfn);
}
545 
vfu_object_set_irq(void * opaque,int pirq,int level)546 static void vfu_object_set_irq(void *opaque, int pirq, int level)
547 {
548     PCIBus *pci_bus = opaque;
549     PCIDevice *pci_dev = NULL;
550     vfu_ctx_t *vfu_ctx = NULL;
551     int pci_bus_num, devfn;
552 
553     if (level) {
554         pci_bus_num = PCI_BUS_NUM(pirq);
555         devfn = PCI_BDF_TO_DEVFN(pirq);
556 
557         /*
558          * pci_find_device() performs at O(1) if the device is attached
559          * to the root PCI bus. Whereas, if the device is attached to a
560          * secondary PCI bus (such as when a root port is involved),
561          * finding the parent PCI bus could take O(n)
562          */
563         pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);
564 
565         vfu_ctx = pci_dev->irq_opaque;
566 
567         g_assert(vfu_ctx);
568 
569         vfu_irq_trigger(vfu_ctx, 0);
570     }
571 }
572 
vfu_object_msi_prepare_msg(PCIDevice * pci_dev,unsigned int vector)573 static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
574                                              unsigned int vector)
575 {
576     MSIMessage msg;
577 
578     msg.address = 0;
579     msg.data = vector;
580 
581     return msg;
582 }
583 
/*
 * MSI trigger callback: raises the IRQ numbered msg.data (the vector
 * stored by vfu_object_msi_prepare_msg()) on the context kept in the
 * device's irq_opaque.
 */
static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
{
    vfu_ctx_t *ctx = pci_dev->irq_opaque;

    vfu_irq_trigger(ctx, msg.data);
}
590 
/*
 * Remember the device's current MSI/MSI-X callbacks in @o and replace
 * them with the vfio-user versions defined in this file.
 */
static void vfu_object_setup_msi_cbs(VfuObject *o)
{
    PCIDevice *pdev = o->pci_dev;

    /* save what is installed now so it can be put back later */
    o->default_msi_trigger = pdev->msi_trigger;
    o->default_msi_prepare_message = pdev->msi_prepare_message;
    o->default_msix_prepare_message = pdev->msix_prepare_message;

    pdev->msi_trigger = vfu_object_msi_trigger;
    pdev->msi_prepare_message = vfu_object_msi_prepare_msg;
    pdev->msix_prepare_message = vfu_object_msi_prepare_msg;
}
601 
/*
 * Put back the MSI/MSI-X callbacks that vfu_object_setup_msi_cbs() saved.
 *
 * This is a no-op unless the device currently uses
 * vfu_object_msi_trigger(), i.e. unless vfu_object_setup_msi_cbs() has
 * replaced the callbacks. vfu_object_init_ctx() calls this function on
 * every failure once o->pci_dev is set, including failures that occur
 * before the callbacks were saved; restoring the never-captured defaults
 * in that case would overwrite the device's own callbacks.
 */
static void vfu_object_restore_msi_cbs(VfuObject *o)
{
    if (o->pci_dev->msi_trigger != vfu_object_msi_trigger) {
        return;
    }

    o->pci_dev->msi_trigger = o->default_msi_trigger;
    o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
    o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
}
608 
/*
 * IRQ state callback for MSI-X: applies @mask to @count vectors of the
 * served PCI device, beginning with vector @start.
 */
static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
                               uint32_t count, bool mask)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    uint32_t i;

    /*
     * @count is a number of vectors, not an end index: iterate over
     * [start, start + count). Comparing the vector against @count
     * directly would skip the whole range whenever start >= count.
     */
    for (i = 0; i < count; i++) {
        msix_set_mask(o->pci_dev, start + i, mask);
    }
}
619 
/*
 * IRQ state callback for MSI: applies @mask to @count vectors of the
 * served PCI device, beginning with vector @start. A failure to set the
 * mask of one vector is reported through VFU_OBJECT_ERROR() and the
 * remaining vectors are still processed.
 */
static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
                              uint32_t count, bool mask)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    Error *err = NULL;
    uint32_t i;

    /*
     * @count is a number of vectors, not an end index: iterate over
     * [start, start + count). Comparing the vector against @count
     * directly would skip the whole range whenever start >= count.
     */
    for (i = 0; i < count; i++) {
        msi_set_mask(o->pci_dev, start + i, mask, &err);
        if (err) {
            VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
                             error_get_pretty(err));
            error_free(err);
            err = NULL;
        }
    }
}
637 
/*
 * Describe the interrupts of @pci_dev to libvfio-user.
 *
 * Always registers one INTx IRQ. If the device has MSI-X vectors
 * allocated, that many MSI-X IRQs and the MSI-X state callback are
 * registered; otherwise the same is done for MSI when MSI vectors are
 * allocated. On success the device's MSI callbacks are replaced and its
 * irq_opaque is pointed at the context.
 *
 * Returns 0 on success or the negative value returned by
 * vfu_setup_device_nr_irqs().
 */
static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
{
    vfu_ctx_t *ctx = o->vfu_ctx;
    int rc;

    rc = vfu_setup_device_nr_irqs(ctx, VFU_DEV_INTX_IRQ, 1);
    if (rc < 0) {
        return rc;
    }

    /* MSI-X takes precedence over MSI */
    if (msix_nr_vectors_allocated(pci_dev)) {
        rc = vfu_setup_device_nr_irqs(ctx, VFU_DEV_MSIX_IRQ,
                                      msix_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(ctx, VFU_DEV_MSIX_IRQ,
                                     vfu_msix_irq_state);
    } else if (msi_nr_vectors_allocated(pci_dev)) {
        rc = vfu_setup_device_nr_irqs(ctx, VFU_DEV_MSI_IRQ,
                                      msi_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(ctx, VFU_DEV_MSI_IRQ,
                                     vfu_msi_irq_state);
    }

    if (rc < 0) {
        return rc;
    }

    vfu_object_setup_msi_cbs(o);
    pci_dev->irq_opaque = ctx;

    return 0;
}
670 
/*
 * Route the interrupts of @pci_bus through this file: installs
 * vfu_object_set_irq() (with the bus itself as opaque) and
 * vfu_object_map_irq(). The IRQ count handed to pci_bus_irqs() is the
 * BDF built from the bus number and the highest devfn.
 */
void vfu_object_set_bus_irq(PCIBus *pci_bus)
{
    int nirq = PCI_BUILD_BDF(pci_bus_num(pci_bus), PCI_DEVFN_MAX - 1);

    pci_bus_irqs(pci_bus, vfu_object_set_irq, pci_bus, nirq);
    pci_bus_map_irqs(pci_bus, vfu_object_map_irq);
}
679 
/*
 * Reset callback: cold-resets the served PCI device for every reset
 * type except VFU_RESET_LOST_CONN. Always returns 0.
 */
static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
{
    VfuObject *o = vfu_get_private(vfu_ctx);

    /* vfu_object_ctx_run() handles lost connection */
    if (type != VFU_RESET_LOST_CONN) {
        device_cold_reset(DEVICE(o->pci_dev));
    }

    return 0;
}
693 
694 /*
695  * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
696  * properties. It also depends on devices instantiated in QEMU. These
697  * dependencies are not available during the instance_init phase of this
698  * object's life-cycle. As such, the server is initialized after the
699  * machine is setup. machine_init_done_notifier notifies TYPE_VFU_OBJECT
700  * when the machine is setup, and the dependencies are available.
701  */
vfu_object_machine_done(Notifier * notifier,void * data)702 static void vfu_object_machine_done(Notifier *notifier, void *data)
703 {
704     VfuObject *o = container_of(notifier, VfuObject, machine_done);
705     Error *err = NULL;
706 
707     vfu_object_init_ctx(o, &err);
708 
709     if (err) {
710         error_propagate(&error_abort, err);
711     }
712 }
713 
714 /**
715  * vfu_object_init_ctx: Create and initialize libvfio-user context. Add
716  *     an unplug blocker for the associated PCI device. Setup a FD handler
717  *     to process incoming messages in the context's socket.
718  *
719  *     The socket and device properties are mandatory, and this function
720  *     will not create the context without them - the setters for these
721  *     properties should call this function when the property is set. The
722  *     machine should also be ready when this function is invoked - it is
723  *     because QEMU objects are initialized before devices, and the
724  *     associated PCI device wouldn't be available at the object
725  *     initialization time. Until these conditions are satisfied, this
726  *     function would return early without performing any task.
727  */
vfu_object_init_ctx(VfuObject * o,Error ** errp)728 static void vfu_object_init_ctx(VfuObject *o, Error **errp)
729 {
730     DeviceState *dev = NULL;
731     vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
732     int ret;
733 
734     if (o->vfu_ctx || !o->socket || !o->device ||
735             !phase_check(PHASE_MACHINE_READY)) {
736         return;
737     }
738 
739     if (o->err) {
740         error_propagate(errp, o->err);
741         o->err = NULL;
742         return;
743     }
744 
745     o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
746                                 LIBVFIO_USER_FLAG_ATTACH_NB,
747                                 o, VFU_DEV_TYPE_PCI);
748     if (o->vfu_ctx == NULL) {
749         error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
750         return;
751     }
752 
753     dev = qdev_find_recursive(sysbus_get_default(), o->device);
754     if (dev == NULL) {
755         error_setg(errp, "vfu: Device %s not found", o->device);
756         goto fail;
757     }
758 
759     if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
760         error_setg(errp, "vfu: %s not a PCI device", o->device);
761         goto fail;
762     }
763 
764     o->pci_dev = PCI_DEVICE(dev);
765 
766     object_ref(OBJECT(o->pci_dev));
767 
768     if (pci_is_express(o->pci_dev)) {
769         pci_type = VFU_PCI_TYPE_EXPRESS;
770     }
771 
772     ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
773     if (ret < 0) {
774         error_setg(errp,
775                    "vfu: Failed to attach PCI device %s to context - %s",
776                    o->device, strerror(errno));
777         goto fail;
778     }
779 
780     error_setg(&o->unplug_blocker,
781                "vfu: %s for %s must be deleted before unplugging",
782                TYPE_VFU_OBJECT, o->device);
783     qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
784 
785     ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
786                            pci_config_size(o->pci_dev), &vfu_object_cfg_access,
787                            VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
788                            NULL, 0, -1, 0);
789     if (ret < 0) {
790         error_setg(errp,
791                    "vfu: Failed to setup config space handlers for %s- %s",
792                    o->device, strerror(errno));
793         goto fail;
794     }
795 
796     ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
797     if (ret < 0) {
798         error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
799                    o->device);
800         goto fail;
801     }
802 
803     vfu_object_register_bars(o->vfu_ctx, o->pci_dev);
804 
805     ret = vfu_object_setup_irqs(o, o->pci_dev);
806     if (ret < 0) {
807         error_setg(errp, "vfu: Failed to setup interrupts for %s",
808                    o->device);
809         goto fail;
810     }
811 
812     ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
813     if (ret < 0) {
814         error_setg(errp, "vfu: Failed to setup reset callback");
815         goto fail;
816     }
817 
818     ret = vfu_realize_ctx(o->vfu_ctx);
819     if (ret < 0) {
820         error_setg(errp, "vfu: Failed to realize device %s- %s",
821                    o->device, strerror(errno));
822         goto fail;
823     }
824 
825     o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
826     if (o->vfu_poll_fd < 0) {
827         error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
828         goto fail;
829     }
830 
831     qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);
832 
833     return;
834 
835 fail:
836     vfu_destroy_ctx(o->vfu_ctx);
837     if (o->unplug_blocker && o->pci_dev) {
838         qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
839         error_free(o->unplug_blocker);
840         o->unplug_blocker = NULL;
841     }
842     if (o->pci_dev) {
843         vfu_object_restore_msi_cbs(o);
844         o->pci_dev->irq_opaque = NULL;
845         object_unref(OBJECT(o->pci_dev));
846         o->pci_dev = NULL;
847     }
848     o->vfu_ctx = NULL;
849 }
850 
/*
 * Instance initializer.
 *
 * Counts the new instance in the class. If the current machine is not a
 * TYPE_REMOTE_MACHINE, an error is stored in o->err and nothing else is
 * set up. Otherwise, when the machine is not ready yet, a notifier is
 * registered so that vfu_object_machine_done() runs once it is.
 */
static void vfu_object_init(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs++;

    /*
     * Mark "no poll fd" before any early return: vfu_object_finalize()
     * treats every value other than -1 as an fd whose handler it must
     * remove, so a field still holding its initial value (0, assuming
     * the instance starts out zero-filled) would make it touch fd 0.
     */
    o->vfu_poll_fd = -1;

    if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
        error_setg(&o->err, "vfu: %s only compatible with %s machine",
                   TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
        return;
    }

    if (!phase_check(PHASE_MACHINE_READY)) {
        o->machine_done.notify = vfu_object_machine_done;
        qemu_add_machine_init_done_notifier(&o->machine_done);
    }
}
871 
/*
 * Instance finalizer.
 *
 * Releases everything the object owns: the socket address, the fd
 * handler, the libvfio-user context, the device id, a pending error,
 * the unplug blocker, the device's original MSI callbacks and the device
 * reference. When the last instance goes away and auto-shutdown is
 * enabled, a shutdown is requested. Finally the machine-done notifier is
 * removed if it was registered.
 */
static void vfu_object_finalize(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs--;

    qapi_free_SocketAddress(o->socket);

    o->socket = NULL;

    if (o->vfu_poll_fd != -1) {
        qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
        o->vfu_poll_fd = -1;
    }

    if (o->vfu_ctx) {
        vfu_destroy_ctx(o->vfu_ctx);
        o->vfu_ctx = NULL;
    }

    g_free(o->device);

    o->device = NULL;

    /*
     * vfu_object_init() may have stored an error that
     * vfu_object_init_ctx() never got to propagate (it returns early
     * while a property is missing or the machine is not ready); free it
     * here so it does not leak. error_free() accepts NULL.
     */
    error_free(o->err);
    o->err = NULL;

    if (o->unplug_blocker && o->pci_dev) {
        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
        error_free(o->unplug_blocker);
        o->unplug_blocker = NULL;
    }

    if (o->pci_dev) {
        vfu_object_restore_msi_cbs(o);
        o->pci_dev->irq_opaque = NULL;
        object_unref(OBJECT(o->pci_dev));
        o->pci_dev = NULL;
    }

    if (!k->nr_devs && vfu_object_auto_shutdown()) {
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
    }

    if (o->machine_done.notify) {
        qemu_remove_machine_init_done_notifier(&o->machine_done);
        o->machine_done.notify = NULL;
    }
}
919 
/*
 * Class initializer: resets the instance counter and registers the
 * write-only "socket" and "device" properties with their descriptions.
 */
static void vfu_object_class_init(ObjectClass *klass, void *data)
{
    VfuObjectClass *vfu_class = VFU_OBJECT_CLASS(klass);

    vfu_class->nr_devs = 0;

    /* "socket": setter only, no getter and no release hook */
    object_class_property_add(klass, "socket", "SocketAddress", NULL,
                              vfu_object_set_socket, NULL, NULL);
    object_class_property_set_description(klass, "socket",
                                          "SocketAddress "
                                          "(ex: type=unix,path=/tmp/sock). "
                                          "Only UNIX is presently supported");

    /* "device": string property with a setter only */
    object_class_property_add_str(klass, "device", NULL,
                                  vfu_object_set_device);
    object_class_property_set_description(klass, "device",
                                          "device ID - only PCI devices "
                                          "are presently supported");
}
938 
939 static const TypeInfo vfu_object_info = {
940     .name = TYPE_VFU_OBJECT,
941     .parent = TYPE_OBJECT,
942     .instance_size = sizeof(VfuObject),
943     .instance_init = vfu_object_init,
944     .instance_finalize = vfu_object_finalize,
945     .class_size = sizeof(VfuObjectClass),
946     .class_init = vfu_object_class_init,
947     .interfaces = (InterfaceInfo[]) {
948         { TYPE_USER_CREATABLE },
949         { }
950     }
951 };
952 
vfu_register_types(void)953 static void vfu_register_types(void)
954 {
955     type_register_static(&vfu_object_info);
956 }
957 
958 type_init(vfu_register_types);
959