/**
 * QEMU vfio-user-server server object
 *
 * Copyright © 2022 Oracle and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 *
 * See the COPYING file in the top-level directory.
 *
 */

/**
 * Usage: add options:
 *     -machine x-remote,vfio-user=on,auto-shutdown=on
 *     -device <PCI-device>,id=<pci-dev-id>
 *     -object x-vfio-user-server,id=<id>,type=unix,path=<socket-path>,
 *             device=<pci-dev-id>
 *
 * Note that the x-vfio-user-server object must be used with the x-remote
 * machine only. The server currently supports PCI devices only.
 *
 * type - SocketAddress type - presently only "unix" is supported. Required
 *        option
 *
 * path - named unix socket, it will be created by the server. It is
 *        a required option
 *
 * device - id of a device on the server, a required option. Only PCI
 *          devices are presently supported.
 *
 * notes - x-vfio-user-server may block IO and the monitor during the
 *         initialization phase.
 *
 *         When the x-remote machine has the auto-shutdown property
 *         enabled (default), x-vfio-user-server terminates after the last
 *         client disconnects. Otherwise, it continues running until
 *         explicitly killed.
 */
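
/*
 * Example invocation (illustrative only - the device, ids, and socket
 * path below are arbitrary):
 *
 *     qemu-system-x86_64 -machine x-remote,vfio-user=on,auto-shutdown=on \
 *         -device lsi53c895a,id=lsi1 \
 *         -object x-vfio-user-server,id=vfu1,type=unix,\
 *                 path=/tmp/vfu1.sock,device=lsi1
 */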

#include "qemu/osdep.h"

#include "qom/object.h"
#include "qom/object_interfaces.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "sysemu/runstate.h"
#include "hw/boards.h"
#include "hw/remote/machine.h"
#include "qapi/error.h"
#include "qapi/qapi-visit-sockets.h"
#include "qapi/qapi-events-misc.h"
#include "qemu/notify.h"
#include "qemu/thread.h"
#include "qemu/main-loop.h"
#include "sysemu/sysemu.h"
#include "libvfio-user.h"
#include "hw/qdev-core.h"
#include "hw/pci/pci.h"
#include "qemu/timer.h"
#include "exec/memory.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/remote/vfio-user-obj.h"

#define TYPE_VFU_OBJECT "x-vfio-user-server"
OBJECT_DECLARE_TYPE(VfuObject, VfuObjectClass, VFU_OBJECT)

/**
 * VFU_OBJECT_ERROR - reports an error message.
 *
 * If auto_shutdown is set, it aborts the machine on error. Otherwise,
 * it logs an error message without aborting. auto_shutdown is disabled
 * when the server serves clients from multiple VMs; as such, an error
 * from one VM shouldn't be able to disrupt other VMs' services.
 */
#define VFU_OBJECT_ERROR(o, fmt, ...)                         \
    {                                                         \
        if (vfu_object_auto_shutdown()) {                     \
            error_setg(&error_abort, (fmt), ## __VA_ARGS__);  \
        } else {                                              \
            error_report((fmt), ## __VA_ARGS__);              \
        }                                                     \
    }                                                         \

struct VfuObjectClass {
    ObjectClass parent_class;

    /* Number of active x-vfio-user-server instances */
    unsigned int nr_devs;
};

struct VfuObject {
    /* private */
    Object parent;

    SocketAddress *socket;

    char *device;

    /* Error from instance_init, propagated by vfu_object_init_ctx() */
    Error *err;

    Notifier machine_done;

    vfu_ctx_t *vfu_ctx;

    PCIDevice *pci_dev;

    Error *unplug_blocker;

    /* FD polled by the main loop for incoming vfio-user messages */
    int vfu_poll_fd;

    /* Default PCIDevice MSI callbacks, restored when the server goes away */
    MSITriggerFunc *default_msi_trigger;
    MSIPrepareMessageFunc *default_msi_prepare_message;
    MSIxPrepareMessageFunc *default_msix_prepare_message;
};

static void vfu_object_init_ctx(VfuObject *o, Error **errp);

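/*
 * Returns the machine's auto-shutdown property; defaults to true if the
 * property or the machine itself isn't available.
 */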
static bool vfu_object_auto_shutdown(void)
{
    bool auto_shutdown = true;
    Error *local_err = NULL;

    if (!current_machine) {
        return auto_shutdown;
    }

    auto_shutdown = object_property_get_bool(OBJECT(current_machine),
                                             "auto-shutdown",
                                             &local_err);

    /*
     * local_err would be set if no such property exists - safe to ignore.
     * Unlikely scenario as auto-shutdown is always defined for
     * TYPE_REMOTE_MACHINE, and TYPE_VFU_OBJECT only works with
     * TYPE_REMOTE_MACHINE
     */
    if (local_err) {
        auto_shutdown = true;
        error_free(local_err);
    }

    return auto_shutdown;
}

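/*
 * Setter for the "socket" property. Only unix sockets are supported; on
 * success, attempts server initialization via vfu_object_init_ctx().
 */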
static void vfu_object_set_socket(Object *obj, Visitor *v, const char *name,
                                  void *opaque, Error **errp)
{
    VfuObject *o = VFU_OBJECT(obj);

    if (o->vfu_ctx) {
        error_setg(errp, "vfu: Unable to set socket property - server busy");
        return;
    }

    qapi_free_SocketAddress(o->socket);

    o->socket = NULL;

    if (!visit_type_SocketAddress(v, name, &o->socket, errp)) {
        return;
    }

    if (o->socket->type != SOCKET_ADDRESS_TYPE_UNIX) {
        error_setg(errp, "vfu: Unsupported socket type - %s",
                   SocketAddressType_str(o->socket->type));
        qapi_free_SocketAddress(o->socket);
        o->socket = NULL;
        return;
    }

    trace_vfu_prop("socket", o->socket->u.q_unix.path);

    vfu_object_init_ctx(o, errp);
}

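/*
 * Setter for the "device" property: the qdev id of the PCI device to be
 * exported; on success, attempts server initialization.
 */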
static void vfu_object_set_device(Object *obj, const char *str, Error **errp)
{
    VfuObject *o = VFU_OBJECT(obj);

    if (o->vfu_ctx) {
        error_setg(errp, "vfu: Unable to set device property - server busy");
        return;
    }

    g_free(o->device);

    o->device = g_strdup(str);

    trace_vfu_prop("device", str);

    vfu_object_init_ctx(o, errp);
}

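/*
 * Main-loop FD handler: processes vfio-user messages pending on the
 * context's socket. On client hangup (ENOTCONN), sends the
 * VFU_CLIENT_HANGUP QAPI event and unparents this object.
 */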
static void vfu_object_ctx_run(void *opaque)
{
    VfuObject *o = opaque;
    const char *vfu_id;
    char *vfu_path, *pci_dev_path;
    int ret = -1;

    while (ret != 0) {
        ret = vfu_run_ctx(o->vfu_ctx);
        if (ret < 0) {
            if (errno == EINTR) {
                continue;
            } else if (errno == ENOTCONN) {
                vfu_id = object_get_canonical_path_component(OBJECT(o));
                vfu_path = object_get_canonical_path(OBJECT(o));
                g_assert(o->pci_dev);
                pci_dev_path = object_get_canonical_path(OBJECT(o->pci_dev));
                /* o->device is a required property and is non-NULL here */
                g_assert(o->device);
                qapi_event_send_vfu_client_hangup(vfu_id, vfu_path,
                                                  o->device, pci_dev_path);
                qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
                o->vfu_poll_fd = -1;
                object_unparent(OBJECT(o));
                g_free(vfu_path);
                g_free(pci_dev_path);
                break;
            } else {
                VFU_OBJECT_ERROR(o, "vfu: Failed to run device %s - %s",
                                 o->device, strerror(errno));
                break;
            }
        }
    }
}

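/*
 * FD handler armed while waiting for a client: retries the non-blocking
 * vfu_attach_ctx() until a client connects, then hands the (possibly
 * new) poll fd over to vfu_object_ctx_run().
 */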
static void vfu_object_attach_ctx(void *opaque)
{
    VfuObject *o = opaque;
    GPollFD pfds[1];
    int ret;

    qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);

    pfds[0].fd = o->vfu_poll_fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

retry_attach:
    ret = vfu_attach_ctx(o->vfu_ctx);
    if (ret < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
        /**
         * vfu_object_attach_ctx can block QEMU's main loop
         * during attach - the monitor and other IO
         * could be unresponsive during this time.
         */
        (void)qemu_poll_ns(pfds, 1, 500 * (int64_t)SCALE_MS);
        goto retry_attach;
    } else if (ret < 0) {
        VFU_OBJECT_ERROR(o, "vfu: Failed to attach device %s to context - %s",
                         o->device, strerror(errno));
        return;
    }

    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
    if (o->vfu_poll_fd < 0) {
        VFU_OBJECT_ERROR(o, "vfu: Failed to get poll fd %s", o->device);
        return;
    }

    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_ctx_run, NULL, o);
}

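/*
 * Config space access callback: forwards client reads and writes to the
 * PCI device's config handlers, splitting accesses into chunks of at
 * most 4 bytes (the PCI access width).
 */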
static ssize_t vfu_object_cfg_access(vfu_ctx_t *vfu_ctx, char * const buf,
                                     size_t count, loff_t offset,
                                     const bool is_write)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    uint32_t pci_access_width = sizeof(uint32_t);
    size_t bytes = count;
    uint32_t val = 0;
    char *ptr = buf;
    int len;

    /*
     * Writes to the BAR registers would trigger an update to the
     * global Memory and IO AddressSpaces. But the remote device
     * never uses the global AddressSpaces, therefore overlapping
     * memory regions are not a problem
     */
    while (bytes > 0) {
        len = (bytes > pci_access_width) ? pci_access_width : bytes;
        if (is_write) {
            val = ldn_le_p(ptr, len);
            pci_host_config_write_common(o->pci_dev, offset,
                                         pci_config_size(o->pci_dev),
                                         val, len);
            trace_vfu_cfg_write(offset, val);
        } else {
            val = pci_host_config_read_common(o->pci_dev, offset,
                                              pci_config_size(o->pci_dev),
                                              len);
            stn_le_p(ptr, len, val);
            trace_vfu_cfg_read(offset, val);
        }
        offset += len;
        ptr += len;
        bytes -= len;
    }

    return count;
}

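/*
 * DMA map callback: makes client memory visible to the device by
 * wrapping the mapped virtual address in a RAM MemoryRegion and adding
 * it to the device's DMA address space at the client-provided IOVA.
 */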
static void dma_register(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    AddressSpace *dma_as = NULL;
    MemoryRegion *subregion = NULL;
    g_autofree char *name = NULL;
    struct iovec *iov = &info->iova;

    if (!info->vaddr) {
        return;
    }

    name = g_strdup_printf("mem-%s-%"PRIx64"", o->device,
                           (uint64_t)info->vaddr);

    subregion = g_new0(MemoryRegion, 1);

    memory_region_init_ram_ptr(subregion, NULL, name,
                               iov->iov_len, info->vaddr);

    dma_as = pci_device_iommu_address_space(o->pci_dev);

    memory_region_add_subregion(dma_as->root, (hwaddr)iov->iov_base, subregion);

    trace_vfu_dma_register((uint64_t)iov->iov_base, iov->iov_len);
}

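/*
 * DMA unmap callback: finds the MemoryRegion created by dma_register()
 * for this mapping and removes it from the device's DMA address space.
 */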
static void dma_unregister(vfu_ctx_t *vfu_ctx, vfu_dma_info_t *info)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    AddressSpace *dma_as = NULL;
    MemoryRegion *mr = NULL;
    ram_addr_t offset;

    mr = memory_region_from_host(info->vaddr, &offset);
    if (!mr) {
        return;
    }

    dma_as = pci_device_iommu_address_space(o->pci_dev);

    memory_region_del_subregion(dma_as->root, mr);

    object_unparent((OBJECT(mr)));

    trace_vfu_dma_unregister((uint64_t)info->iova.iov_base);
}

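/*
 * Access a MemoryRegion on behalf of the client: directly via memcpy
 * for RAM-backed regions, or through memory_region_dispatch_read/write()
 * for MMIO, splitting the access into sizes the region supports.
 * Returns 0 on success, -1 on failure.
 */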
static int vfu_object_mr_rw(MemoryRegion *mr, uint8_t *buf, hwaddr offset,
                            hwaddr size, const bool is_write)
{
    uint8_t *ptr = buf;
    bool release_lock = false;
    uint8_t *ram_ptr = NULL;
    MemTxResult result;
    int access_size;
    uint64_t val;

    if (memory_access_is_direct(mr, is_write)) {
        /**
         * Some devices expose a PCI expansion ROM, which could be buffer
         * based as compared to other regions which are primarily based on
         * MemoryRegionOps. memory_region_find() would already check
         * for buffer overflow, we don't need to repeat it here.
         */
        ram_ptr = memory_region_get_ram_ptr(mr);

        if (is_write) {
            memcpy((ram_ptr + offset), buf, size);
        } else {
            memcpy(buf, (ram_ptr + offset), size);
        }

        return 0;
    }

    while (size) {
        /**
         * The read/write logic used below is similar to the ones in
         * flatview_read/write_continue()
         */
        release_lock = prepare_mmio_access(mr);

        access_size = memory_access_size(mr, size, offset);

        if (is_write) {
            val = ldn_he_p(ptr, access_size);

            result = memory_region_dispatch_write(mr, offset, val,
                                                  size_memop(access_size),
                                                  MEMTXATTRS_UNSPECIFIED);
        } else {
            result = memory_region_dispatch_read(mr, offset, &val,
                                                 size_memop(access_size),
                                                 MEMTXATTRS_UNSPECIFIED);

            stn_he_p(ptr, access_size, val);
        }

        if (release_lock) {
            bql_unlock();
            release_lock = false;
        }

        if (result != MEMTX_OK) {
            return -1;
        }

        size -= access_size;
        ptr += access_size;
        offset += access_size;
    }

    return 0;
}

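/*
 * Walk the MemoryRegionSections backing [bar_offset, bar_offset + len)
 * of a BAR and forward each chunk to vfu_object_mr_rw(). Returns the
 * number of bytes successfully accessed.
 */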
static size_t vfu_object_bar_rw(PCIDevice *pci_dev, int pci_bar,
                                hwaddr bar_offset, char * const buf,
                                hwaddr len, const bool is_write)
{
    MemoryRegionSection section = { 0 };
    uint8_t *ptr = (uint8_t *)buf;
    MemoryRegion *section_mr = NULL;
    uint64_t section_size;
    hwaddr section_offset;
    hwaddr size = 0;

    while (len) {
        section = memory_region_find(pci_dev->io_regions[pci_bar].memory,
                                     bar_offset, len);

        if (!section.mr) {
            warn_report("vfu: invalid address 0x%"PRIx64"", bar_offset);
            return size;
        }

        section_mr = section.mr;
        section_offset = section.offset_within_region;
        section_size = int128_get64(section.size);

        if (is_write && section_mr->readonly) {
            warn_report("vfu: attempting to write to readonly region in "
                        "bar %d - [0x%"PRIx64" - 0x%"PRIx64"]",
                        pci_bar, bar_offset,
                        (bar_offset + section_size));
            memory_region_unref(section_mr);
            return size;
        }

        if (vfu_object_mr_rw(section_mr, ptr, section_offset,
                             section_size, is_write)) {
            warn_report("vfu: failed to %s "
                        "[0x%"PRIx64" - 0x%"PRIx64"] in bar %d",
                        is_write ? "write to" : "read from", bar_offset,
                        (bar_offset + section_size), pci_bar);
            memory_region_unref(section_mr);
            return size;
        }

        size += section_size;
        bar_offset += section_size;
        ptr += section_size;
        len -= section_size;

        memory_region_unref(section_mr);
    }

    return size;
}

/**
 * VFU_OBJECT_BAR_HANDLER - macro for defining handlers for PCI BARs.
 *
 * To create a handler for BAR number 2, VFU_OBJECT_BAR_HANDLER(2) would
 * define vfu_object_bar2_handler
 */
#define VFU_OBJECT_BAR_HANDLER(BAR_NO)                                        \
    static ssize_t vfu_object_bar##BAR_NO##_handler(vfu_ctx_t *vfu_ctx,       \
                                        char * const buf, size_t count,       \
                                        loff_t offset, const bool is_write)   \
    {                                                                         \
        VfuObject *o = vfu_get_private(vfu_ctx);                              \
        PCIDevice *pci_dev = o->pci_dev;                                      \
                                                                              \
        return vfu_object_bar_rw(pci_dev, BAR_NO, offset,                     \
                                 buf, count, is_write);                       \
    }                                                                         \

VFU_OBJECT_BAR_HANDLER(0)
VFU_OBJECT_BAR_HANDLER(1)
VFU_OBJECT_BAR_HANDLER(2)
VFU_OBJECT_BAR_HANDLER(3)
VFU_OBJECT_BAR_HANDLER(4)
VFU_OBJECT_BAR_HANDLER(5)
VFU_OBJECT_BAR_HANDLER(6)

static vfu_region_access_cb_t *vfu_object_bar_handlers[PCI_NUM_REGIONS] = {
    &vfu_object_bar0_handler,
    &vfu_object_bar1_handler,
    &vfu_object_bar2_handler,
    &vfu_object_bar3_handler,
    &vfu_object_bar4_handler,
    &vfu_object_bar5_handler,
    &vfu_object_bar6_handler,
};

/**
 * vfu_object_register_bars - Identify active BAR regions of pdev and setup
 * callbacks to handle read/write accesses
 */
static void vfu_object_register_bars(vfu_ctx_t *vfu_ctx, PCIDevice *pdev)
{
    int flags;
    int i;

    for (i = 0; i < PCI_NUM_REGIONS; i++) {
        if (!pdev->io_regions[i].size) {
            continue;
        }

        /*
         * Reset the flags for each region - a readonly region must not
         * strip the write flag from the regions that follow it.
         */
        flags = VFU_REGION_FLAG_RW;

        if ((i == VFU_PCI_DEV_ROM_REGION_IDX) ||
            pdev->io_regions[i].memory->readonly) {
            flags &= ~VFU_REGION_FLAG_WRITE;
        }

        vfu_setup_region(vfu_ctx, VFU_PCI_DEV_BAR0_REGION_IDX + i,
                         (size_t)pdev->io_regions[i].size,
                         vfu_object_bar_handlers[i],
                         flags, NULL, 0, -1, 0);

        trace_vfu_bar_register(i, pdev->io_regions[i].addr,
                               pdev->io_regions[i].size);
    }
}

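/*
 * INTx map callback: encode the device's bus number and devfn into the
 * pirq number so vfu_object_set_irq() can find the device again.
 */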
static int vfu_object_map_irq(PCIDevice *pci_dev, int intx)
{
    int pci_bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
                                pci_dev->devfn);

    return pci_bdf;
}

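/*
 * INTx handler: decode the BDF-encoded pirq back into a device and
 * forward the interrupt to that device's client over its vfio-user
 * context.
 */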
static void vfu_object_set_irq(void *opaque, int pirq, int level)
{
    PCIBus *pci_bus = opaque;
    PCIDevice *pci_dev = NULL;
    vfu_ctx_t *vfu_ctx = NULL;
    int pci_bus_num, devfn;

    if (level) {
        pci_bus_num = PCI_BUS_NUM(pirq);
        devfn = PCI_BDF_TO_DEVFN(pirq);

        /*
         * pci_find_device() performs at O(1) if the device is attached
         * to the root PCI bus. Whereas, if the device is attached to a
         * secondary PCI bus (such as when a root port is involved),
         * finding the parent PCI bus could take O(n)
         */
        pci_dev = pci_find_device(pci_bus, pci_bus_num, devfn);

        vfu_ctx = pci_dev->irq_opaque;

        g_assert(vfu_ctx);

        vfu_irq_trigger(vfu_ctx, 0);
    }
}

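/*
 * The client doesn't consume real MSI messages; the vector number is
 * carried in the message data and the address is left as 0.
 */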
static MSIMessage vfu_object_msi_prepare_msg(PCIDevice *pci_dev,
                                             unsigned int vector)
{
    MSIMessage msg;

    msg.address = 0;
    msg.data = vector;

    return msg;
}

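/* Deliver an MSI/MSI-X vector to the client as a vfio-user interrupt. */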
static void vfu_object_msi_trigger(PCIDevice *pci_dev, MSIMessage msg)
{
    vfu_ctx_t *vfu_ctx = pci_dev->irq_opaque;

    vfu_irq_trigger(vfu_ctx, msg.data);
}

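/*
 * Save the PCIDevice's default MSI callbacks and install our own, so
 * that the device's interrupts are routed to the vfio-user client.
 */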
static void vfu_object_setup_msi_cbs(VfuObject *o)
{
    o->default_msi_trigger = o->pci_dev->msi_trigger;
    o->default_msi_prepare_message = o->pci_dev->msi_prepare_message;
    o->default_msix_prepare_message = o->pci_dev->msix_prepare_message;

    o->pci_dev->msi_trigger = vfu_object_msi_trigger;
    o->pci_dev->msi_prepare_message = vfu_object_msi_prepare_msg;
    o->pci_dev->msix_prepare_message = vfu_object_msi_prepare_msg;
}

static void vfu_object_restore_msi_cbs(VfuObject *o)
{
    o->pci_dev->msi_trigger = o->default_msi_trigger;
    o->pci_dev->msi_prepare_message = o->default_msi_prepare_message;
    o->pci_dev->msix_prepare_message = o->default_msix_prepare_message;
}

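/*
 * IRQ state callbacks, invoked when the client masks or unmasks a range
 * of MSI-X/MSI vectors starting at 'start'.
 */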
static void vfu_msix_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
                               uint32_t count, bool mask)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    uint32_t vector;

    for (vector = start; vector < start + count; vector++) {
        msix_set_mask(o->pci_dev, vector, mask);
    }
}

static void vfu_msi_irq_state(vfu_ctx_t *vfu_ctx, uint32_t start,
                              uint32_t count, bool mask)
{
    VfuObject *o = vfu_get_private(vfu_ctx);
    Error *err = NULL;
    uint32_t vector;

    for (vector = start; vector < start + count; vector++) {
        msi_set_mask(o->pci_dev, vector, mask, &err);
        if (err) {
            VFU_OBJECT_ERROR(o, "vfu: %s: %s", o->device,
                             error_get_pretty(err));
            error_free(err);
            err = NULL;
        }
    }
}

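/*
 * Register the device's IRQs with the context: one INTx line, plus
 * MSI-X or MSI vectors if the device has any allocated, and take over
 * the device's MSI callbacks.
 */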
static int vfu_object_setup_irqs(VfuObject *o, PCIDevice *pci_dev)
{
    vfu_ctx_t *vfu_ctx = o->vfu_ctx;
    int ret;

    ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_INTX_IRQ, 1);
    if (ret < 0) {
        return ret;
    }

    if (msix_nr_vectors_allocated(pci_dev)) {
        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSIX_IRQ,
                                       msix_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSIX_IRQ,
                                     &vfu_msix_irq_state);
    } else if (msi_nr_vectors_allocated(pci_dev)) {
        ret = vfu_setup_device_nr_irqs(vfu_ctx, VFU_DEV_MSI_IRQ,
                                       msi_nr_vectors_allocated(pci_dev));
        vfu_setup_irq_state_callback(vfu_ctx, VFU_DEV_MSI_IRQ,
                                     &vfu_msi_irq_state);
    }

    if (ret < 0) {
        return ret;
    }

    vfu_object_setup_msi_cbs(o);

    pci_dev->irq_opaque = vfu_ctx;

    return 0;
}

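/*
 * Route INTx for every possible device on pci_bus through
 * vfu_object_set_irq(), using the BDF-based pirq numbers produced by
 * vfu_object_map_irq().
 */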
void vfu_object_set_bus_irq(PCIBus *pci_bus)
{
    int bus_num = pci_bus_num(pci_bus);
    int max_bdf = PCI_BUILD_BDF(bus_num, PCI_DEVFN_MAX - 1);

    pci_bus_irqs(pci_bus, vfu_object_set_irq, pci_bus, max_bdf);
    pci_bus_map_irqs(pci_bus, vfu_object_map_irq);
}

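/* Reset callback: cold-reset the PCI device on a client's reset request. */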
static int vfu_object_device_reset(vfu_ctx_t *vfu_ctx, vfu_reset_type_t type)
{
    VfuObject *o = vfu_get_private(vfu_ctx);

    /* vfu_object_ctx_run() handles lost connection */
    if (type == VFU_RESET_LOST_CONN) {
        return 0;
    }

    device_cold_reset(DEVICE(o->pci_dev));

    return 0;
}

/*
 * TYPE_VFU_OBJECT depends on the availability of the 'socket' and 'device'
 * properties. It also depends on devices instantiated in QEMU. These
 * dependencies are not available during the instance_init phase of this
 * object's life-cycle. As such, the server is initialized after the
 * machine is set up. machine_init_done_notifier notifies TYPE_VFU_OBJECT
 * when the machine is set up, and the dependencies are available.
 */
static void vfu_object_machine_done(Notifier *notifier, void *data)
{
    VfuObject *o = container_of(notifier, VfuObject, machine_done);
    Error *err = NULL;

    vfu_object_init_ctx(o, &err);

    if (err) {
        error_propagate(&error_abort, err);
    }
}

/**
 * vfu_object_init_ctx: Create and initialize the libvfio-user context. Add
 * an unplug blocker for the associated PCI device. Set up an FD handler
 * to process incoming messages on the context's socket.
 *
 * The socket and device properties are mandatory, and this function
 * will not create the context without them - the setters for these
 * properties should call this function when the property is set. The
 * machine should also be ready when this function is invoked - this is
 * because QEMU objects are initialized before devices, and the
 * associated PCI device wouldn't be available at the object
 * initialization time. Until these conditions are satisfied, this
 * function returns early without performing any task.
 */
static void vfu_object_init_ctx(VfuObject *o, Error **errp)
{
    DeviceState *dev = NULL;
    vfu_pci_type_t pci_type = VFU_PCI_TYPE_CONVENTIONAL;
    int ret;

    if (o->vfu_ctx || !o->socket || !o->device ||
            !phase_check(PHASE_MACHINE_READY)) {
        return;
    }

    if (o->err) {
        error_propagate(errp, o->err);
        o->err = NULL;
        return;
    }

    o->vfu_ctx = vfu_create_ctx(VFU_TRANS_SOCK, o->socket->u.q_unix.path,
                                LIBVFIO_USER_FLAG_ATTACH_NB,
                                o, VFU_DEV_TYPE_PCI);
    if (o->vfu_ctx == NULL) {
        error_setg(errp, "vfu: Failed to create context - %s", strerror(errno));
        return;
    }

    dev = qdev_find_recursive(sysbus_get_default(), o->device);
    if (dev == NULL) {
        error_setg(errp, "vfu: Device %s not found", o->device);
        goto fail;
    }

    if (!object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        error_setg(errp, "vfu: %s not a PCI device", o->device);
        goto fail;
    }

    o->pci_dev = PCI_DEVICE(dev);

    object_ref(OBJECT(o->pci_dev));

    if (pci_is_express(o->pci_dev)) {
        pci_type = VFU_PCI_TYPE_EXPRESS;
    }

    ret = vfu_pci_init(o->vfu_ctx, pci_type, PCI_HEADER_TYPE_NORMAL, 0);
    if (ret < 0) {
        error_setg(errp,
                   "vfu: Failed to attach PCI device %s to context - %s",
                   o->device, strerror(errno));
        goto fail;
    }

    error_setg(&o->unplug_blocker,
               "vfu: %s for %s must be deleted before unplugging",
               TYPE_VFU_OBJECT, o->device);
    qdev_add_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);

    ret = vfu_setup_region(o->vfu_ctx, VFU_PCI_DEV_CFG_REGION_IDX,
                           pci_config_size(o->pci_dev), &vfu_object_cfg_access,
                           VFU_REGION_FLAG_RW | VFU_REGION_FLAG_ALWAYS_CB,
                           NULL, 0, -1, 0);
    if (ret < 0) {
        error_setg(errp,
                   "vfu: Failed to setup config space handlers for %s - %s",
                   o->device, strerror(errno));
        goto fail;
    }

    ret = vfu_setup_device_dma(o->vfu_ctx, &dma_register, &dma_unregister);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup DMA handlers for %s",
                   o->device);
        goto fail;
    }

    vfu_object_register_bars(o->vfu_ctx, o->pci_dev);

    ret = vfu_object_setup_irqs(o, o->pci_dev);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup interrupts for %s",
                   o->device);
        goto fail;
    }

    ret = vfu_setup_device_reset_cb(o->vfu_ctx, &vfu_object_device_reset);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to setup reset callback");
        goto fail;
    }

    ret = vfu_realize_ctx(o->vfu_ctx);
    if (ret < 0) {
        error_setg(errp, "vfu: Failed to realize device %s - %s",
                   o->device, strerror(errno));
        goto fail;
    }

    o->vfu_poll_fd = vfu_get_poll_fd(o->vfu_ctx);
    if (o->vfu_poll_fd < 0) {
        error_setg(errp, "vfu: Failed to get poll fd %s", o->device);
        goto fail;
    }

    qemu_set_fd_handler(o->vfu_poll_fd, vfu_object_attach_ctx, NULL, o);

    return;

fail:
    vfu_destroy_ctx(o->vfu_ctx);
    if (o->unplug_blocker && o->pci_dev) {
        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
        error_free(o->unplug_blocker);
        o->unplug_blocker = NULL;
    }
    if (o->pci_dev) {
        vfu_object_restore_msi_cbs(o);
        o->pci_dev->irq_opaque = NULL;
        object_unref(OBJECT(o->pci_dev));
        o->pci_dev = NULL;
    }
    o->vfu_ctx = NULL;
}

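/*
 * instance_init: check that we are running on an x-remote machine and
 * defer server initialization to the machine-init-done notifier if the
 * machine isn't ready yet.
 */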
static void vfu_object_init(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs++;

    if (!object_dynamic_cast(OBJECT(current_machine), TYPE_REMOTE_MACHINE)) {
        error_setg(&o->err, "vfu: %s only compatible with %s machine",
                   TYPE_VFU_OBJECT, TYPE_REMOTE_MACHINE);
        return;
    }

    if (!phase_check(PHASE_MACHINE_READY)) {
        o->machine_done.notify = vfu_object_machine_done;
        qemu_add_machine_init_done_notifier(&o->machine_done);
    }

    o->vfu_poll_fd = -1;
}

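/*
 * instance_finalize: tear down the context, release the PCI device, and
 * request shutdown once the last server instance is gone, provided
 * auto-shutdown is enabled.
 */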
static void vfu_object_finalize(Object *obj)
{
    VfuObjectClass *k = VFU_OBJECT_GET_CLASS(obj);
    VfuObject *o = VFU_OBJECT(obj);

    k->nr_devs--;

    qapi_free_SocketAddress(o->socket);

    o->socket = NULL;

    if (o->vfu_poll_fd != -1) {
        qemu_set_fd_handler(o->vfu_poll_fd, NULL, NULL, NULL);
        o->vfu_poll_fd = -1;
    }

    if (o->vfu_ctx) {
        vfu_destroy_ctx(o->vfu_ctx);
        o->vfu_ctx = NULL;
    }

    g_free(o->device);

    o->device = NULL;

    if (o->unplug_blocker && o->pci_dev) {
        qdev_del_unplug_blocker(DEVICE(o->pci_dev), o->unplug_blocker);
        error_free(o->unplug_blocker);
        o->unplug_blocker = NULL;
    }

    if (o->pci_dev) {
        vfu_object_restore_msi_cbs(o);
        o->pci_dev->irq_opaque = NULL;
        object_unref(OBJECT(o->pci_dev));
        o->pci_dev = NULL;
    }

    if (!k->nr_devs && vfu_object_auto_shutdown()) {
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
    }

    if (o->machine_done.notify) {
        qemu_remove_machine_init_done_notifier(&o->machine_done);
        o->machine_done.notify = NULL;
    }
}

static void vfu_object_class_init(ObjectClass *klass, void *data)
{
    VfuObjectClass *k = VFU_OBJECT_CLASS(klass);

    k->nr_devs = 0;

    object_class_property_add(klass, "socket", "SocketAddress", NULL,
                              vfu_object_set_socket, NULL, NULL);
    object_class_property_set_description(klass, "socket",
                                          "SocketAddress "
                                          "(ex: type=unix,path=/tmp/sock). "
                                          "Only UNIX is presently supported");
    object_class_property_add_str(klass, "device", NULL,
                                  vfu_object_set_device);
    object_class_property_set_description(klass, "device",
                                          "device ID - only PCI devices "
                                          "are presently supported");
}

static const TypeInfo vfu_object_info = {
    .name = TYPE_VFU_OBJECT,
    .parent = TYPE_OBJECT,
    .instance_size = sizeof(VfuObject),
    .instance_init = vfu_object_init,
    .instance_finalize = vfu_object_finalize,
    .class_size = sizeof(VfuObjectClass),
    .class_init = vfu_object_class_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};

static void vfu_register_types(void)
{
    type_register_static(&vfu_object_info);
}

type_init(vfu_register_types);
959