xref: /openbmc/qemu/hw/misc/ivshmem.c (revision 09a274d8)
1 /*
2  * Inter-VM Shared Memory PCI device.
3  *
4  * Author:
5  *      Cam Macdonell <cam@cs.ualberta.ca>
6  *
7  * Based On: cirrus_vga.c
8  *          Copyright (c) 2004 Fabrice Bellard
9  *          Copyright (c) 2004 Makoto Suzuki (suzu)
10  *
11  *      and rtl8139.c
12  *          Copyright (c) 2006 Igor Kovalenko
13  *
14  * This code is licensed under the GNU GPL v2.
15  *
16  * Contributions after 2012-01-13 are licensed under the terms of the
17  * GNU GPL, version 2 or (at your option) any later version.
18  */
19 #include "qemu/osdep.h"
20 #include "qemu/units.h"
21 #include "qapi/error.h"
22 #include "qemu/cutils.h"
23 #include "hw/hw.h"
24 #include "hw/pci/pci.h"
25 #include "hw/pci/msi.h"
26 #include "hw/pci/msix.h"
27 #include "sysemu/kvm.h"
28 #include "migration/blocker.h"
29 #include "qemu/error-report.h"
30 #include "qemu/event_notifier.h"
31 #include "qom/object_interfaces.h"
32 #include "chardev/char-fe.h"
33 #include "sysemu/hostmem.h"
34 #include "sysemu/qtest.h"
35 #include "qapi/visitor.h"
36 
37 #include "hw/misc/ivshmem.h"
38 
/* PCI identification of the device (vendor ID is the Red Hat/Qumranet one). */
#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM   0x1110

/* Largest peer ID accepted in a server message, see process_msg(). */
#define IVSHMEM_MAX_PEERS UINT16_MAX
/* Bit numbers within IVShmemState.features, see ivshmem_has_feature(). */
#define IVSHMEM_IOEVENTFD   0
#define IVSHMEM_MSI     1

/* Size in bytes of the register region exposed as BAR 0. */
#define IVSHMEM_REG_BAR_SIZE 0x100
47 
/*
 * Debug tracing.  Set IVSHMEM_DEBUG to 1 to have IVSHMEM_DPRINTF() print
 * its message, prefixed with "IVSHMEM: ", via printf().  The printf() call
 * sits inside an ordinary if, so the format string and arguments are still
 * compiled (and type-checked) when tracing is switched off.
 */
#define IVSHMEM_DEBUG 0
#define IVSHMEM_DPRINTF(fmt, ...)                       \
    do {                                                \
        if (IVSHMEM_DEBUG) {                            \
            printf("IVSHMEM: " fmt, ## __VA_ARGS__);    \
        }                                               \
    } while (0)
55 
/*
 * QOM type names of the device variants, each with a checked-cast macro.
 * All casts yield an IVShmemState pointer; they differ only in the type
 * name passed to OBJECT_CHECK().
 */

/* Base type shared by the variants (registered as abstract below). */
#define TYPE_IVSHMEM_COMMON "ivshmem-common"
#define IVSHMEM_COMMON(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_COMMON)

#define TYPE_IVSHMEM_PLAIN "ivshmem-plain"
#define IVSHMEM_PLAIN(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_PLAIN)

#define TYPE_IVSHMEM_DOORBELL "ivshmem-doorbell"
#define IVSHMEM_DOORBELL(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_DOORBELL)

#define TYPE_IVSHMEM "ivshmem"
#define IVSHMEM(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)
71 
/* Per-peer interrupt state, indexed by peer ID in IVShmemState.peers[]. */
typedef struct Peer {
    /* number of eventfds received for this peer so far */
    int nb_eventfds;
    /* array with room for IVShmemState.vectors notifiers, see resize_peers() */
    EventNotifier *eventfds;
} Peer;
76 
/* Per-vector bookkeeping for interrupts delivered to this device. */
typedef struct MSIVector {
    /* owning device; NULL while the vector has no handler or KVM route */
    PCIDevice *pdev;
    /* value returned by kvm_irqchip_add_msi_route() for this vector */
    int virq;
    /* set by ivshmem_vector_unmask(), cleared by ivshmem_vector_mask() */
    bool unmasked;
} MSIVector;
82 
/* Device state shared by all ivshmem variants. */
typedef struct IVShmemState {
    /*< private >*/
    PCIDevice parent_obj;
    /*< public >*/

    /* bit mask of IVSHMEM_IOEVENTFD / IVSHMEM_MSI, see ivshmem_has_feature() */
    uint32_t features;

    /* exactly one of these two may be set */
    HostMemoryBackend *hostmem; /* without interrupts */
    CharBackend server_chr; /* with interrupts */

    /* registers */
    uint32_t intrmask;
    uint32_t intrstatus;
    int vm_id;                  /* our peer ID, read through IVPOSITION */

    /* BARs */
    MemoryRegion ivshmem_mmio;  /* BAR 0 (registers) */
    MemoryRegion *ivshmem_bar2; /* BAR 2 (shared memory) */
    MemoryRegion server_bar2;   /* used with server_chr */

    /* interrupt support */
    Peer *peers;
    int nb_peers;               /* space in @peers[] */
    uint32_t vectors;
    MSIVector *msi_vectors;     /* @vectors entries, see setup_interrupts */
    uint64_t msg_buf;           /* buffer for receiving server messages */
    int msg_buffered_bytes;     /* #bytes in @msg_buf */

    /* migration stuff */
    OnOffAuto master;
    Error *migration_blocker;   /* set in realize when not master */
} IVShmemState;
116 
/*
 * registers for the Inter-VM shared memory device
 *
 * Values are byte offsets into BAR 0.
 */
enum ivshmem_registers {
    INTRMASK = 0,       /* read/write interrupt mask */
    INTRSTATUS = 4,     /* interrupt status; reading it clears it */
    IVPOSITION = 8,     /* reads return this device's vm_id */
    DOORBELL = 12,      /* write (peer ID << 16) | vector to notify a peer */
};
124 
125 static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
126                                                     unsigned int feature) {
127     return (ivs->features & (1 << feature));
128 }
129 
130 static inline bool ivshmem_is_master(IVShmemState *s)
131 {
132     assert(s->master != ON_OFF_AUTO_AUTO);
133     return s->master == ON_OFF_AUTO_ON;
134 }
135 
136 static void ivshmem_update_irq(IVShmemState *s)
137 {
138     PCIDevice *d = PCI_DEVICE(s);
139     uint32_t isr = s->intrstatus & s->intrmask;
140 
141     /*
142      * Do nothing unless the device actually uses INTx.  Here's how
143      * the device variants signal interrupts, what they put in PCI
144      * config space:
145      * Device variant    Interrupt  Interrupt Pin  MSI-X cap.
146      * ivshmem-plain         none            0         no
147      * ivshmem-doorbell     MSI-X            1        yes(1)
148      * ivshmem,msi=off       INTx            1         no
149      * ivshmem,msi=on       MSI-X            1(2)     yes(1)
150      * (1) if guest enabled MSI-X
151      * (2) the device lies
152      * Leads to the condition for doing nothing:
153      */
154     if (ivshmem_has_feature(s, IVSHMEM_MSI)
155         || !d->config[PCI_INTERRUPT_PIN]) {
156         return;
157     }
158 
159     /* don't print ISR resets */
160     if (isr) {
161         IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
162                         isr ? 1 : 0, s->intrstatus, s->intrmask);
163     }
164 
165     pci_set_irq(d, isr != 0);
166 }
167 
168 static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
169 {
170     IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
171 
172     s->intrmask = val;
173     ivshmem_update_irq(s);
174 }
175 
176 static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
177 {
178     uint32_t ret = s->intrmask;
179 
180     IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
181     return ret;
182 }
183 
184 static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
185 {
186     IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
187 
188     s->intrstatus = val;
189     ivshmem_update_irq(s);
190 }
191 
192 static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
193 {
194     uint32_t ret = s->intrstatus;
195 
196     /* reading ISR clears all interrupts */
197     s->intrstatus = 0;
198     ivshmem_update_irq(s);
199     return ret;
200 }
201 
202 static void ivshmem_io_write(void *opaque, hwaddr addr,
203                              uint64_t val, unsigned size)
204 {
205     IVShmemState *s = opaque;
206 
207     uint16_t dest = val >> 16;
208     uint16_t vector = val & 0xff;
209 
210     addr &= 0xfc;
211 
212     IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);
213     switch (addr)
214     {
215         case INTRMASK:
216             ivshmem_IntrMask_write(s, val);
217             break;
218 
219         case INTRSTATUS:
220             ivshmem_IntrStatus_write(s, val);
221             break;
222 
223         case DOORBELL:
224             /* check that dest VM ID is reasonable */
225             if (dest >= s->nb_peers) {
226                 IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
227                 break;
228             }
229 
230             /* check doorbell range */
231             if (vector < s->peers[dest].nb_eventfds) {
232                 IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", dest, vector);
233                 event_notifier_set(&s->peers[dest].eventfds[vector]);
234             } else {
235                 IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
236                                 vector, dest);
237             }
238             break;
239         default:
240             IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
241     }
242 }
243 
244 static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
245                                 unsigned size)
246 {
247 
248     IVShmemState *s = opaque;
249     uint32_t ret;
250 
251     switch (addr)
252     {
253         case INTRMASK:
254             ret = ivshmem_IntrMask_read(s);
255             break;
256 
257         case INTRSTATUS:
258             ret = ivshmem_IntrStatus_read(s);
259             break;
260 
261         case IVPOSITION:
262             ret = s->vm_id;
263             break;
264 
265         default:
266             IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
267             ret = 0;
268     }
269 
270     return ret;
271 }
272 
/*
 * Access callbacks for the register BAR.  The handlers are implemented
 * for 4-byte accesses only (.impl min and max access size are both 4).
 */
static const MemoryRegionOps ivshmem_mmio_ops = {
    .read = ivshmem_io_read,
    .write = ivshmem_io_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};
282 
283 static void ivshmem_vector_notify(void *opaque)
284 {
285     MSIVector *entry = opaque;
286     PCIDevice *pdev = entry->pdev;
287     IVShmemState *s = IVSHMEM_COMMON(pdev);
288     int vector = entry - s->msi_vectors;
289     EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
290 
291     if (!event_notifier_test_and_clear(n)) {
292         return;
293     }
294 
295     IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
296     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
297         if (msix_enabled(pdev)) {
298             msix_notify(pdev, vector);
299         }
300     } else {
301         ivshmem_IntrStatus_write(s, 1);
302     }
303 }
304 
305 static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
306                                  MSIMessage msg)
307 {
308     IVShmemState *s = IVSHMEM_COMMON(dev);
309     EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
310     MSIVector *v = &s->msi_vectors[vector];
311     int ret;
312 
313     IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);
314     if (!v->pdev) {
315         error_report("ivshmem: vector %d route does not exist", vector);
316         return -EINVAL;
317     }
318     assert(!v->unmasked);
319 
320     ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev);
321     if (ret < 0) {
322         return ret;
323     }
324     kvm_irqchip_commit_routes(kvm_state);
325 
326     ret = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq);
327     if (ret < 0) {
328         return ret;
329     }
330     v->unmasked = true;
331 
332     return 0;
333 }
334 
335 static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
336 {
337     IVShmemState *s = IVSHMEM_COMMON(dev);
338     EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
339     MSIVector *v = &s->msi_vectors[vector];
340     int ret;
341 
342     IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);
343     if (!v->pdev) {
344         error_report("ivshmem: vector %d route does not exist", vector);
345         return;
346     }
347     assert(v->unmasked);
348 
349     ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n, v->virq);
350     if (ret < 0) {
351         error_report("remove_irqfd_notifier_gsi failed");
352         return;
353     }
354     v->unmasked = false;
355 }
356 
357 static void ivshmem_vector_poll(PCIDevice *dev,
358                                 unsigned int vector_start,
359                                 unsigned int vector_end)
360 {
361     IVShmemState *s = IVSHMEM_COMMON(dev);
362     unsigned int vector;
363 
364     IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);
365 
366     vector_end = MIN(vector_end, s->vectors);
367 
368     for (vector = vector_start; vector < vector_end; vector++) {
369         EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];
370 
371         if (!msix_is_masked(dev, vector)) {
372             continue;
373         }
374 
375         if (event_notifier_test_and_clear(notifier)) {
376             msix_set_pending(dev, vector);
377         }
378     }
379 }
380 
381 static void watch_vector_notifier(IVShmemState *s, EventNotifier *n,
382                                  int vector)
383 {
384     int eventfd = event_notifier_get_fd(n);
385 
386     assert(!s->msi_vectors[vector].pdev);
387     s->msi_vectors[vector].pdev = PCI_DEVICE(s);
388 
389     qemu_set_fd_handler(eventfd, ivshmem_vector_notify,
390                         NULL, &s->msi_vectors[vector]);
391 }
392 
393 static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
394 {
395     memory_region_add_eventfd(&s->ivshmem_mmio,
396                               DOORBELL,
397                               4,
398                               true,
399                               (posn << 16) | i,
400                               &s->peers[posn].eventfds[i]);
401 }
402 
403 static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
404 {
405     memory_region_del_eventfd(&s->ivshmem_mmio,
406                               DOORBELL,
407                               4,
408                               true,
409                               (posn << 16) | i,
410                               &s->peers[posn].eventfds[i]);
411 }
412 
413 static void close_peer_eventfds(IVShmemState *s, int posn)
414 {
415     int i, n;
416 
417     assert(posn >= 0 && posn < s->nb_peers);
418     n = s->peers[posn].nb_eventfds;
419 
420     if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
421         memory_region_transaction_begin();
422         for (i = 0; i < n; i++) {
423             ivshmem_del_eventfd(s, posn, i);
424         }
425         memory_region_transaction_commit();
426     }
427 
428     for (i = 0; i < n; i++) {
429         event_notifier_cleanup(&s->peers[posn].eventfds[i]);
430     }
431 
432     g_free(s->peers[posn].eventfds);
433     s->peers[posn].nb_eventfds = 0;
434 }
435 
436 static void resize_peers(IVShmemState *s, int nb_peers)
437 {
438     int old_nb_peers = s->nb_peers;
439     int i;
440 
441     assert(nb_peers > old_nb_peers);
442     IVSHMEM_DPRINTF("bumping storage to %d peers\n", nb_peers);
443 
444     s->peers = g_realloc(s->peers, nb_peers * sizeof(Peer));
445     s->nb_peers = nb_peers;
446 
447     for (i = old_nb_peers; i < nb_peers; i++) {
448         s->peers[i].eventfds = g_new0(EventNotifier, s->vectors);
449         s->peers[i].nb_eventfds = 0;
450     }
451 }
452 
453 static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
454                                      Error **errp)
455 {
456     PCIDevice *pdev = PCI_DEVICE(s);
457     int ret;
458 
459     IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
460     assert(!s->msi_vectors[vector].pdev);
461 
462     ret = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
463     if (ret < 0) {
464         error_setg(errp, "kvm_irqchip_add_msi_route failed");
465         return;
466     }
467 
468     s->msi_vectors[vector].virq = ret;
469     s->msi_vectors[vector].pdev = pdev;
470 }
471 
472 static void setup_interrupt(IVShmemState *s, int vector, Error **errp)
473 {
474     EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
475     bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
476         ivshmem_has_feature(s, IVSHMEM_MSI);
477     PCIDevice *pdev = PCI_DEVICE(s);
478     Error *err = NULL;
479 
480     IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);
481 
482     if (!with_irqfd) {
483         IVSHMEM_DPRINTF("with eventfd\n");
484         watch_vector_notifier(s, n, vector);
485     } else if (msix_enabled(pdev)) {
486         IVSHMEM_DPRINTF("with irqfd\n");
487         ivshmem_add_kvm_msi_virq(s, vector, &err);
488         if (err) {
489             error_propagate(errp, err);
490             return;
491         }
492 
493         if (!msix_is_masked(pdev, vector)) {
494             kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL,
495                                                s->msi_vectors[vector].virq);
496             /* TODO handle error */
497         }
498     } else {
499         /* it will be delayed until msix is enabled, in write_config */
500         IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n");
501     }
502 }
503 
504 static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
505 {
506     Error *local_err = NULL;
507     struct stat buf;
508     size_t size;
509 
510     if (s->ivshmem_bar2) {
511         error_setg(errp, "server sent unexpected shared memory message");
512         close(fd);
513         return;
514     }
515 
516     if (fstat(fd, &buf) < 0) {
517         error_setg_errno(errp, errno,
518             "can't determine size of shared memory sent by server");
519         close(fd);
520         return;
521     }
522 
523     size = buf.st_size;
524 
525     /* mmap the region and map into the BAR2 */
526     memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
527                                    "ivshmem.bar2", size, true, fd, &local_err);
528     if (local_err) {
529         error_propagate(errp, local_err);
530         return;
531     }
532 
533     s->ivshmem_bar2 = &s->server_bar2;
534 }
535 
536 static void process_msg_disconnect(IVShmemState *s, uint16_t posn,
537                                    Error **errp)
538 {
539     IVSHMEM_DPRINTF("posn %d has gone away\n", posn);
540     if (posn >= s->nb_peers || posn == s->vm_id) {
541         error_setg(errp, "invalid peer %d", posn);
542         return;
543     }
544     close_peer_eventfds(s, posn);
545 }
546 
547 static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd,
548                                 Error **errp)
549 {
550     Peer *peer = &s->peers[posn];
551     int vector;
552 
553     /*
554      * The N-th connect message for this peer comes with the file
555      * descriptor for vector N-1.  Count messages to find the vector.
556      */
557     if (peer->nb_eventfds >= s->vectors) {
558         error_setg(errp, "Too many eventfd received, device has %d vectors",
559                    s->vectors);
560         close(fd);
561         return;
562     }
563     vector = peer->nb_eventfds++;
564 
565     IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd);
566     event_notifier_init_fd(&peer->eventfds[vector], fd);
567     fcntl_setfl(fd, O_NONBLOCK); /* msix/irqfd poll non block */
568 
569     if (posn == s->vm_id) {
570         setup_interrupt(s, vector, errp);
571         /* TODO do we need to handle the error? */
572     }
573 
574     if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
575         ivshmem_add_eventfd(s, posn, vector);
576     }
577 }
578 
579 static void process_msg(IVShmemState *s, int64_t msg, int fd, Error **errp)
580 {
581     IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd);
582 
583     if (msg < -1 || msg > IVSHMEM_MAX_PEERS) {
584         error_setg(errp, "server sent invalid message %" PRId64, msg);
585         close(fd);
586         return;
587     }
588 
589     if (msg == -1) {
590         process_msg_shmem(s, fd, errp);
591         return;
592     }
593 
594     if (msg >= s->nb_peers) {
595         resize_peers(s, msg + 1);
596     }
597 
598     if (fd >= 0) {
599         process_msg_connect(s, msg, fd, errp);
600     } else {
601         process_msg_disconnect(s, msg, errp);
602     }
603 }
604 
605 static int ivshmem_can_receive(void *opaque)
606 {
607     IVShmemState *s = opaque;
608 
609     assert(s->msg_buffered_bytes < sizeof(s->msg_buf));
610     return sizeof(s->msg_buf) - s->msg_buffered_bytes;
611 }
612 
613 static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
614 {
615     IVShmemState *s = opaque;
616     Error *err = NULL;
617     int fd;
618     int64_t msg;
619 
620     assert(size >= 0 && s->msg_buffered_bytes + size <= sizeof(s->msg_buf));
621     memcpy((unsigned char *)&s->msg_buf + s->msg_buffered_bytes, buf, size);
622     s->msg_buffered_bytes += size;
623     if (s->msg_buffered_bytes < sizeof(s->msg_buf)) {
624         return;
625     }
626     msg = le64_to_cpu(s->msg_buf);
627     s->msg_buffered_bytes = 0;
628 
629     fd = qemu_chr_fe_get_msgfd(&s->server_chr);
630 
631     process_msg(s, msg, fd, &err);
632     if (err) {
633         error_report_err(err);
634     }
635 }
636 
637 static int64_t ivshmem_recv_msg(IVShmemState *s, int *pfd, Error **errp)
638 {
639     int64_t msg;
640     int n, ret;
641 
642     n = 0;
643     do {
644         ret = qemu_chr_fe_read_all(&s->server_chr, (uint8_t *)&msg + n,
645                                    sizeof(msg) - n);
646         if (ret < 0) {
647             if (ret == -EINTR) {
648                 continue;
649             }
650             error_setg_errno(errp, -ret, "read from server failed");
651             return INT64_MIN;
652         }
653         n += ret;
654     } while (n < sizeof(msg));
655 
656     *pfd = qemu_chr_fe_get_msgfd(&s->server_chr);
657     return le64_to_cpu(msg);
658 }
659 
660 static void ivshmem_recv_setup(IVShmemState *s, Error **errp)
661 {
662     Error *err = NULL;
663     int64_t msg;
664     int fd;
665 
666     msg = ivshmem_recv_msg(s, &fd, &err);
667     if (err) {
668         error_propagate(errp, err);
669         return;
670     }
671     if (msg != IVSHMEM_PROTOCOL_VERSION) {
672         error_setg(errp, "server sent version %" PRId64 ", expecting %d",
673                    msg, IVSHMEM_PROTOCOL_VERSION);
674         return;
675     }
676     if (fd != -1) {
677         error_setg(errp, "server sent invalid version message");
678         return;
679     }
680 
681     /*
682      * ivshmem-server sends the remaining initial messages in a fixed
683      * order, but the device has always accepted them in any order.
684      * Stay as compatible as practical, just in case people use
685      * servers that behave differently.
686      */
687 
688     /*
689      * ivshmem_device_spec.txt has always required the ID message
690      * right here, and ivshmem-server has always complied.  However,
691      * older versions of the device accepted it out of order, but
692      * broke when an interrupt setup message arrived before it.
693      */
694     msg = ivshmem_recv_msg(s, &fd, &err);
695     if (err) {
696         error_propagate(errp, err);
697         return;
698     }
699     if (fd != -1 || msg < 0 || msg > IVSHMEM_MAX_PEERS) {
700         error_setg(errp, "server sent invalid ID message");
701         return;
702     }
703     s->vm_id = msg;
704 
705     /*
706      * Receive more messages until we got shared memory.
707      */
708     do {
709         msg = ivshmem_recv_msg(s, &fd, &err);
710         if (err) {
711             error_propagate(errp, err);
712             return;
713         }
714         process_msg(s, msg, fd, &err);
715         if (err) {
716             error_propagate(errp, err);
717             return;
718         }
719     } while (msg != -1);
720 
721     /*
722      * This function must either map the shared memory or fail.  The
723      * loop above ensures that: it terminates normally only after it
724      * successfully processed the server's shared memory message.
725      * Assert that actually mapped the shared memory:
726      */
727     assert(s->ivshmem_bar2);
728 }
729 
730 /* Select the MSI-X vectors used by device.
731  * ivshmem maps events to vectors statically, so
732  * we just enable all vectors on init and after reset. */
733 static void ivshmem_msix_vector_use(IVShmemState *s)
734 {
735     PCIDevice *d = PCI_DEVICE(s);
736     int i;
737 
738     for (i = 0; i < s->vectors; i++) {
739         msix_vector_use(d, i);
740     }
741 }
742 
743 static void ivshmem_disable_irqfd(IVShmemState *s);
744 
745 static void ivshmem_reset(DeviceState *d)
746 {
747     IVShmemState *s = IVSHMEM_COMMON(d);
748 
749     ivshmem_disable_irqfd(s);
750 
751     s->intrstatus = 0;
752     s->intrmask = 0;
753     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
754         ivshmem_msix_vector_use(s);
755     }
756 }
757 
758 static int ivshmem_setup_interrupts(IVShmemState *s, Error **errp)
759 {
760     /* allocate QEMU callback data for receiving interrupts */
761     s->msi_vectors = g_malloc0(s->vectors * sizeof(MSIVector));
762 
763     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
764         if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1, errp)) {
765             return -1;
766         }
767 
768         IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
769         ivshmem_msix_vector_use(s);
770     }
771 
772     return 0;
773 }
774 
775 static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
776 {
777     IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);
778 
779     if (s->msi_vectors[vector].pdev == NULL) {
780         return;
781     }
782 
783     /* it was cleaned when masked in the frontend. */
784     kvm_irqchip_release_virq(kvm_state, s->msi_vectors[vector].virq);
785 
786     s->msi_vectors[vector].pdev = NULL;
787 }
788 
789 static void ivshmem_enable_irqfd(IVShmemState *s)
790 {
791     PCIDevice *pdev = PCI_DEVICE(s);
792     int i;
793 
794     for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
795         Error *err = NULL;
796 
797         ivshmem_add_kvm_msi_virq(s, i, &err);
798         if (err) {
799             error_report_err(err);
800             goto undo;
801         }
802     }
803 
804     if (msix_set_vector_notifiers(pdev,
805                                   ivshmem_vector_unmask,
806                                   ivshmem_vector_mask,
807                                   ivshmem_vector_poll)) {
808         error_report("ivshmem: msix_set_vector_notifiers failed");
809         goto undo;
810     }
811     return;
812 
813 undo:
814     while (--i >= 0) {
815         ivshmem_remove_kvm_msi_virq(s, i);
816     }
817 }
818 
819 static void ivshmem_disable_irqfd(IVShmemState *s)
820 {
821     PCIDevice *pdev = PCI_DEVICE(s);
822     int i;
823 
824     if (!pdev->msix_vector_use_notifier) {
825         return;
826     }
827 
828     msix_unset_vector_notifiers(pdev);
829 
830     for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
831         /*
832          * MSI-X is already disabled here so msix_unset_vector_notifiers()
833          * didn't call our release notifier.  Do it now to keep our masks and
834          * unmasks balanced.
835          */
836         if (s->msi_vectors[i].unmasked) {
837             ivshmem_vector_mask(pdev, i);
838         }
839         ivshmem_remove_kvm_msi_virq(s, i);
840     }
841 
842 }
843 
844 static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
845                                  uint32_t val, int len)
846 {
847     IVShmemState *s = IVSHMEM_COMMON(pdev);
848     int is_enabled, was_enabled = msix_enabled(pdev);
849 
850     pci_default_write_config(pdev, address, val, len);
851     is_enabled = msix_enabled(pdev);
852 
853     if (kvm_msi_via_irqfd_enabled()) {
854         if (!was_enabled && is_enabled) {
855             ivshmem_enable_irqfd(s);
856         } else if (was_enabled && !is_enabled) {
857             ivshmem_disable_irqfd(s);
858         }
859     }
860 }
861 
862 static void ivshmem_common_realize(PCIDevice *dev, Error **errp)
863 {
864     IVShmemState *s = IVSHMEM_COMMON(dev);
865     Error *err = NULL;
866     uint8_t *pci_conf;
867     Error *local_err = NULL;
868 
869     /* IRQFD requires MSI */
870     if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
871         !ivshmem_has_feature(s, IVSHMEM_MSI)) {
872         error_setg(errp, "ioeventfd/irqfd requires MSI");
873         return;
874     }
875 
876     pci_conf = dev->config;
877     pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
878 
879     memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
880                           "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);
881 
882     /* region for registers*/
883     pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
884                      &s->ivshmem_mmio);
885 
886     if (s->hostmem != NULL) {
887         IVSHMEM_DPRINTF("using hostmem\n");
888 
889         s->ivshmem_bar2 = host_memory_backend_get_memory(s->hostmem);
890         host_memory_backend_set_mapped(s->hostmem, true);
891     } else {
892         Chardev *chr = qemu_chr_fe_get_driver(&s->server_chr);
893         assert(chr);
894 
895         IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
896                         chr->filename);
897 
898         /* we allocate enough space for 16 peers and grow as needed */
899         resize_peers(s, 16);
900 
901         /*
902          * Receive setup messages from server synchronously.
903          * Older versions did it asynchronously, but that creates a
904          * number of entertaining race conditions.
905          */
906         ivshmem_recv_setup(s, &err);
907         if (err) {
908             error_propagate(errp, err);
909             return;
910         }
911 
912         if (s->master == ON_OFF_AUTO_ON && s->vm_id != 0) {
913             error_setg(errp,
914                        "master must connect to the server before any peers");
915             return;
916         }
917 
918         qemu_chr_fe_set_handlers(&s->server_chr, ivshmem_can_receive,
919                                  ivshmem_read, NULL, NULL, s, NULL, true);
920 
921         if (ivshmem_setup_interrupts(s, errp) < 0) {
922             error_prepend(errp, "Failed to initialize interrupts: ");
923             return;
924         }
925     }
926 
927     if (s->master == ON_OFF_AUTO_AUTO) {
928         s->master = s->vm_id == 0 ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
929     }
930 
931     if (!ivshmem_is_master(s)) {
932         error_setg(&s->migration_blocker,
933                    "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
934         migrate_add_blocker(s->migration_blocker, &local_err);
935         if (local_err) {
936             error_propagate(errp, local_err);
937             error_free(s->migration_blocker);
938             return;
939         }
940     }
941 
942     vmstate_register_ram(s->ivshmem_bar2, DEVICE(s));
943     pci_register_bar(PCI_DEVICE(s), 2,
944                      PCI_BASE_ADDRESS_SPACE_MEMORY |
945                      PCI_BASE_ADDRESS_MEM_PREFETCH |
946                      PCI_BASE_ADDRESS_MEM_TYPE_64,
947                      s->ivshmem_bar2);
948 }
949 
950 static void ivshmem_exit(PCIDevice *dev)
951 {
952     IVShmemState *s = IVSHMEM_COMMON(dev);
953     int i;
954 
955     if (s->migration_blocker) {
956         migrate_del_blocker(s->migration_blocker);
957         error_free(s->migration_blocker);
958     }
959 
960     if (memory_region_is_mapped(s->ivshmem_bar2)) {
961         if (!s->hostmem) {
962             void *addr = memory_region_get_ram_ptr(s->ivshmem_bar2);
963             int fd;
964 
965             if (munmap(addr, memory_region_size(s->ivshmem_bar2) == -1)) {
966                 error_report("Failed to munmap shared memory %s",
967                              strerror(errno));
968             }
969 
970             fd = memory_region_get_fd(s->ivshmem_bar2);
971             close(fd);
972         }
973 
974         vmstate_unregister_ram(s->ivshmem_bar2, DEVICE(dev));
975     }
976 
977     if (s->hostmem) {
978         host_memory_backend_set_mapped(s->hostmem, false);
979     }
980 
981     if (s->peers) {
982         for (i = 0; i < s->nb_peers; i++) {
983             close_peer_eventfds(s, i);
984         }
985         g_free(s->peers);
986     }
987 
988     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
989         msix_uninit_exclusive_bar(dev);
990     }
991 
992     g_free(s->msi_vectors);
993 }
994 
995 static int ivshmem_pre_load(void *opaque)
996 {
997     IVShmemState *s = opaque;
998 
999     if (!ivshmem_is_master(s)) {
1000         error_report("'peer' devices are not migratable");
1001         return -EINVAL;
1002     }
1003 
1004     return 0;
1005 }
1006 
1007 static int ivshmem_post_load(void *opaque, int version_id)
1008 {
1009     IVShmemState *s = opaque;
1010 
1011     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
1012         ivshmem_msix_vector_use(s);
1013     }
1014     return 0;
1015 }
1016 
1017 static void ivshmem_common_class_init(ObjectClass *klass, void *data)
1018 {
1019     DeviceClass *dc = DEVICE_CLASS(klass);
1020     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1021 
1022     k->realize = ivshmem_common_realize;
1023     k->exit = ivshmem_exit;
1024     k->config_write = ivshmem_write_config;
1025     k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
1026     k->device_id = PCI_DEVICE_ID_IVSHMEM;
1027     k->class_id = PCI_CLASS_MEMORY_RAM;
1028     k->revision = 1;
1029     dc->reset = ivshmem_reset;
1030     set_bit(DEVICE_CATEGORY_MISC, dc->categories);
1031     dc->desc = "Inter-VM shared memory";
1032 }
1033 
1034 static const TypeInfo ivshmem_common_info = {
1035     .name          = TYPE_IVSHMEM_COMMON,
1036     .parent        = TYPE_PCI_DEVICE,
1037     .instance_size = sizeof(IVShmemState),
1038     .abstract      = true,
1039     .class_init    = ivshmem_common_class_init,
1040     .interfaces = (InterfaceInfo[]) {
1041         { INTERFACE_CONVENTIONAL_PCI_DEVICE },
1042         { },
1043     },
1044 };
1045 
1046 static const VMStateDescription ivshmem_plain_vmsd = {
1047     .name = TYPE_IVSHMEM_PLAIN,
1048     .version_id = 0,
1049     .minimum_version_id = 0,
1050     .pre_load = ivshmem_pre_load,
1051     .post_load = ivshmem_post_load,
1052     .fields = (VMStateField[]) {
1053         VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
1054         VMSTATE_UINT32(intrstatus, IVShmemState),
1055         VMSTATE_UINT32(intrmask, IVShmemState),
1056         VMSTATE_END_OF_LIST()
1057     },
1058 };
1059 
1060 static Property ivshmem_plain_properties[] = {
1061     DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
1062     DEFINE_PROP_LINK("memdev", IVShmemState, hostmem, TYPE_MEMORY_BACKEND,
1063                      HostMemoryBackend *),
1064     DEFINE_PROP_END_OF_LIST(),
1065 };
1066 
1067 static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
1068 {
1069     IVShmemState *s = IVSHMEM_COMMON(dev);
1070 
1071     if (!s->hostmem) {
1072         error_setg(errp, "You must specify a 'memdev'");
1073         return;
1074     } else if (host_memory_backend_is_mapped(s->hostmem)) {
1075         char *path = object_get_canonical_path_component(OBJECT(s->hostmem));
1076         error_setg(errp, "can't use already busy memdev: %s", path);
1077         g_free(path);
1078         return;
1079     }
1080 
1081     ivshmem_common_realize(dev, errp);
1082 }
1083 
1084 static void ivshmem_plain_class_init(ObjectClass *klass, void *data)
1085 {
1086     DeviceClass *dc = DEVICE_CLASS(klass);
1087     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1088 
1089     k->realize = ivshmem_plain_realize;
1090     dc->props = ivshmem_plain_properties;
1091     dc->vmsd = &ivshmem_plain_vmsd;
1092 }
1093 
1094 static const TypeInfo ivshmem_plain_info = {
1095     .name          = TYPE_IVSHMEM_PLAIN,
1096     .parent        = TYPE_IVSHMEM_COMMON,
1097     .instance_size = sizeof(IVShmemState),
1098     .class_init    = ivshmem_plain_class_init,
1099 };
1100 
1101 static const VMStateDescription ivshmem_doorbell_vmsd = {
1102     .name = TYPE_IVSHMEM_DOORBELL,
1103     .version_id = 0,
1104     .minimum_version_id = 0,
1105     .pre_load = ivshmem_pre_load,
1106     .post_load = ivshmem_post_load,
1107     .fields = (VMStateField[]) {
1108         VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
1109         VMSTATE_MSIX(parent_obj, IVShmemState),
1110         VMSTATE_UINT32(intrstatus, IVShmemState),
1111         VMSTATE_UINT32(intrmask, IVShmemState),
1112         VMSTATE_END_OF_LIST()
1113     },
1114 };
1115 
1116 static Property ivshmem_doorbell_properties[] = {
1117     DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
1118     DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
1119     DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD,
1120                     true),
1121     DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
1122     DEFINE_PROP_END_OF_LIST(),
1123 };
1124 
1125 static void ivshmem_doorbell_init(Object *obj)
1126 {
1127     IVShmemState *s = IVSHMEM_DOORBELL(obj);
1128 
1129     s->features |= (1 << IVSHMEM_MSI);
1130 }
1131 
1132 static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
1133 {
1134     IVShmemState *s = IVSHMEM_COMMON(dev);
1135 
1136     if (!qemu_chr_fe_backend_connected(&s->server_chr)) {
1137         error_setg(errp, "You must specify a 'chardev'");
1138         return;
1139     }
1140 
1141     ivshmem_common_realize(dev, errp);
1142 }
1143 
1144 static void ivshmem_doorbell_class_init(ObjectClass *klass, void *data)
1145 {
1146     DeviceClass *dc = DEVICE_CLASS(klass);
1147     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1148 
1149     k->realize = ivshmem_doorbell_realize;
1150     dc->props = ivshmem_doorbell_properties;
1151     dc->vmsd = &ivshmem_doorbell_vmsd;
1152 }
1153 
1154 static const TypeInfo ivshmem_doorbell_info = {
1155     .name          = TYPE_IVSHMEM_DOORBELL,
1156     .parent        = TYPE_IVSHMEM_COMMON,
1157     .instance_size = sizeof(IVShmemState),
1158     .instance_init = ivshmem_doorbell_init,
1159     .class_init    = ivshmem_doorbell_class_init,
1160 };
1161 
1162 static void ivshmem_register_types(void)
1163 {
1164     type_register_static(&ivshmem_common_info);
1165     type_register_static(&ivshmem_plain_info);
1166     type_register_static(&ivshmem_doorbell_info);
1167 }
1168 
1169 type_init(ivshmem_register_types)
1170