xref: /openbmc/qemu/accel/kvm/kvm-all.c (revision 92229a57)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright IBM, Corp. 2008
5  *           Red Hat, Inc. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *  Glauber Costa     <gcosta@redhat.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  *
14  */
15 
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18 
19 #include <linux/kvm.h>
20 
21 #include "qemu-common.h"
22 #include "qemu/atomic.h"
23 #include "qemu/option.h"
24 #include "qemu/config-file.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "hw/hw.h"
28 #include "hw/pci/msi.h"
29 #include "hw/pci/msix.h"
30 #include "hw/s390x/adapter.h"
31 #include "exec/gdbstub.h"
32 #include "sysemu/kvm_int.h"
33 #include "sysemu/cpus.h"
34 #include "qemu/bswap.h"
35 #include "exec/memory.h"
36 #include "exec/ram_addr.h"
37 #include "exec/address-spaces.h"
38 #include "qemu/event_notifier.h"
39 #include "trace.h"
40 #include "hw/irq.h"
41 
42 #include "hw/boards.h"
43 
44 /* This check must be after config-host.h is included */
45 #ifdef CONFIG_EVENTFD
46 #include <sys/eventfd.h>
47 #endif
48 
49 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
50  * need to use the real host PAGE_SIZE, as that's what KVM will use.
51  */
52 #define PAGE_SIZE getpagesize()
53 
54 //#define DEBUG_KVM
55 
56 #ifdef DEBUG_KVM
57 #define DPRINTF(fmt, ...) \
58     do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
59 #else
60 #define DPRINTF(fmt, ...) \
61     do { } while (0)
62 #endif
63 
64 #define KVM_MSI_HASHTAB_SIZE    256
65 
66 struct KVMParkedVcpu {
67     unsigned long vcpu_id;
68     int kvm_fd;
69     QLIST_ENTRY(KVMParkedVcpu) node;
70 };
71 
72 struct KVMState
73 {
74     AccelState parent_obj;
75 
76     int nr_slots;
77     int fd;
78     int vmfd;
79     int coalesced_mmio;
80     struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
81     bool coalesced_flush_in_progress;
82     int broken_set_mem_region;
83     int vcpu_events;
84     int robust_singlestep;
85     int debugregs;
86 #ifdef KVM_CAP_SET_GUEST_DEBUG
87     struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
88 #endif
89     int many_ioeventfds;
90     int intx_set_mask;
91     /* The man page (and posix) say ioctl numbers are signed int, but
92      * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
93      * unsigned, and treating them as signed here can break things */
94     unsigned irq_set_ioctl;
95     unsigned int sigmask_len;
96     GHashTable *gsimap;
97 #ifdef KVM_CAP_IRQ_ROUTING
98     struct kvm_irq_routing *irq_routes;
99     int nr_allocated_irq_routes;
100     unsigned long *used_gsi_bitmap;
101     unsigned int gsi_count;
102     QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
103 #endif
104     KVMMemoryListener memory_listener;
105     QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
106 };
107 
108 KVMState *kvm_state;
109 bool kvm_kernel_irqchip;
110 bool kvm_split_irqchip;
111 bool kvm_async_interrupts_allowed;
112 bool kvm_halt_in_kernel_allowed;
113 bool kvm_eventfds_allowed;
114 bool kvm_irqfds_allowed;
115 bool kvm_resamplefds_allowed;
116 bool kvm_msi_via_irqfd_allowed;
117 bool kvm_gsi_routing_allowed;
118 bool kvm_gsi_direct_mapping;
119 bool kvm_allowed;
120 bool kvm_readonly_mem_allowed;
121 bool kvm_vm_attributes_allowed;
122 bool kvm_direct_msi_allowed;
123 bool kvm_ioeventfd_any_length_allowed;
124 bool kvm_msi_use_devid;
125 static bool kvm_immediate_exit;
126 
127 static const KVMCapabilityInfo kvm_required_capabilites[] = {
128     KVM_CAP_INFO(USER_MEMORY),
129     KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
130     KVM_CAP_LAST_INFO
131 };
132 
133 int kvm_get_max_memslots(void)
134 {
135     KVMState *s = KVM_STATE(current_machine->accelerator);
136 
137     return s->nr_slots;
138 }
139 
140 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
141 {
142     KVMState *s = kvm_state;
143     int i;
144 
145     for (i = 0; i < s->nr_slots; i++) {
146         if (kml->slots[i].memory_size == 0) {
147             return &kml->slots[i];
148         }
149     }
150 
151     return NULL;
152 }
153 
154 bool kvm_has_free_slot(MachineState *ms)
155 {
156     KVMState *s = KVM_STATE(ms->accelerator);
157 
158     return kvm_get_free_slot(&s->memory_listener);
159 }
160 
161 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
162 {
163     KVMSlot *slot = kvm_get_free_slot(kml);
164 
165     if (slot) {
166         return slot;
167     }
168 
169     fprintf(stderr, "%s: no free slot available\n", __func__);
170     abort();
171 }
172 
173 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
174                                          hwaddr start_addr,
175                                          hwaddr end_addr)
176 {
177     KVMState *s = kvm_state;
178     int i;
179 
180     for (i = 0; i < s->nr_slots; i++) {
181         KVMSlot *mem = &kml->slots[i];
182 
183         if (start_addr == mem->start_addr &&
184             end_addr == mem->start_addr + mem->memory_size) {
185             return mem;
186         }
187     }
188 
189     return NULL;
190 }
191 
192 /*
193  * Find overlapping slot with lowest start address
194  */
195 static KVMSlot *kvm_lookup_overlapping_slot(KVMMemoryListener *kml,
196                                             hwaddr start_addr,
197                                             hwaddr end_addr)
198 {
199     KVMState *s = kvm_state;
200     KVMSlot *found = NULL;
201     int i;
202 
203     for (i = 0; i < s->nr_slots; i++) {
204         KVMSlot *mem = &kml->slots[i];
205 
206         if (mem->memory_size == 0 ||
207             (found && found->start_addr < mem->start_addr)) {
208             continue;
209         }
210 
211         if (end_addr > mem->start_addr &&
212             start_addr < mem->start_addr + mem->memory_size) {
213             found = mem;
214         }
215     }
216 
217     return found;
218 }
219 
220 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
221                                        hwaddr *phys_addr)
222 {
223     KVMMemoryListener *kml = &s->memory_listener;
224     int i;
225 
226     for (i = 0; i < s->nr_slots; i++) {
227         KVMSlot *mem = &kml->slots[i];
228 
229         if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
230             *phys_addr = mem->start_addr + (ram - mem->ram);
231             return 1;
232         }
233     }
234 
235     return 0;
236 }
237 
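/*
 * Push one slot to the kernel with KVM_SET_USER_MEMORY_REGION.  The
 * address-space id is encoded in the upper 16 bits of the slot number.
 * When a slot carries KVM_MEM_READONLY it is first cleared with size 0,
 * as required since KVM commit 75d61fbc (see the comment below).
 */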
238 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
239 {
240     KVMState *s = kvm_state;
241     struct kvm_userspace_memory_region mem;
242 
243     mem.slot = slot->slot | (kml->as_id << 16);
244     mem.guest_phys_addr = slot->start_addr;
245     mem.userspace_addr = (unsigned long)slot->ram;
246     mem.flags = slot->flags;
247 
248     if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
249         /* Set the slot size to 0 before setting the slot to the desired
250          * value. This is needed based on KVM commit 75d61fbc. */
251         mem.memory_size = 0;
252         kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
253     }
254     mem.memory_size = slot->memory_size;
255     return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
256 }
257 
258 int kvm_destroy_vcpu(CPUState *cpu)
259 {
260     KVMState *s = kvm_state;
261     long mmap_size;
262     struct KVMParkedVcpu *vcpu = NULL;
263     int ret = 0;
264 
265     DPRINTF("kvm_destroy_vcpu\n");
266 
267     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
268     if (mmap_size < 0) {
269         ret = mmap_size;
270         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
271         goto err;
272     }
273 
274     ret = munmap(cpu->kvm_run, mmap_size);
275     if (ret < 0) {
276         goto err;
277     }
278 
279     vcpu = g_malloc0(sizeof(*vcpu));
280     vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
281     vcpu->kvm_fd = cpu->kvm_fd;
282     QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
283 err:
284     return ret;
285 }
286 
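/*
 * Reuse a previously parked vCPU fd for this vcpu_id if one exists,
 * otherwise ask the kernel for a new one with KVM_CREATE_VCPU.  Parked
 * fds come from kvm_destroy_vcpu() above, which keeps them around
 * because the kernel offers no way to destroy an individual vCPU.
 */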
287 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
288 {
289     struct KVMParkedVcpu *cpu;
290 
291     QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
292         if (cpu->vcpu_id == vcpu_id) {
293             int kvm_fd;
294 
295             QLIST_REMOVE(cpu, node);
296             kvm_fd = cpu->kvm_fd;
297             g_free(cpu);
298             return kvm_fd;
299         }
300     }
301 
302     return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
303 }
304 
305 int kvm_init_vcpu(CPUState *cpu)
306 {
307     KVMState *s = kvm_state;
308     long mmap_size;
309     int ret;
310 
311     DPRINTF("kvm_init_vcpu\n");
312 
313     ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
314     if (ret < 0) {
315         DPRINTF("kvm_create_vcpu failed\n");
316         goto err;
317     }
318 
319     cpu->kvm_fd = ret;
320     cpu->kvm_state = s;
321     cpu->kvm_vcpu_dirty = true;
322 
323     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
324     if (mmap_size < 0) {
325         ret = mmap_size;
326         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
327         goto err;
328     }
329 
330     cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
331                         cpu->kvm_fd, 0);
332     if (cpu->kvm_run == MAP_FAILED) {
333         ret = -errno;
334         DPRINTF("mmap'ing vcpu state failed\n");
335         goto err;
336     }
337 
338     if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
339         s->coalesced_mmio_ring =
340             (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
341     }
342 
343     ret = kvm_arch_init_vcpu(cpu);
344 err:
345     return ret;
346 }
347 
348 /*
349  * dirty pages logging control
350  */
351 
352 static int kvm_mem_flags(MemoryRegion *mr)
353 {
354     bool readonly = mr->readonly || memory_region_is_romd(mr);
355     int flags = 0;
356 
357     if (memory_region_get_dirty_log_mask(mr) != 0) {
358         flags |= KVM_MEM_LOG_DIRTY_PAGES;
359     }
360     if (readonly && kvm_readonly_mem_allowed) {
361         flags |= KVM_MEM_READONLY;
362     }
363     return flags;
364 }
365 
366 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
367                                  MemoryRegion *mr)
368 {
369     int old_flags;
370 
371     old_flags = mem->flags;
372     mem->flags = kvm_mem_flags(mr);
373 
374     /* If nothing changed effectively, no need to issue ioctl */
375     if (mem->flags == old_flags) {
376         return 0;
377     }
378 
379     return kvm_set_user_memory_region(kml, mem);
380 }
381 
382 static int kvm_section_update_flags(KVMMemoryListener *kml,
383                                     MemoryRegionSection *section)
384 {
385     hwaddr phys_addr = section->offset_within_address_space;
386     ram_addr_t size = int128_get64(section->size);
387     KVMSlot *mem = kvm_lookup_matching_slot(kml, phys_addr, phys_addr + size);
388 
389     if (mem == NULL)  {
390         return 0;
391     } else {
392         return kvm_slot_update_flags(kml, mem, section->mr);
393     }
394 }
395 
396 static void kvm_log_start(MemoryListener *listener,
397                           MemoryRegionSection *section,
398                           int old, int new)
399 {
400     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
401     int r;
402 
403     if (old != 0) {
404         return;
405     }
406 
407     r = kvm_section_update_flags(kml, section);
408     if (r < 0) {
409         abort();
410     }
411 }
412 
413 static void kvm_log_stop(MemoryListener *listener,
414                           MemoryRegionSection *section,
415                           int old, int new)
416 {
417     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
418     int r;
419 
420     if (new != 0) {
421         return;
422     }
423 
424     r = kvm_section_update_flags(kml, section);
425     if (r < 0) {
426         abort();
427     }
428 }
429 
430 /* get kvm's dirty pages bitmap and update qemu's */
431 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
432                                          unsigned long *bitmap)
433 {
434     ram_addr_t start = section->offset_within_region +
435                        memory_region_get_ram_addr(section->mr);
436     ram_addr_t pages = int128_get64(section->size) / getpagesize();
437 
438     cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
439     return 0;
440 }
441 
442 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
443 
444 /**
445  * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
446  * This function updates qemu's dirty bitmap using
447  * cpu_physical_memory_set_dirty_lebitmap(): every page the kernel
448  * reports as dirty for a slot is marked dirty in qemu's bitmap.
449  *
450  * @kml: the KVM memory listener
451  * @section: the memory region section whose dirty log is synced
452  */
453 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
454                                           MemoryRegionSection *section)
455 {
456     KVMState *s = kvm_state;
457     unsigned long size, allocated_size = 0;
458     struct kvm_dirty_log d = {};
459     KVMSlot *mem;
460     int ret = 0;
461     hwaddr start_addr = section->offset_within_address_space;
462     hwaddr end_addr = start_addr + int128_get64(section->size);
463 
464     d.dirty_bitmap = NULL;
465     while (start_addr < end_addr) {
466         mem = kvm_lookup_overlapping_slot(kml, start_addr, end_addr);
467         if (mem == NULL) {
468             break;
469         }
470 
471         /* XXX bad kernel interface alert
472          * For the dirty bitmap, the kernel allocates an array whose size
473          * is aligned to bits-per-long.  But when the kernel is 64-bit and
474          * userspace is 32-bit, userspace cannot align to the same
475          * bits-per-long, since sizeof(long) differs between kernel and
476          * user space.  Userspace would then provide a buffer that may be
477          * 4 bytes smaller than what the kernel uses, resulting in
478          * userspace memory corruption (which is not detectable by valgrind
479          * either, in most cases).
480          * So for now, align to 64 instead of HOST_LONG_BITS here, in the
481          * hope that sizeof(long) won't become >8 any time soon.
482          */
483         size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
484                      /*HOST_LONG_BITS*/ 64) / 8;
485         if (!d.dirty_bitmap) {
486             d.dirty_bitmap = g_malloc(size);
487         } else if (size > allocated_size) {
488             d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
489         }
490         allocated_size = size;
491         memset(d.dirty_bitmap, 0, allocated_size);
492 
493         d.slot = mem->slot | (kml->as_id << 16);
494         if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
495             DPRINTF("ioctl failed %d\n", errno);
496             ret = -1;
497             break;
498         }
499 
500         kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
501         start_addr = mem->start_addr + mem->memory_size;
502     }
503     g_free(d.dirty_bitmap);
504 
505     return ret;
506 }
507 
508 static void kvm_coalesce_mmio_region(MemoryListener *listener,
509                                      MemoryRegionSection *section,
510                                      hwaddr start, hwaddr size)
511 {
512     KVMState *s = kvm_state;
513 
514     if (s->coalesced_mmio) {
515         struct kvm_coalesced_mmio_zone zone;
516 
517         zone.addr = start;
518         zone.size = size;
519         zone.pad = 0;
520 
521         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
522     }
523 }
524 
525 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
526                                        MemoryRegionSection *section,
527                                        hwaddr start, hwaddr size)
528 {
529     KVMState *s = kvm_state;
530 
531     if (s->coalesced_mmio) {
532         struct kvm_coalesced_mmio_zone zone;
533 
534         zone.addr = start;
535         zone.size = size;
536         zone.pad = 0;
537 
538         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
539     }
540 }
541 
542 int kvm_check_extension(KVMState *s, unsigned int extension)
543 {
544     int ret;
545 
546     ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
547     if (ret < 0) {
548         ret = 0;
549     }
550 
551     return ret;
552 }
553 
554 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
555 {
556     int ret;
557 
558     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
559     if (ret < 0) {
560         /* VM wide version not implemented, use global one instead */
561         ret = kvm_check_extension(s, extension);
562     }
563 
564     return ret;
565 }
566 
567 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
568 {
569 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
570     /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
571      * endianness, but the memory core hands them in target endianness.
572      * For example, PPC is always treated as big-endian even if running
573      * on KVM and on PPC64LE.  Correct here.
574      */
575     switch (size) {
576     case 2:
577         val = bswap16(val);
578         break;
579     case 4:
580         val = bswap32(val);
581         break;
582     }
583 #endif
584     return val;
585 }
586 
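/*
 * Register (assign == true) or deregister an eventfd for an MMIO
 * address via KVM_IOEVENTFD, so a guest write of the (optionally
 * data-matched) value signals the fd inside the kernel instead of
 * exiting to userspace.
 */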
587 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
588                                   bool assign, uint32_t size, bool datamatch)
589 {
590     int ret;
591     struct kvm_ioeventfd iofd = {
592         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
593         .addr = addr,
594         .len = size,
595         .flags = 0,
596         .fd = fd,
597     };
598 
599     if (!kvm_enabled()) {
600         return -ENOSYS;
601     }
602 
603     if (datamatch) {
604         iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
605     }
606     if (!assign) {
607         iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
608     }
609 
610     ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
611 
612     if (ret < 0) {
613         return -errno;
614     }
615 
616     return 0;
617 }
618 
619 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
620                                  bool assign, uint32_t size, bool datamatch)
621 {
622     struct kvm_ioeventfd kick = {
623         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
624         .addr = addr,
625         .flags = KVM_IOEVENTFD_FLAG_PIO,
626         .len = size,
627         .fd = fd,
628     };
629     int r;
630     if (!kvm_enabled()) {
631         return -ENOSYS;
632     }
633     if (datamatch) {
634         kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
635     }
636     if (!assign) {
637         kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
638     }
639     r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
640     if (r < 0) {
641         return r;
642     }
643     return 0;
644 }
645 
646 
647 static int kvm_check_many_ioeventfds(void)
648 {
649     /* Userspace can use ioeventfd for io notification.  This requires a host
650      * that supports eventfd(2) and an I/O thread; since eventfd does not
651      * support SIGIO it cannot interrupt the vcpu.
652      *
653      * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
654      * can avoid creating too many ioeventfds.
655      */
656 #if defined(CONFIG_EVENTFD)
657     int ioeventfds[7];
658     int i, ret = 0;
659     for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
660         ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
661         if (ioeventfds[i] < 0) {
662             break;
663         }
664         ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
665         if (ret < 0) {
666             close(ioeventfds[i]);
667             break;
668         }
669     }
670 
671     /* Decide whether many devices are supported or not */
672     ret = i == ARRAY_SIZE(ioeventfds);
673 
674     while (i-- > 0) {
675         kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
676         close(ioeventfds[i]);
677     }
678     return ret;
679 #else
680     return 0;
681 #endif
682 }
683 
684 static const KVMCapabilityInfo *
685 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
686 {
687     while (list->name) {
688         if (!kvm_check_extension(s, list->value)) {
689             return list;
690         }
691         list++;
692     }
693     return NULL;
694 }
695 
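/*
 * Mirror a MemoryRegionSection into KVM memslots.  The range is first
 * aligned to host page boundaries and non-RAM regions are skipped
 * unless they can be mapped read-only.  Overlapping slots are torn
 * down (after syncing their dirty log), any prefix/suffix fragments
 * are re-registered, and finally the new slot itself is registered
 * when adding.
 */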
696 static void kvm_set_phys_mem(KVMMemoryListener *kml,
697                              MemoryRegionSection *section, bool add)
698 {
699     KVMState *s = kvm_state;
700     KVMSlot *mem, old;
701     int err;
702     MemoryRegion *mr = section->mr;
703     bool writeable = !mr->readonly && !mr->rom_device;
704     hwaddr start_addr = section->offset_within_address_space;
705     ram_addr_t size = int128_get64(section->size);
706     void *ram = NULL;
707     unsigned delta;
708 
709     /* kvm works in page size chunks, but the function may be called
710     /* kvm works in page-size chunks, but this function may be called
711        with a sub-page size and an unaligned start address. Round the
712        start address up and truncate the size down to page boundaries. */
713     delta &= ~qemu_real_host_page_mask;
714     if (delta > size) {
715         return;
716     }
717     start_addr += delta;
718     size -= delta;
719     size &= qemu_real_host_page_mask;
720     if (!size || (start_addr & ~qemu_real_host_page_mask)) {
721         return;
722     }
723 
724     if (!memory_region_is_ram(mr)) {
725         if (writeable || !kvm_readonly_mem_allowed) {
726             return;
727         } else if (!mr->romd_mode) {
728             /* If the memory device is not in romd_mode, then we actually want
729              * to remove the kvm memory slot so all accesses will trap. */
730             add = false;
731         }
732     }
733 
734     ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;
735 
736     while (1) {
737         mem = kvm_lookup_overlapping_slot(kml, start_addr, start_addr + size);
738         if (!mem) {
739             break;
740         }
741 
742         if (add && start_addr >= mem->start_addr &&
743             (start_addr + size <= mem->start_addr + mem->memory_size) &&
744             (ram - start_addr == mem->ram - mem->start_addr)) {
745             /* The new slot fits into the existing one and comes with
746              * identical parameters - update flags and done. */
747             kvm_slot_update_flags(kml, mem, mr);
748             return;
749         }
750 
751         old = *mem;
752 
753         if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
754             kvm_physical_sync_dirty_bitmap(kml, section);
755         }
756 
757         /* unregister the overlapping slot */
758         mem->memory_size = 0;
759         err = kvm_set_user_memory_region(kml, mem);
760         if (err) {
761             fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
762                     __func__, strerror(-err));
763             abort();
764         }
765 
766         /* Workaround for older KVM versions: we can't join slots, not even by
767          * unregistering the previous ones and then registering the larger
768          * slot. We have to maintain the existing fragmentation. Sigh.
769          *
770          * This workaround assumes that the new slot starts at the same
771          * address as the first existing one. If not or if some overlapping
772          * slot comes around later, we will fail (not seen in practice so far)
773          * - and actually require a recent KVM version. */
774         if (s->broken_set_mem_region &&
775             old.start_addr == start_addr && old.memory_size < size && add) {
776             mem = kvm_alloc_slot(kml);
777             mem->memory_size = old.memory_size;
778             mem->start_addr = old.start_addr;
779             mem->ram = old.ram;
780             mem->flags = kvm_mem_flags(mr);
781 
782             err = kvm_set_user_memory_region(kml, mem);
783             if (err) {
784                 fprintf(stderr, "%s: error updating slot: %s\n", __func__,
785                         strerror(-err));
786                 abort();
787             }
788 
789             start_addr += old.memory_size;
790             ram += old.memory_size;
791             size -= old.memory_size;
792             continue;
793         }
794 
795         /* register prefix slot */
796         if (old.start_addr < start_addr) {
797             mem = kvm_alloc_slot(kml);
798             mem->memory_size = start_addr - old.start_addr;
799             mem->start_addr = old.start_addr;
800             mem->ram = old.ram;
801             mem->flags = kvm_mem_flags(mr);
802 
803             err = kvm_set_user_memory_region(kml, mem);
804             if (err) {
805                 fprintf(stderr, "%s: error registering prefix slot: %s\n",
806                         __func__, strerror(-err));
807 #ifdef TARGET_PPC
808                 fprintf(stderr, "%s: This is probably because your kernel's " \
809                                 "PAGE_SIZE is too big. Please try to use 4k " \
810                                 "PAGE_SIZE!\n", __func__);
811 #endif
812                 abort();
813             }
814         }
815 
816         /* register suffix slot */
817         if (old.start_addr + old.memory_size > start_addr + size) {
818             ram_addr_t size_delta;
819 
820             mem = kvm_alloc_slot(kml);
821             mem->start_addr = start_addr + size;
822             size_delta = mem->start_addr - old.start_addr;
823             mem->memory_size = old.memory_size - size_delta;
824             mem->ram = old.ram + size_delta;
825             mem->flags = kvm_mem_flags(mr);
826 
827             err = kvm_set_user_memory_region(kml, mem);
828             if (err) {
829                 fprintf(stderr, "%s: error registering suffix slot: %s\n",
830                         __func__, strerror(-err));
831                 abort();
832             }
833         }
834     }
835 
836     /* in case the KVM bug workaround already "consumed" the new slot */
837     if (!size) {
838         return;
839     }
840     if (!add) {
841         return;
842     }
843     mem = kvm_alloc_slot(kml);
844     mem->memory_size = size;
845     mem->start_addr = start_addr;
846     mem->ram = ram;
847     mem->flags = kvm_mem_flags(mr);
848 
849     err = kvm_set_user_memory_region(kml, mem);
850     if (err) {
851         fprintf(stderr, "%s: error registering slot: %s\n", __func__,
852                 strerror(-err));
853         abort();
854     }
855 }
856 
857 static void kvm_region_add(MemoryListener *listener,
858                            MemoryRegionSection *section)
859 {
860     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
861 
862     memory_region_ref(section->mr);
863     kvm_set_phys_mem(kml, section, true);
864 }
865 
866 static void kvm_region_del(MemoryListener *listener,
867                            MemoryRegionSection *section)
868 {
869     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
870 
871     kvm_set_phys_mem(kml, section, false);
872     memory_region_unref(section->mr);
873 }
874 
875 static void kvm_log_sync(MemoryListener *listener,
876                          MemoryRegionSection *section)
877 {
878     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
879     int r;
880 
881     r = kvm_physical_sync_dirty_bitmap(kml, section);
882     if (r < 0) {
883         abort();
884     }
885 }
886 
887 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
888                                   MemoryRegionSection *section,
889                                   bool match_data, uint64_t data,
890                                   EventNotifier *e)
891 {
892     int fd = event_notifier_get_fd(e);
893     int r;
894 
895     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
896                                data, true, int128_get64(section->size),
897                                match_data);
898     if (r < 0) {
899         fprintf(stderr, "%s: error adding ioeventfd: %s\n",
900                 __func__, strerror(-r));
901         abort();
902     }
903 }
904 
905 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
906                                   MemoryRegionSection *section,
907                                   bool match_data, uint64_t data,
908                                   EventNotifier *e)
909 {
910     int fd = event_notifier_get_fd(e);
911     int r;
912 
913     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
914                                data, false, int128_get64(section->size),
915                                match_data);
916     if (r < 0) {
917         abort();
918     }
919 }
920 
921 static void kvm_io_ioeventfd_add(MemoryListener *listener,
922                                  MemoryRegionSection *section,
923                                  bool match_data, uint64_t data,
924                                  EventNotifier *e)
925 {
926     int fd = event_notifier_get_fd(e);
927     int r;
928 
929     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
930                               data, true, int128_get64(section->size),
931                               match_data);
932     if (r < 0) {
933         fprintf(stderr, "%s: error adding ioeventfd: %s\n",
934                 __func__, strerror(-r));
935         abort();
936     }
937 }
938 
939 static void kvm_io_ioeventfd_del(MemoryListener *listener,
940                                  MemoryRegionSection *section,
941                                  bool match_data, uint64_t data,
942                                  EventNotifier *e)
943 
944 {
945     int fd = event_notifier_get_fd(e);
946     int r;
947 
948     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
949                               data, false, int128_get64(section->size),
950                               match_data);
951     if (r < 0) {
952         abort();
953     }
954 }
955 
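/*
 * Allocate the per-address-space slot array and hook this listener into
 * the memory core, so region_add/region_del and dirty-log events for
 * @as are propagated into KVM memslots.
 */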
956 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
957                                   AddressSpace *as, int as_id)
958 {
959     int i;
960 
961     kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
962     kml->as_id = as_id;
963 
964     for (i = 0; i < s->nr_slots; i++) {
965         kml->slots[i].slot = i;
966     }
967 
968     kml->listener.region_add = kvm_region_add;
969     kml->listener.region_del = kvm_region_del;
970     kml->listener.log_start = kvm_log_start;
971     kml->listener.log_stop = kvm_log_stop;
972     kml->listener.log_sync = kvm_log_sync;
973     kml->listener.priority = 10;
974 
975     memory_listener_register(&kml->listener, as);
976 }
977 
978 static MemoryListener kvm_io_listener = {
979     .eventfd_add = kvm_io_ioeventfd_add,
980     .eventfd_del = kvm_io_ioeventfd_del,
981     .priority = 10,
982 };
983 
984 static void kvm_handle_interrupt(CPUState *cpu, int mask)
985 {
986     cpu->interrupt_request |= mask;
987 
988     if (!qemu_cpu_is_self(cpu)) {
989         qemu_cpu_kick(cpu);
990     }
991 }
992 
993 int kvm_set_irq(KVMState *s, int irq, int level)
994 {
995     struct kvm_irq_level event;
996     int ret;
997 
998     assert(kvm_async_interrupts_enabled());
999 
1000     event.level = level;
1001     event.irq = irq;
1002     ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1003     if (ret < 0) {
1004         perror("kvm_set_irq");
1005         abort();
1006     }
1007 
1008     return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1009 }
1010 
1011 #ifdef KVM_CAP_IRQ_ROUTING
1012 typedef struct KVMMSIRoute {
1013     struct kvm_irq_routing_entry kroute;
1014     QTAILQ_ENTRY(KVMMSIRoute) entry;
1015 } KVMMSIRoute;
1016 
1017 static void set_gsi(KVMState *s, unsigned int gsi)
1018 {
1019     set_bit(gsi, s->used_gsi_bitmap);
1020 }
1021 
1022 static void clear_gsi(KVMState *s, unsigned int gsi)
1023 {
1024     clear_bit(gsi, s->used_gsi_bitmap);
1025 }
1026 
1027 void kvm_init_irq_routing(KVMState *s)
1028 {
1029     int gsi_count, i;
1030 
1031     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1032     if (gsi_count > 0) {
1033         /* Round up so we can search ints using ffs */
1034         s->used_gsi_bitmap = bitmap_new(gsi_count);
1035         s->gsi_count = gsi_count;
1036     }
1037 
1038     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1039     s->nr_allocated_irq_routes = 0;
1040 
1041     if (!kvm_direct_msi_allowed) {
1042         for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
1043             QTAILQ_INIT(&s->msi_hashtab[i]);
1044         }
1045     }
1046 
1047     kvm_arch_init_irq_routing(s);
1048 }
1049 
1050 void kvm_irqchip_commit_routes(KVMState *s)
1051 {
1052     int ret;
1053 
1054     if (kvm_gsi_direct_mapping()) {
1055         return;
1056     }
1057 
1058     if (!kvm_gsi_routing_enabled()) {
1059         return;
1060     }
1061 
1062     s->irq_routes->flags = 0;
1063     trace_kvm_irqchip_commit_routes();
1064     ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1065     assert(ret == 0);
1066 }
1067 
1068 static void kvm_add_routing_entry(KVMState *s,
1069                                   struct kvm_irq_routing_entry *entry)
1070 {
1071     struct kvm_irq_routing_entry *new;
1072     int n, size;
1073 
1074     if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1075         n = s->nr_allocated_irq_routes * 2;
1076         if (n < 64) {
1077             n = 64;
1078         }
1079         size = sizeof(struct kvm_irq_routing);
1080         size += n * sizeof(*new);
1081         s->irq_routes = g_realloc(s->irq_routes, size);
1082         s->nr_allocated_irq_routes = n;
1083     }
1084     n = s->irq_routes->nr++;
1085     new = &s->irq_routes->entries[n];
1086 
1087     *new = *entry;
1088 
1089     set_gsi(s, entry->gsi);
1090 }
1091 
1092 static int kvm_update_routing_entry(KVMState *s,
1093                                     struct kvm_irq_routing_entry *new_entry)
1094 {
1095     struct kvm_irq_routing_entry *entry;
1096     int n;
1097 
1098     for (n = 0; n < s->irq_routes->nr; n++) {
1099         entry = &s->irq_routes->entries[n];
1100         if (entry->gsi != new_entry->gsi) {
1101             continue;
1102         }
1103 
1104         if (!memcmp(entry, new_entry, sizeof(*entry))) {
1105             return 0;
1106         }
1107 
1108         *entry = *new_entry;
1109 
1110         return 0;
1111     }
1112 
1113     return -ESRCH;
1114 }
1115 
1116 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1117 {
1118     struct kvm_irq_routing_entry e = {};
1119 
1120     assert(pin < s->gsi_count);
1121 
1122     e.gsi = irq;
1123     e.type = KVM_IRQ_ROUTING_IRQCHIP;
1124     e.flags = 0;
1125     e.u.irqchip.irqchip = irqchip;
1126     e.u.irqchip.pin = pin;
1127     kvm_add_routing_entry(s, &e);
1128 }
1129 
1130 void kvm_irqchip_release_virq(KVMState *s, int virq)
1131 {
1132     struct kvm_irq_routing_entry *e;
1133     int i;
1134 
1135     if (kvm_gsi_direct_mapping()) {
1136         return;
1137     }
1138 
1139     for (i = 0; i < s->irq_routes->nr; i++) {
1140         e = &s->irq_routes->entries[i];
1141         if (e->gsi == virq) {
1142             s->irq_routes->nr--;
1143             *e = s->irq_routes->entries[s->irq_routes->nr];
1144         }
1145     }
1146     clear_gsi(s, virq);
1147     kvm_arch_release_virq_post(virq);
1148     trace_kvm_irqchip_release_virq(virq);
1149 }
1150 
1151 static unsigned int kvm_hash_msi(uint32_t data)
1152 {
1153     /* This is optimized for IA32 MSI layout. However, no other arch shall
1154      * repeat the mistake of not providing a direct MSI injection API. */
1155     return data & 0xff;
1156 }
1157 
1158 static void kvm_flush_dynamic_msi_routes(KVMState *s)
1159 {
1160     KVMMSIRoute *route, *next;
1161     unsigned int hash;
1162 
1163     for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1164         QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1165             kvm_irqchip_release_virq(s, route->kroute.gsi);
1166             QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1167             g_free(route);
1168         }
1169     }
1170 }
1171 
1172 static int kvm_irqchip_get_virq(KVMState *s)
1173 {
1174     int next_virq;
1175 
1176     /*
1177      * PIC and IOAPIC share the first 16 GSI numbers, so there are more
1178      * available GSI numbers than IRQ route entries. Allocating a GSI
1179      * number can succeed even though a new route entry cannot be added.
1180      * When this happens, flush dynamic MSI entries to free IRQ route entries.
1181      */
1182     if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1183         kvm_flush_dynamic_msi_routes(s);
1184     }
1185 
1186     /* Return the lowest unused GSI in the bitmap */
1187     next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
1188     if (next_virq >= s->gsi_count) {
1189         return -ENOSPC;
1190     } else {
1191         return next_virq;
1192     }
1193 }
1194 
1195 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1196 {
1197     unsigned int hash = kvm_hash_msi(msg.data);
1198     KVMMSIRoute *route;
1199 
1200     QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1201         if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1202             route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1203             route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1204             return route;
1205         }
1206     }
1207     return NULL;
1208 }
1209 
1210 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1211 {
1212     struct kvm_msi msi;
1213     KVMMSIRoute *route;
1214 
1215     if (kvm_direct_msi_allowed) {
1216         msi.address_lo = (uint32_t)msg.address;
1217         msi.address_hi = msg.address >> 32;
1218         msi.data = le32_to_cpu(msg.data);
1219         msi.flags = 0;
1220         memset(msi.pad, 0, sizeof(msi.pad));
1221 
1222         return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1223     }
1224 
1225     route = kvm_lookup_msi_route(s, msg);
1226     if (!route) {
1227         int virq;
1228 
1229         virq = kvm_irqchip_get_virq(s);
1230         if (virq < 0) {
1231             return virq;
1232         }
1233 
1234         route = g_malloc0(sizeof(KVMMSIRoute));
1235         route->kroute.gsi = virq;
1236         route->kroute.type = KVM_IRQ_ROUTING_MSI;
1237         route->kroute.flags = 0;
1238         route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1239         route->kroute.u.msi.address_hi = msg.address >> 32;
1240         route->kroute.u.msi.data = le32_to_cpu(msg.data);
1241 
1242         kvm_add_routing_entry(s, &route->kroute);
1243         kvm_irqchip_commit_routes(s);
1244 
1245         QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1246                            entry);
1247     }
1248 
1249     assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1250 
1251     return kvm_set_irq(s, route->kroute.gsi, 1);
1252 }
1253 
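/*
 * Allocate a virtual IRQ (GSI), build an MSI routing entry from the
 * device's MSI message and commit it to the kernel.  The returned virq
 * can then be bound to an eventfd, e.g. (sketch, assuming a notifier
 * that is already initialised):
 *
 *     virq = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
 *     if (virq >= 0) {
 *         kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &notifier,
 *                                            NULL, virq);
 *     }
 */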
1254 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1255 {
1256     struct kvm_irq_routing_entry kroute = {};
1257     int virq;
1258     MSIMessage msg = {0, 0};
1259 
1260     if (dev) {
1261         msg = pci_get_msi_message(dev, vector);
1262     }
1263 
1264     if (kvm_gsi_direct_mapping()) {
1265         return kvm_arch_msi_data_to_gsi(msg.data);
1266     }
1267 
1268     if (!kvm_gsi_routing_enabled()) {
1269         return -ENOSYS;
1270     }
1271 
1272     virq = kvm_irqchip_get_virq(s);
1273     if (virq < 0) {
1274         return virq;
1275     }
1276 
1277     kroute.gsi = virq;
1278     kroute.type = KVM_IRQ_ROUTING_MSI;
1279     kroute.flags = 0;
1280     kroute.u.msi.address_lo = (uint32_t)msg.address;
1281     kroute.u.msi.address_hi = msg.address >> 32;
1282     kroute.u.msi.data = le32_to_cpu(msg.data);
1283     if (kvm_msi_devid_required()) {
1284         kroute.flags = KVM_MSI_VALID_DEVID;
1285         kroute.u.msi.devid = pci_requester_id(dev);
1286     }
1287     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1288         kvm_irqchip_release_virq(s, virq);
1289         return -EINVAL;
1290     }
1291 
1292     trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
1293                                     vector, virq);
1294 
1295     kvm_add_routing_entry(s, &kroute);
1296     kvm_arch_add_msi_route_post(&kroute, vector, dev);
1297     kvm_irqchip_commit_routes(s);
1298 
1299     return virq;
1300 }
1301 
1302 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
1303                                  PCIDevice *dev)
1304 {
1305     struct kvm_irq_routing_entry kroute = {};
1306 
1307     if (kvm_gsi_direct_mapping()) {
1308         return 0;
1309     }
1310 
1311     if (!kvm_irqchip_in_kernel()) {
1312         return -ENOSYS;
1313     }
1314 
1315     kroute.gsi = virq;
1316     kroute.type = KVM_IRQ_ROUTING_MSI;
1317     kroute.flags = 0;
1318     kroute.u.msi.address_lo = (uint32_t)msg.address;
1319     kroute.u.msi.address_hi = msg.address >> 32;
1320     kroute.u.msi.data = le32_to_cpu(msg.data);
1321     if (kvm_msi_devid_required()) {
1322         kroute.flags = KVM_MSI_VALID_DEVID;
1323         kroute.u.msi.devid = pci_requester_id(dev);
1324     }
1325     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1326         return -EINVAL;
1327     }
1328 
1329     trace_kvm_irqchip_update_msi_route(virq);
1330 
1331     return kvm_update_routing_entry(s, &kroute);
1332 }
1333 
1334 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
1335                                     bool assign)
1336 {
1337     struct kvm_irqfd irqfd = {
1338         .fd = fd,
1339         .gsi = virq,
1340         .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
1341     };
1342 
1343     if (rfd != -1) {
1344         irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
1345         irqfd.resamplefd = rfd;
1346     }
1347 
1348     if (!kvm_irqfds_enabled()) {
1349         return -ENOSYS;
1350     }
1351 
1352     return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
1353 }
1354 
1355 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1356 {
1357     struct kvm_irq_routing_entry kroute = {};
1358     int virq;
1359 
1360     if (!kvm_gsi_routing_enabled()) {
1361         return -ENOSYS;
1362     }
1363 
1364     virq = kvm_irqchip_get_virq(s);
1365     if (virq < 0) {
1366         return virq;
1367     }
1368 
1369     kroute.gsi = virq;
1370     kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
1371     kroute.flags = 0;
1372     kroute.u.adapter.summary_addr = adapter->summary_addr;
1373     kroute.u.adapter.ind_addr = adapter->ind_addr;
1374     kroute.u.adapter.summary_offset = adapter->summary_offset;
1375     kroute.u.adapter.ind_offset = adapter->ind_offset;
1376     kroute.u.adapter.adapter_id = adapter->adapter_id;
1377 
1378     kvm_add_routing_entry(s, &kroute);
1379 
1380     return virq;
1381 }
1382 
1383 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1384 {
1385     struct kvm_irq_routing_entry kroute = {};
1386     int virq;
1387 
1388     if (!kvm_gsi_routing_enabled()) {
1389         return -ENOSYS;
1390     }
1391     if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
1392         return -ENOSYS;
1393     }
1394     virq = kvm_irqchip_get_virq(s);
1395     if (virq < 0) {
1396         return virq;
1397     }
1398 
1399     kroute.gsi = virq;
1400     kroute.type = KVM_IRQ_ROUTING_HV_SINT;
1401     kroute.flags = 0;
1402     kroute.u.hv_sint.vcpu = vcpu;
1403     kroute.u.hv_sint.sint = sint;
1404 
1405     kvm_add_routing_entry(s, &kroute);
1406     kvm_irqchip_commit_routes(s);
1407 
1408     return virq;
1409 }
1410 
1411 #else /* !KVM_CAP_IRQ_ROUTING */
1412 
1413 void kvm_init_irq_routing(KVMState *s)
1414 {
1415 }
1416 
1417 void kvm_irqchip_release_virq(KVMState *s, int virq)
1418 {
1419 }
1420 
1421 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1422 {
1423     abort();
1424 }
1425 
1426 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1427 {
1428     return -ENOSYS;
1429 }
1430 
1431 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1432 {
1433     return -ENOSYS;
1434 }
1435 
1436 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1437 {
1438     return -ENOSYS;
1439 }
1440 
1441 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, bool assign)
1442 {
1443     abort();
1444 }
1445 
1446 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, PCIDevice *dev)
1447 {
1448     return -ENOSYS;
1449 }
1450 #endif /* !KVM_CAP_IRQ_ROUTING */
1451 
1452 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1453                                        EventNotifier *rn, int virq)
1454 {
1455     return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
1456            rn ? event_notifier_get_fd(rn) : -1, virq, true);
1457 }
1458 
1459 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1460                                           int virq)
1461 {
1462     return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
1463            false);
1464 }
1465 
1466 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
1467                                    EventNotifier *rn, qemu_irq irq)
1468 {
1469     gpointer key, gsi;
1470     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1471 
1472     if (!found) {
1473         return -ENXIO;
1474     }
1475     return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
1476 }
1477 
1478 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
1479                                       qemu_irq irq)
1480 {
1481     gpointer key, gsi;
1482     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1483 
1484     if (!found) {
1485         return -ENXIO;
1486     }
1487     return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
1488 }
1489 
1490 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
1491 {
1492     g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
1493 }
1494 
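/*
 * Create the in-kernel irqchip if the kernel provides one (KVM_CAP_IRQCHIP,
 * or KVM_CAP_S390_IRQCHIP enabled as a capability), then initialise GSI
 * routing and the qemu_irq -> GSI hash table.
 */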
1495 static void kvm_irqchip_create(MachineState *machine, KVMState *s)
1496 {
1497     int ret;
1498 
1499     if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
1500         ;
1501     } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
1502         ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
1503         if (ret < 0) {
1504             fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
1505             exit(1);
1506         }
1507     } else {
1508         return;
1509     }
1510 
1511     /* First probe and see if there's an arch-specific hook to create the
1512      * in-kernel irqchip for us */
1513     ret = kvm_arch_irqchip_create(machine, s);
1514     if (ret == 0) {
1515         if (machine_kernel_irqchip_split(machine)) {
1516             perror("Split IRQ chip mode not supported.");
1517             exit(1);
1518         } else {
1519             ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
1520         }
1521     }
1522     if (ret < 0) {
1523         fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
1524         exit(1);
1525     }
1526 
1527     kvm_kernel_irqchip = true;
1528     /* If we have an in-kernel IRQ chip then we must have asynchronous
1529      * interrupt delivery (though the reverse is not necessarily true)
1530      */
1531     kvm_async_interrupts_allowed = true;
1532     kvm_halt_in_kernel_allowed = true;
1533 
1534     kvm_init_irq_routing(s);
1535 
1536     s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
1537 }
1538 
1539 /* Find number of supported CPUs using the recommended
1540  * procedure from the kernel API documentation to cope with
1541  * older kernels that may be missing capabilities.
1542  */
1543 static int kvm_recommended_vcpus(KVMState *s)
1544 {
1545     int ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
1546     return (ret) ? ret : 4;
1547 }
1548 
1549 static int kvm_max_vcpus(KVMState *s)
1550 {
1551     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
1552     return (ret) ? ret : kvm_recommended_vcpus(s);
1553 }
1554 
1555 static int kvm_max_vcpu_id(KVMState *s)
1556 {
1557     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
1558     return (ret) ? ret : kvm_max_vcpus(s);
1559 }
1560 
1561 bool kvm_vcpu_id_is_valid(int vcpu_id)
1562 {
1563     KVMState *s = KVM_STATE(current_machine->accelerator);
1564     return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
1565 }
1566 
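/*
 * Accelerator init: open /dev/kvm, check KVM_API_VERSION, create the VM,
 * verify the required capabilities, probe the optional ones into the
 * global kvm_*_allowed flags, run arch-specific setup, create the
 * irqchip and register the memory listeners.
 */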
1567 static int kvm_init(MachineState *ms)
1568 {
1569     MachineClass *mc = MACHINE_GET_CLASS(ms);
1570     static const char upgrade_note[] =
1571         "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1572         "(see http://sourceforge.net/projects/kvm).\n";
1573     struct {
1574         const char *name;
1575         int num;
1576     } num_cpus[] = {
1577         { "SMP",          smp_cpus },
1578         { "hotpluggable", max_cpus },
1579         { NULL, }
1580     }, *nc = num_cpus;
1581     int soft_vcpus_limit, hard_vcpus_limit;
1582     KVMState *s;
1583     const KVMCapabilityInfo *missing_cap;
1584     int ret;
1585     int type = 0;
1586     const char *kvm_type;
1587 
1588     s = KVM_STATE(ms->accelerator);
1589 
1590     /*
1591      * On systems where the kernel can support different base page
1592      * sizes, host page size may be different from TARGET_PAGE_SIZE,
1593      * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
1594      * page size for the system though.
1595      */
1596     assert(TARGET_PAGE_SIZE <= getpagesize());
1597 
1598     s->sigmask_len = 8;
1599 
1600 #ifdef KVM_CAP_SET_GUEST_DEBUG
1601     QTAILQ_INIT(&s->kvm_sw_breakpoints);
1602 #endif
1603     QLIST_INIT(&s->kvm_parked_vcpus);
1604     s->vmfd = -1;
1605     s->fd = qemu_open("/dev/kvm", O_RDWR);
1606     if (s->fd == -1) {
1607         fprintf(stderr, "Could not access KVM kernel module: %m\n");
1608         ret = -errno;
1609         goto err;
1610     }
1611 
1612     ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
1613     if (ret < KVM_API_VERSION) {
1614         if (ret >= 0) {
1615             ret = -EINVAL;
1616         }
1617         fprintf(stderr, "kvm version too old\n");
1618         goto err;
1619     }
1620 
1621     if (ret > KVM_API_VERSION) {
1622         ret = -EINVAL;
1623         fprintf(stderr, "kvm version not supported\n");
1624         goto err;
1625     }
1626 
1627     kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
1628     s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
1629 
1630     /* If unspecified, use the default value */
1631     if (!s->nr_slots) {
1632         s->nr_slots = 32;
1633     }
1634 
1635     /* check the vcpu limits */
1636     soft_vcpus_limit = kvm_recommended_vcpus(s);
1637     hard_vcpus_limit = kvm_max_vcpus(s);
1638 
1639     while (nc->name) {
1640         if (nc->num > soft_vcpus_limit) {
1641             fprintf(stderr,
1642                     "Warning: Number of %s cpus requested (%d) exceeds "
1643                     "the recommended cpus supported by KVM (%d)\n",
1644                     nc->name, nc->num, soft_vcpus_limit);
1645 
1646             if (nc->num > hard_vcpus_limit) {
1647                 fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
1648                         "the maximum cpus supported by KVM (%d)\n",
1649                         nc->name, nc->num, hard_vcpus_limit);
1650                 exit(1);
1651             }
1652         }
1653         nc++;
1654     }
1655 
1656     kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
1657     if (mc->kvm_type) {
1658         type = mc->kvm_type(kvm_type);
1659     } else if (kvm_type) {
1660         ret = -EINVAL;
1661         fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
1662         goto err;
1663     }
1664 
1665     do {
1666         ret = kvm_ioctl(s, KVM_CREATE_VM, type);
1667     } while (ret == -EINTR);
1668 
1669     if (ret < 0) {
1670         fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
1671                 strerror(-ret));
1672 
1673 #ifdef TARGET_S390X
1674         if (ret == -EINVAL) {
1675             fprintf(stderr,
1676                     "Host kernel setup problem detected. Please verify:\n");
1677             fprintf(stderr, "- for kernels supporting the switch_amode or"
1678                     " user_mode parameters, whether\n");
1679             fprintf(stderr,
1680                     "  user space is running in primary address space\n");
1681             fprintf(stderr,
1682                     "- for kernels supporting the vm.allocate_pgste sysctl, "
1683                     "whether it is enabled\n");
1684         }
1685 #endif
1686         goto err;
1687     }
1688 
1689     s->vmfd = ret;
1690     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
1691     if (!missing_cap) {
1692         missing_cap =
1693             kvm_check_extension_list(s, kvm_arch_required_capabilities);
1694     }
1695     if (missing_cap) {
1696         ret = -EINVAL;
1697         fprintf(stderr, "kvm does not support %s\n%s",
1698                 missing_cap->name, upgrade_note);
1699         goto err;
1700     }
1701 
1702     s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
1703 
1704     s->broken_set_mem_region = 1;
1705     ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
1706     if (ret > 0) {
1707         s->broken_set_mem_region = 0;
1708     }
1709 
1710 #ifdef KVM_CAP_VCPU_EVENTS
1711     s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
1712 #endif
1713 
1714     s->robust_singlestep =
1715         kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
1716 
1717 #ifdef KVM_CAP_DEBUGREGS
1718     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
1719 #endif
1720 
1721 #ifdef KVM_CAP_IRQ_ROUTING
1722     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1723 #endif
1724 
1725     s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
1726 
1727     s->irq_set_ioctl = KVM_IRQ_LINE;
1728     if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1729         s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1730     }
1731 
1732 #ifdef KVM_CAP_READONLY_MEM
1733     kvm_readonly_mem_allowed =
1734         (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
1735 #endif
1736 
1737     kvm_eventfds_allowed =
1738         (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
1739 
1740     kvm_irqfds_allowed =
1741         (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
1742 
1743     kvm_resamplefds_allowed =
1744         (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
1745 
1746     kvm_vm_attributes_allowed =
1747         (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
1748 
1749     kvm_ioeventfd_any_length_allowed =
1750         (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
1751 
1752     kvm_state = s;
1753 
1754     ret = kvm_arch_init(ms, s);
1755     if (ret < 0) {
1756         goto err;
1757     }
1758 
1759     if (machine_kernel_irqchip_allowed(ms)) {
1760         kvm_irqchip_create(ms, s);
1761     }
1762 
1763     if (kvm_eventfds_allowed) {
1764         s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
1765         s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
1766     }
1767     s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region;
1768     s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region;
1769 
1770     kvm_memory_listener_register(s, &s->memory_listener,
1771                                  &address_space_memory, 0);
1772     memory_listener_register(&kvm_io_listener,
1773                              &address_space_io);
1774 
1775     s->many_ioeventfds = kvm_check_many_ioeventfds();
1776 
1777     cpu_interrupt_handler = kvm_handle_interrupt;
1778 
1779     return 0;
1780 
1781 err:
1782     assert(ret < 0);
1783     if (s->vmfd >= 0) {
1784         close(s->vmfd);
1785     }
1786     if (s->fd != -1) {
1787         close(s->fd);
1788     }
1789     g_free(s->memory_listener.slots);
1790 
1791     return ret;
1792 }
1793 
1794 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
1795 {
1796     s->sigmask_len = sigmask_len;
1797 }
1798 
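/*
 * Replay a KVM_EXIT_IO request through QEMU's I/O address space.  A single
 * exit can describe a repeated ("string") port access: 'count' transfers of
 * 'size' bytes each, packed back to back in the kvm_run area at
 * run->io.data_offset.  Each transfer is forwarded one at a time below.
 */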
1799 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
1800                           int size, uint32_t count)
1801 {
1802     int i;
1803     uint8_t *ptr = data;
1804 
1805     for (i = 0; i < count; i++) {
1806         address_space_rw(&address_space_io, port, attrs,
1807                          ptr, size,
1808                          direction == KVM_EXIT_IO_OUT);
1809         ptr += size;
1810     }
1811 }
1812 
1813 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
1814 {
1815     fprintf(stderr, "KVM internal error. Suberror: %d\n",
1816             run->internal.suberror);
1817 
1818     if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
1819         int i;
1820 
1821         for (i = 0; i < run->internal.ndata; ++i) {
1822             fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
1823                     i, (uint64_t)run->internal.data[i]);
1824         }
1825     }
1826     if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
1827         fprintf(stderr, "emulation failure\n");
1828         if (!kvm_arch_stop_on_emulation_error(cpu)) {
1829             cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
1830             return EXCP_INTERRUPT;
1831         }
1832     }
1833     /* FIXME: Should trigger a qmp message to let management know
1834      * something went wrong.
1835      */
1836     return -1;
1837 }
1838 
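/*
 * Drain the coalesced MMIO ring shared with the kernel.  KVM appends
 * deferred MMIO writes at ring->last; QEMU consumes entries starting at
 * ring->first and only advances ->first after an entry has been replayed
 * into guest memory, so the kernel never reuses a slot that is still being
 * read.  The in-progress flag guards against reentrant calls while entries
 * are being replayed.
 */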
1839 void kvm_flush_coalesced_mmio_buffer(void)
1840 {
1841     KVMState *s = kvm_state;
1842 
1843     if (s->coalesced_flush_in_progress) {
1844         return;
1845     }
1846 
1847     s->coalesced_flush_in_progress = true;
1848 
1849     if (s->coalesced_mmio_ring) {
1850         struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
1851         while (ring->first != ring->last) {
1852             struct kvm_coalesced_mmio *ent;
1853 
1854             ent = &ring->coalesced_mmio[ring->first];
1855 
1856             cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
1857             smp_wmb();
1858             ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
1859         }
1860     }
1861 
1862     s->coalesced_flush_in_progress = false;
1863 }
1864 
1865 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
1866 {
1867     if (!cpu->kvm_vcpu_dirty) {
1868         kvm_arch_get_registers(cpu);
1869         cpu->kvm_vcpu_dirty = true;
1870     }
1871 }
1872 
1873 void kvm_cpu_synchronize_state(CPUState *cpu)
1874 {
1875     if (!cpu->kvm_vcpu_dirty) {
1876         run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
1877     }
1878 }
1879 
1880 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
1881 {
1882     kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
1883     cpu->kvm_vcpu_dirty = false;
1884 }
1885 
1886 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
1887 {
1888     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
1889 }
1890 
1891 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
1892 {
1893     kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
1894     cpu->kvm_vcpu_dirty = false;
1895 }
1896 
1897 void kvm_cpu_synchronize_post_init(CPUState *cpu)
1898 {
1899     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
1900 }
1901 
1902 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
1903 {
1904     cpu->kvm_vcpu_dirty = true;
1905 }
1906 
1907 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
1908 {
1909     run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
1910 }
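
/*
 * The helpers above implement a lazy register synchronization protocol
 * around cpu->kvm_vcpu_dirty:
 *
 *   - kvm_cpu_synchronize_state() pulls the register state out of the
 *     kernel on demand and marks the QEMU copy dirty;
 *   - the _post_reset/_post_init variants push a full state back and
 *     clear the flag;
 *   - kvm_cpu_synchronize_pre_loadvm() only marks the state dirty, since
 *     incoming (loaded) state is about to overwrite it anyway;
 *   - kvm_cpu_exec() writes any dirty runtime state back right before
 *     the next KVM_RUN.
 *
 * A typical (illustrative) caller that needs current register content
 * therefore only has to do:
 *
 *     kvm_cpu_synchronize_state(cpu);
 *     ... inspect or modify the CPU state; it is flushed back lazily ...
 */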
1911 
1912 #ifdef KVM_HAVE_MCE_INJECTION
1913 static __thread void *pending_sigbus_addr;
1914 static __thread int pending_sigbus_code;
1915 static __thread bool have_sigbus_pending;
1916 #endif
1917 
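/*
 * Forcing a vCPU thread out of KVM_RUN: with KVM_CAP_IMMEDIATE_EXIT the
 * run->immediate_exit flag is set (also from the SIG_IPI handler), which
 * makes KVM_RUN return to userspace as soon as possible; without it, a
 * SIG_IPI self-signal interrupts the ioctl and kvm_eat_signals() later
 * drains any signal that is still pending.
 */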
1918 static void kvm_cpu_kick(CPUState *cpu)
1919 {
1920     atomic_set(&cpu->kvm_run->immediate_exit, 1);
1921 }
1922 
1923 static void kvm_cpu_kick_self(void)
1924 {
1925     if (kvm_immediate_exit) {
1926         kvm_cpu_kick(current_cpu);
1927     } else {
1928         qemu_cpu_kick_self();
1929     }
1930 }
1931 
1932 static void kvm_eat_signals(CPUState *cpu)
1933 {
1934     struct timespec ts = { 0, 0 };
1935     siginfo_t siginfo;
1936     sigset_t waitset;
1937     sigset_t chkset;
1938     int r;
1939 
1940     if (kvm_immediate_exit) {
1941         atomic_set(&cpu->kvm_run->immediate_exit, 0);
1942         /* Write kvm_run->immediate_exit before the cpu->exit_request
1943          * write in kvm_cpu_exec.
1944          */
1945         smp_wmb();
1946         return;
1947     }
1948 
1949     sigemptyset(&waitset);
1950     sigaddset(&waitset, SIG_IPI);
1951 
1952     do {
1953         r = sigtimedwait(&waitset, &siginfo, &ts);
1954         if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
1955             perror("sigtimedwait");
1956             exit(1);
1957         }
1958 
1959         r = sigpending(&chkset);
1960         if (r == -1) {
1961             perror("sigpending");
1962             exit(1);
1963         }
1964     } while (sigismember(&chkset, SIG_IPI));
1965 }
1966 
1967 int kvm_cpu_exec(CPUState *cpu)
1968 {
1969     struct kvm_run *run = cpu->kvm_run;
1970     int ret, run_ret;
1971 
1972     DPRINTF("kvm_cpu_exec()\n");
1973 
1974     if (kvm_arch_process_async_events(cpu)) {
1975         atomic_set(&cpu->exit_request, 0);
1976         return EXCP_HLT;
1977     }
1978 
1979     qemu_mutex_unlock_iothread();
1980     cpu_exec_start(cpu);
1981 
1982     do {
1983         MemTxAttrs attrs;
1984 
1985         if (cpu->kvm_vcpu_dirty) {
1986             kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
1987             cpu->kvm_vcpu_dirty = false;
1988         }
1989 
1990         kvm_arch_pre_run(cpu, run);
1991         if (atomic_read(&cpu->exit_request)) {
1992             DPRINTF("interrupt exit requested\n");
1993             /*
1994              * KVM requires us to reenter the kernel after IO exits to complete
1995              * instruction emulation. Kicking ourselves here ensures that we
1996              * drop out of KVM_RUN again as soon as possible.
1997              */
1998             kvm_cpu_kick_self();
1999         }
2000 
2001         /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
2002          * Matching barrier in kvm_eat_signals.
2003          */
2004         smp_rmb();
2005 
2006         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
2007 
2008         attrs = kvm_arch_post_run(cpu, run);
2009 
2010 #ifdef KVM_HAVE_MCE_INJECTION
2011         if (unlikely(have_sigbus_pending)) {
2012             qemu_mutex_lock_iothread();
2013             kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
2014                                     pending_sigbus_addr);
2015             have_sigbus_pending = false;
2016             qemu_mutex_unlock_iothread();
2017         }
2018 #endif
2019 
2020         if (run_ret < 0) {
2021             if (run_ret == -EINTR || run_ret == -EAGAIN) {
2022                 DPRINTF("io window exit\n");
2023                 kvm_eat_signals(cpu);
2024                 ret = EXCP_INTERRUPT;
2025                 break;
2026             }
2027             fprintf(stderr, "error: kvm run failed %s\n",
2028                     strerror(-run_ret));
2029 #ifdef TARGET_PPC
2030             if (run_ret == -EBUSY) {
2031                 fprintf(stderr,
2032                         "This is probably because your SMT is enabled.\n"
2033                         "VCPU can only run on primary threads with all "
2034                         "secondary threads offline.\n");
2035             }
2036 #endif
2037             ret = -1;
2038             break;
2039         }
2040 
2041         trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
2042         switch (run->exit_reason) {
2043         case KVM_EXIT_IO:
2044             DPRINTF("handle_io\n");
2045             /* Called outside BQL */
2046             kvm_handle_io(run->io.port, attrs,
2047                           (uint8_t *)run + run->io.data_offset,
2048                           run->io.direction,
2049                           run->io.size,
2050                           run->io.count);
2051             ret = 0;
2052             break;
2053         case KVM_EXIT_MMIO:
2054             DPRINTF("handle_mmio\n");
2055             /* Called outside BQL */
2056             address_space_rw(&address_space_memory,
2057                              run->mmio.phys_addr, attrs,
2058                              run->mmio.data,
2059                              run->mmio.len,
2060                              run->mmio.is_write);
2061             ret = 0;
2062             break;
2063         case KVM_EXIT_IRQ_WINDOW_OPEN:
2064             DPRINTF("irq_window_open\n");
2065             ret = EXCP_INTERRUPT;
2066             break;
2067         case KVM_EXIT_SHUTDOWN:
2068             DPRINTF("shutdown\n");
2069             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2070             ret = EXCP_INTERRUPT;
2071             break;
2072         case KVM_EXIT_UNKNOWN:
2073             fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
2074                     (uint64_t)run->hw.hardware_exit_reason);
2075             ret = -1;
2076             break;
2077         case KVM_EXIT_INTERNAL_ERROR:
2078             ret = kvm_handle_internal_error(cpu, run);
2079             break;
2080         case KVM_EXIT_SYSTEM_EVENT:
2081             switch (run->system_event.type) {
2082             case KVM_SYSTEM_EVENT_SHUTDOWN:
2083                 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
2084                 ret = EXCP_INTERRUPT;
2085                 break;
2086             case KVM_SYSTEM_EVENT_RESET:
2087                 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2088                 ret = EXCP_INTERRUPT;
2089                 break;
2090             case KVM_SYSTEM_EVENT_CRASH:
2091                 kvm_cpu_synchronize_state(cpu);
2092                 qemu_mutex_lock_iothread();
2093                 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2094                 qemu_mutex_unlock_iothread();
2095                 ret = 0;
2096                 break;
2097             default:
2098                 DPRINTF("kvm_arch_handle_exit\n");
2099                 ret = kvm_arch_handle_exit(cpu, run);
2100                 break;
2101             }
2102             break;
2103         default:
2104             DPRINTF("kvm_arch_handle_exit\n");
2105             ret = kvm_arch_handle_exit(cpu, run);
2106             break;
2107         }
2108     } while (ret == 0);
2109 
2110     cpu_exec_end(cpu);
2111     qemu_mutex_lock_iothread();
2112 
2113     if (ret < 0) {
2114         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
2115         vm_stop(RUN_STATE_INTERNAL_ERROR);
2116     }
2117 
2118     atomic_set(&cpu->exit_request, 0);
2119     return ret;
2120 }
2121 
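/*
 * Thin wrappers around ioctl() for the three levels of KVM file
 * descriptors (the /dev/kvm system fd, the per-VM fd and the per-vCPU fd)
 * plus device fds created via KVM_CREATE_DEVICE.  They trace the call and
 * convert the -1/errno convention into a negative errno return value, e.g.:
 *
 *     ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
 *     if (ret < 0) {
 *         ... ret holds -errno ...
 *     }
 */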
2122 int kvm_ioctl(KVMState *s, int type, ...)
2123 {
2124     int ret;
2125     void *arg;
2126     va_list ap;
2127 
2128     va_start(ap, type);
2129     arg = va_arg(ap, void *);
2130     va_end(ap);
2131 
2132     trace_kvm_ioctl(type, arg);
2133     ret = ioctl(s->fd, type, arg);
2134     if (ret == -1) {
2135         ret = -errno;
2136     }
2137     return ret;
2138 }
2139 
2140 int kvm_vm_ioctl(KVMState *s, int type, ...)
2141 {
2142     int ret;
2143     void *arg;
2144     va_list ap;
2145 
2146     va_start(ap, type);
2147     arg = va_arg(ap, void *);
2148     va_end(ap);
2149 
2150     trace_kvm_vm_ioctl(type, arg);
2151     ret = ioctl(s->vmfd, type, arg);
2152     if (ret == -1) {
2153         ret = -errno;
2154     }
2155     return ret;
2156 }
2157 
2158 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
2159 {
2160     int ret;
2161     void *arg;
2162     va_list ap;
2163 
2164     va_start(ap, type);
2165     arg = va_arg(ap, void *);
2166     va_end(ap);
2167 
2168     trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
2169     ret = ioctl(cpu->kvm_fd, type, arg);
2170     if (ret == -1) {
2171         ret = -errno;
2172     }
2173     return ret;
2174 }
2175 
2176 int kvm_device_ioctl(int fd, int type, ...)
2177 {
2178     int ret;
2179     void *arg;
2180     va_list ap;
2181 
2182     va_start(ap, type);
2183     arg = va_arg(ap, void *);
2184     va_end(ap);
2185 
2186     trace_kvm_device_ioctl(fd, type, arg);
2187     ret = ioctl(fd, type, arg);
2188     if (ret == -1) {
2189         ret = -errno;
2190     }
2191     return ret;
2192 }
2193 
2194 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
2195 {
2196     int ret;
2197     struct kvm_device_attr attribute = {
2198         .group = group,
2199         .attr = attr,
2200     };
2201 
2202     if (!kvm_vm_attributes_allowed) {
2203         return 0;
2204     }
2205 
2206     ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
2207     /* kvm returns 0 on success for HAS_DEVICE_ATTR */
2208     return ret ? 0 : 1;
2209 }
2210 
2211 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
2212 {
2213     struct kvm_device_attr attribute = {
2214         .group = group,
2215         .attr = attr,
2216         .flags = 0,
2217     };
2218 
2219     return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
2220 }
2221 
2222 int kvm_device_access(int fd, int group, uint64_t attr,
2223                       void *val, bool write, Error **errp)
2224 {
2225     struct kvm_device_attr kvmattr;
2226     int err;
2227 
2228     kvmattr.flags = 0;
2229     kvmattr.group = group;
2230     kvmattr.attr = attr;
2231     kvmattr.addr = (uintptr_t)val;
2232 
2233     err = kvm_device_ioctl(fd,
2234                            write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
2235                            &kvmattr);
2236     if (err < 0) {
2237         error_setg_errno(errp, -err,
2238                          "KVM_%s_DEVICE_ATTR failed: Group %d "
2239                          "attr 0x%016" PRIx64,
2240                          write ? "SET" : "GET", group, attr);
2241     }
2242     return err;
2243 }
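
/*
 * Illustrative use of the two helpers above (GROUP and ATTR are
 * placeholders, not real KVM constants): probe whether the kernel knows
 * an attribute before writing it.
 *
 *     uint64_t val = 0x1000;
 *     Error *local_err = NULL;
 *
 *     if (kvm_device_check_attr(dev_fd, GROUP, ATTR)) {
 *         kvm_device_access(dev_fd, GROUP, ATTR, &val, true, &local_err);
 *     }
 */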
2244 
2245 /* Return 1 on success, 0 on failure */
2246 int kvm_has_sync_mmu(void)
2247 {
2248     return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2249 }
2250 
2251 int kvm_has_vcpu_events(void)
2252 {
2253     return kvm_state->vcpu_events;
2254 }
2255 
2256 int kvm_has_robust_singlestep(void)
2257 {
2258     return kvm_state->robust_singlestep;
2259 }
2260 
2261 int kvm_has_debugregs(void)
2262 {
2263     return kvm_state->debugregs;
2264 }
2265 
2266 int kvm_has_many_ioeventfds(void)
2267 {
2268     if (!kvm_enabled()) {
2269         return 0;
2270     }
2271     return kvm_state->many_ioeventfds;
2272 }
2273 
2274 int kvm_has_gsi_routing(void)
2275 {
2276 #ifdef KVM_CAP_IRQ_ROUTING
2277     return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
2278 #else
2279     return false;
2280 #endif
2281 }
2282 
2283 int kvm_has_intx_set_mask(void)
2284 {
2285     return kvm_state->intx_set_mask;
2286 }
2287 
2288 #ifdef KVM_CAP_SET_GUEST_DEBUG
2289 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
2290                                                  target_ulong pc)
2291 {
2292     struct kvm_sw_breakpoint *bp;
2293 
2294     QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
2295         if (bp->pc == pc) {
2296             return bp;
2297         }
2298     }
2299     return NULL;
2300 }
2301 
2302 int kvm_sw_breakpoints_active(CPUState *cpu)
2303 {
2304     return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
2305 }
2306 
2307 struct kvm_set_guest_debug_data {
2308     struct kvm_guest_debug dbg;
2309     int err;
2310 };
2311 
2312 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
2313 {
2314     struct kvm_set_guest_debug_data *dbg_data =
2315         (struct kvm_set_guest_debug_data *) data.host_ptr;
2316 
2317     dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
2318                                    &dbg_data->dbg);
2319 }
2320 
2321 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2322 {
2323     struct kvm_set_guest_debug_data data;
2324 
2325     data.dbg.control = reinject_trap;
2326 
2327     if (cpu->singlestep_enabled) {
2328         data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2329     }
2330     kvm_arch_update_guest_debug(cpu, &data.dbg);
2331 
2332     run_on_cpu(cpu, kvm_invoke_set_guest_debug,
2333                RUN_ON_CPU_HOST_PTR(&data));
2334     return data.err;
2335 }
2336 
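/*
 * Software breakpoints (GDB_BREAKPOINT_SW) are reference-counted and kept
 * on the per-KVMState kvm_sw_breakpoints list; hardware breakpoints and
 * watchpoints are handed to the architecture code.  Either way, the guest
 * debug state has to be refreshed on every vCPU afterwards, which is why
 * both kvm_insert_breakpoint() and kvm_remove_breakpoint() end with a
 * CPU_FOREACH loop calling kvm_update_guest_debug().
 */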
2337 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2338                           target_ulong len, int type)
2339 {
2340     struct kvm_sw_breakpoint *bp;
2341     int err;
2342 
2343     if (type == GDB_BREAKPOINT_SW) {
2344         bp = kvm_find_sw_breakpoint(cpu, addr);
2345         if (bp) {
2346             bp->use_count++;
2347             return 0;
2348         }
2349 
2350         bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
2351         bp->pc = addr;
2352         bp->use_count = 1;
2353         err = kvm_arch_insert_sw_breakpoint(cpu, bp);
2354         if (err) {
2355             g_free(bp);
2356             return err;
2357         }
2358 
2359         QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2360     } else {
2361         err = kvm_arch_insert_hw_breakpoint(addr, len, type);
2362         if (err) {
2363             return err;
2364         }
2365     }
2366 
2367     CPU_FOREACH(cpu) {
2368         err = kvm_update_guest_debug(cpu, 0);
2369         if (err) {
2370             return err;
2371         }
2372     }
2373     return 0;
2374 }
2375 
2376 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2377                           target_ulong len, int type)
2378 {
2379     struct kvm_sw_breakpoint *bp;
2380     int err;
2381 
2382     if (type == GDB_BREAKPOINT_SW) {
2383         bp = kvm_find_sw_breakpoint(cpu, addr);
2384         if (!bp) {
2385             return -ENOENT;
2386         }
2387 
2388         if (bp->use_count > 1) {
2389             bp->use_count--;
2390             return 0;
2391         }
2392 
2393         err = kvm_arch_remove_sw_breakpoint(cpu, bp);
2394         if (err) {
2395             return err;
2396         }
2397 
2398         QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2399         g_free(bp);
2400     } else {
2401         err = kvm_arch_remove_hw_breakpoint(addr, len, type);
2402         if (err) {
2403             return err;
2404         }
2405     }
2406 
2407     CPU_FOREACH(cpu) {
2408         err = kvm_update_guest_debug(cpu, 0);
2409         if (err) {
2410             return err;
2411         }
2412     }
2413     return 0;
2414 }
2415 
2416 void kvm_remove_all_breakpoints(CPUState *cpu)
2417 {
2418     struct kvm_sw_breakpoint *bp, *next;
2419     KVMState *s = cpu->kvm_state;
2420     CPUState *tmpcpu;
2421 
2422     QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
2423         if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
2424             /* Try harder to find a CPU that currently sees the breakpoint. */
2425             CPU_FOREACH(tmpcpu) {
2426                 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
2427                     break;
2428                 }
2429             }
2430         }
2431         QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
2432         g_free(bp);
2433     }
2434     kvm_arch_remove_all_hw_breakpoints();
2435 
2436     CPU_FOREACH(cpu) {
2437         kvm_update_guest_debug(cpu, 0);
2438     }
2439 }
2440 
2441 #else /* !KVM_CAP_SET_GUEST_DEBUG */
2442 
2443 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2444 {
2445     return -EINVAL;
2446 }
2447 
2448 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2449                           target_ulong len, int type)
2450 {
2451     return -EINVAL;
2452 }
2453 
2454 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2455                           target_ulong len, int type)
2456 {
2457     return -EINVAL;
2458 }
2459 
2460 void kvm_remove_all_breakpoints(CPUState *cpu)
2461 {
2462 }
2463 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
2464 
2465 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
2466 {
2467     KVMState *s = kvm_state;
2468     struct kvm_signal_mask *sigmask;
2469     int r;
2470 
2471     sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
2472 
2473     sigmask->len = s->sigmask_len;
2474     memcpy(sigmask->sigset, sigset, sizeof(*sigset));
2475     r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
2476     g_free(sigmask);
2477 
2478     return r;
2479 }
2480 
2481 static void kvm_ipi_signal(int sig)
2482 {
2483     if (current_cpu) {
2484         assert(kvm_immediate_exit);
2485         kvm_cpu_kick(current_cpu);
2486     }
2487 }
2488 
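/*
 * Per-vCPU signal setup: install kvm_ipi_signal() as the SIG_IPI handler
 * and arrange for the signal to be deliverable at the right time.  With
 * KVM_CAP_IMMEDIATE_EXIT the thread simply unblocks SIG_IPI; without it,
 * the mask with SIG_IPI removed is handed to the kernel through
 * KVM_SET_SIGNAL_MASK, so the signal is only delivered while the vCPU is
 * inside KVM_RUN and otherwise stays pending for kvm_eat_signals().
 */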
2489 void kvm_init_cpu_signals(CPUState *cpu)
2490 {
2491     int r;
2492     sigset_t set;
2493     struct sigaction sigact;
2494 
2495     memset(&sigact, 0, sizeof(sigact));
2496     sigact.sa_handler = kvm_ipi_signal;
2497     sigaction(SIG_IPI, &sigact, NULL);
2498 
2499     pthread_sigmask(SIG_BLOCK, NULL, &set);
2500 #if defined KVM_HAVE_MCE_INJECTION
2501     sigdelset(&set, SIGBUS);
2502     pthread_sigmask(SIG_SETMASK, &set, NULL);
2503 #endif
2504     sigdelset(&set, SIG_IPI);
2505     if (kvm_immediate_exit) {
2506         r = pthread_sigmask(SIG_SETMASK, &set, NULL);
2507     } else {
2508         r = kvm_set_signal_mask(cpu, &set);
2509     }
2510     if (r) {
2511         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
2512         exit(1);
2513     }
2514 }
2515 
2516 /* Called asynchronously in VCPU thread.  */
2517 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2518 {
2519 #ifdef KVM_HAVE_MCE_INJECTION
2520     if (have_sigbus_pending) {
2521         return 1;
2522     }
2523     have_sigbus_pending = true;
2524     pending_sigbus_addr = addr;
2525     pending_sigbus_code = code;
2526     atomic_set(&cpu->exit_request, 1);
2527     return 0;
2528 #else
2529     return 1;
2530 #endif
2531 }
2532 
2533 /* Called synchronously (via signalfd) in main thread.  */
2534 int kvm_on_sigbus(int code, void *addr)
2535 {
2536 #ifdef KVM_HAVE_MCE_INJECTION
2537     /* Action required MCE kills the process if SIGBUS is blocked.  Because
2538      * that's what happens in the I/O thread, where we handle MCE via signalfd,
2539      * we can only get action optional here.
2540      */
2541     assert(code != BUS_MCEERR_AR);
2542     kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
2543     return 0;
2544 #else
2545     return 1;
2546 #endif
2547 }
2548 
2549 int kvm_create_device(KVMState *s, uint64_t type, bool test)
2550 {
2551     int ret;
2552     struct kvm_create_device create_dev;
2553 
2554     create_dev.type = type;
2555     create_dev.fd = -1;
2556     create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
2557 
2558     if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
2559         return -ENOTSUP;
2560     }
2561 
2562     ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
2563     if (ret) {
2564         return ret;
2565     }
2566 
2567     return test ? 0 : create_dev.fd;
2568 }
2569 
2570 bool kvm_device_supported(int vmfd, uint64_t type)
2571 {
2572     struct kvm_create_device create_dev = {
2573         .type = type,
2574         .fd = -1,
2575         .flags = KVM_CREATE_DEVICE_TEST,
2576     };
2577 
2578     if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
2579         return false;
2580     }
2581 
2582     return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
2583 }
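
/*
 * Typical (illustrative) usage of the device factory above; TYPE is a
 * placeholder for a real KVM device type:
 *
 *     if (kvm_create_device(s, TYPE, true) == 0) {        probe only
 *         int dev_fd = kvm_create_device(s, TYPE, false); really create
 *         ...
 *     }
 */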
2584 
2585 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
2586 {
2587     struct kvm_one_reg reg;
2588     int r;
2589 
2590     reg.id = id;
2591     reg.addr = (uintptr_t) source;
2592     r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
2593     if (r) {
2594         trace_kvm_failed_reg_set(id, strerror(-r));
2595     }
2596     return r;
2597 }
2598 
2599 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
2600 {
2601     struct kvm_one_reg reg;
2602     int r;
2603 
2604     reg.id = id;
2605     reg.addr = (uintptr_t) target;
2606     r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
2607     if (r) {
2608         trace_kvm_failed_reg_get(id, strerror(-r));
2609     }
2610     return r;
2611 }
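
/*
 * Illustrative use of the ONE_REG accessors (REG_ID is a placeholder;
 * real ids are architecture-specific KVM_REG_* encodings):
 *
 *     uint64_t val;
 *
 *     if (kvm_get_one_reg(cs, REG_ID, &val) == 0) {
 *         val |= 1;
 *         kvm_set_one_reg(cs, REG_ID, &val);
 *     }
 */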
2612 
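/*
 * Register KVM as a QOM accelerator type: this is what makes kvm_init()
 * run as the machine-init hook when the "kvm" accelerator is selected.
 */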
2613 static void kvm_accel_class_init(ObjectClass *oc, void *data)
2614 {
2615     AccelClass *ac = ACCEL_CLASS(oc);
2616     ac->name = "KVM";
2617     ac->init_machine = kvm_init;
2618     ac->allowed = &kvm_allowed;
2619 }
2620 
2621 static const TypeInfo kvm_accel_type = {
2622     .name = TYPE_KVM_ACCEL,
2623     .parent = TYPE_ACCEL,
2624     .class_init = kvm_accel_class_init,
2625     .instance_size = sizeof(KVMState),
2626 };
2627 
2628 static void kvm_type_init(void)
2629 {
2630     type_register_static(&kvm_accel_type);
2631 }
2632 
2633 type_init(kvm_type_init);
2634