xref: /openbmc/qemu/accel/kvm/kvm-all.c (revision f357f564)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright IBM, Corp. 2008
5  *           Red Hat, Inc. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *  Glauber Costa     <gcosta@redhat.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  *
14  */
15 
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18 
19 #include <linux/kvm.h>
20 
21 #include "qemu-common.h"
22 #include "qemu/atomic.h"
23 #include "qemu/option.h"
24 #include "qemu/config-file.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "hw/hw.h"
28 #include "hw/pci/msi.h"
29 #include "hw/pci/msix.h"
30 #include "hw/s390x/adapter.h"
31 #include "exec/gdbstub.h"
32 #include "sysemu/kvm_int.h"
33 #include "sysemu/cpus.h"
34 #include "qemu/bswap.h"
35 #include "exec/memory.h"
36 #include "exec/ram_addr.h"
37 #include "exec/address-spaces.h"
38 #include "qemu/event_notifier.h"
39 #include "trace.h"
40 #include "hw/irq.h"
41 
42 #include "hw/boards.h"
43 
44 /* This check must be after config-host.h is included */
45 #ifdef CONFIG_EVENTFD
46 #include <sys/eventfd.h>
47 #endif
48 
49 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
50  * need to use the real host PAGE_SIZE, as that's what KVM will use.
51  */
52 #define PAGE_SIZE getpagesize() /* expands to a function call, so it is evaluated at every use */
53 
54 //#define DEBUG_KVM
55 
/* Debug trace helper: prints to stderr when DEBUG_KVM is defined,
 * otherwise expands to an empty statement. */
56 #ifdef DEBUG_KVM
57 #define DPRINTF(fmt, ...) \
58     do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
59 #else
60 #define DPRINTF(fmt, ...) \
61     do { } while (0)
62 #endif
63 
64 #define KVM_MSI_HASHTAB_SIZE    256 /* number of buckets in KVMState.msi_hashtab */
65 
/* Record of a vcpu file descriptor kept after kvm_destroy_vcpu() so that
 * kvm_get_vcpu() can hand it out again for the same vcpu id. */
66 struct KVMParkedVcpu {
67     unsigned long vcpu_id; /* value of kvm_arch_vcpu_id() for the parked vcpu */
68     int kvm_fd; /* vcpu file descriptor saved from CPUState.kvm_fd */
69     QLIST_ENTRY(KVMParkedVcpu) node; /* link in KVMState.kvm_parked_vcpus */
70 };
71 
/* Accelerator-wide KVM state; the single instance is reachable through the
 * global kvm_state pointer. */
72 struct KVMState
73 {
74     AccelState parent_obj;
75 
76     int nr_slots; /* number of entries in each KVMMemoryListener.slots array */
77     int fd;
78     int vmfd;
79     int coalesced_mmio; /* when non-zero, page offset of the coalesced MMIO ring inside the kvm_run mapping */
80     struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; /* set once by kvm_init_vcpu() */
81     bool coalesced_flush_in_progress;
82     int vcpu_events;
83     int robust_singlestep;
84     int debugregs;
85 #ifdef KVM_CAP_SET_GUEST_DEBUG
86     struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
87 #endif
88     int many_ioeventfds;
89     int intx_set_mask;
90     /* The man page (and posix) say ioctl numbers are signed int, but
91      * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
92      * unsigned, and treating them as signed here can break things */
93     unsigned irq_set_ioctl; /* request number kvm_set_irq() passes to kvm_vm_ioctl() */
94     unsigned int sigmask_len;
95     GHashTable *gsimap;
96 #ifdef KVM_CAP_IRQ_ROUTING
97     struct kvm_irq_routing *irq_routes; /* routing table handed to KVM_SET_GSI_ROUTING */
98     int nr_allocated_irq_routes; /* capacity of irq_routes->entries */
99     unsigned long *used_gsi_bitmap; /* one bit per GSI, maintained by set_gsi()/clear_gsi() */
100     unsigned int gsi_count; /* number of bits allocated in used_gsi_bitmap */
101     QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
102 #endif
103     KVMMemoryListener memory_listener;
104     QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; /* vcpu fds parked by kvm_destroy_vcpu() */
105 };
106 
/* Global accelerator handle and feature flags.
 * NOTE(review): most of these flags are written outside this part of the
 * file; confirm their exact meaning against the initialisation code. */
107 KVMState *kvm_state;
108 bool kvm_kernel_irqchip;
109 bool kvm_split_irqchip;
110 bool kvm_async_interrupts_allowed;
111 bool kvm_halt_in_kernel_allowed;
112 bool kvm_eventfds_allowed;
113 bool kvm_irqfds_allowed;
114 bool kvm_resamplefds_allowed;
115 bool kvm_msi_via_irqfd_allowed;
116 bool kvm_gsi_routing_allowed;
117 bool kvm_gsi_direct_mapping;
118 bool kvm_allowed;
119 bool kvm_readonly_mem_allowed; /* gates use of KVM_MEM_READONLY in kvm_mem_flags() */
120 bool kvm_vm_attributes_allowed;
121 bool kvm_direct_msi_allowed; /* when false, kvm_init_irq_routing() sets up msi_hashtab */
122 bool kvm_ioeventfd_any_length_allowed;
123 bool kvm_msi_use_devid;
124 static bool kvm_immediate_exit;
125 
/* Capability table terminated by KVM_CAP_LAST_INFO.
 * NOTE(review): presumably walked with kvm_check_extension_list() during
 * initialisation; the consumer is not visible in this part of the file. */
126 static const KVMCapabilityInfo kvm_required_capabilites[] = {
127     KVM_CAP_INFO(USER_MEMORY),
128     KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
129     KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
130     KVM_CAP_LAST_INFO
131 };
132 
133 int kvm_get_max_memslots(void)
134 {
135     KVMState *s = KVM_STATE(current_machine->accelerator);
136 
137     return s->nr_slots;
138 }
139 
140 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
141 {
142     KVMState *s = kvm_state;
143     int i;
144 
145     for (i = 0; i < s->nr_slots; i++) {
146         if (kml->slots[i].memory_size == 0) {
147             return &kml->slots[i];
148         }
149     }
150 
151     return NULL;
152 }
153 
154 bool kvm_has_free_slot(MachineState *ms)
155 {
156     KVMState *s = KVM_STATE(ms->accelerator);
157 
158     return kvm_get_free_slot(&s->memory_listener);
159 }
160 
161 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
162 {
163     KVMSlot *slot = kvm_get_free_slot(kml);
164 
165     if (slot) {
166         return slot;
167     }
168 
169     fprintf(stderr, "%s: no free slot available\n", __func__);
170     abort();
171 }
172 
173 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
174                                          hwaddr start_addr,
175                                          hwaddr size)
176 {
177     KVMState *s = kvm_state;
178     int i;
179 
180     for (i = 0; i < s->nr_slots; i++) {
181         KVMSlot *mem = &kml->slots[i];
182 
183         if (start_addr == mem->start_addr && size == mem->memory_size) {
184             return mem;
185         }
186     }
187 
188     return NULL;
189 }
190 
191 /*
192  * Calculate and align the start address and the size of the section.
193  * Return the size. If the size is 0, the aligned section is empty.
194  */
195 static hwaddr kvm_align_section(MemoryRegionSection *section,
196                                 hwaddr *start)
197 {
198     hwaddr size = int128_get64(section->size);
199     hwaddr delta;
200 
201     *start = section->offset_within_address_space;
202 
203     /* kvm works in page size chunks, but the function may be called
204        with sub-page size and unaligned start address. Pad the start
205        address to next and truncate size to previous page boundary. */
206     delta = qemu_real_host_page_size - (*start & ~qemu_real_host_page_mask);
207     delta &= ~qemu_real_host_page_mask;
208     *start += delta;
209     if (delta > size) {
210         return 0;
211     }
212     size -= delta;
213     size &= qemu_real_host_page_mask;
214     if (*start & ~qemu_real_host_page_mask) {
215         return 0;
216     }
217 
218     return size;
219 }
220 
221 /*
222  * Find overlapping slot with lowest start address
223  */
224 static KVMSlot *kvm_lookup_overlapping_slot(KVMMemoryListener *kml,
225                                             hwaddr start_addr,
226                                             hwaddr end_addr)
227 {
228     KVMState *s = kvm_state;
229     KVMSlot *found = NULL;
230     int i;
231 
232     for (i = 0; i < s->nr_slots; i++) {
233         KVMSlot *mem = &kml->slots[i];
234 
235         if (mem->memory_size == 0 ||
236             (found && found->start_addr < mem->start_addr)) {
237             continue;
238         }
239 
240         if (end_addr > mem->start_addr &&
241             start_addr < mem->start_addr + mem->memory_size) {
242             found = mem;
243         }
244     }
245 
246     return found;
247 }
248 
249 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
250                                        hwaddr *phys_addr)
251 {
252     KVMMemoryListener *kml = &s->memory_listener;
253     int i;
254 
255     for (i = 0; i < s->nr_slots; i++) {
256         KVMSlot *mem = &kml->slots[i];
257 
258         if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
259             *phys_addr = mem->start_addr + (ram - mem->ram);
260             return 1;
261         }
262     }
263 
264     return 0;
265 }
266 
267 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
268 {
269     KVMState *s = kvm_state;
270     struct kvm_userspace_memory_region mem;
271 
272     mem.slot = slot->slot | (kml->as_id << 16);
273     mem.guest_phys_addr = slot->start_addr;
274     mem.userspace_addr = (unsigned long)slot->ram;
275     mem.flags = slot->flags;
276 
277     if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
278         /* Set the slot size to 0 before setting the slot to the desired
279          * value. This is needed based on KVM commit 75d61fbc. */
280         mem.memory_size = 0;
281         kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
282     }
283     mem.memory_size = slot->memory_size;
284     return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
285 }
286 
287 int kvm_destroy_vcpu(CPUState *cpu)
288 {
289     KVMState *s = kvm_state;
290     long mmap_size;
291     struct KVMParkedVcpu *vcpu = NULL;
292     int ret = 0;
293 
294     DPRINTF("kvm_destroy_vcpu\n");
295 
296     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
297     if (mmap_size < 0) {
298         ret = mmap_size;
299         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
300         goto err;
301     }
302 
303     ret = munmap(cpu->kvm_run, mmap_size);
304     if (ret < 0) {
305         goto err;
306     }
307 
308     vcpu = g_malloc0(sizeof(*vcpu));
309     vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
310     vcpu->kvm_fd = cpu->kvm_fd;
311     QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
312 err:
313     return ret;
314 }
315 
316 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
317 {
318     struct KVMParkedVcpu *cpu;
319 
320     QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
321         if (cpu->vcpu_id == vcpu_id) {
322             int kvm_fd;
323 
324             QLIST_REMOVE(cpu, node);
325             kvm_fd = cpu->kvm_fd;
326             g_free(cpu);
327             return kvm_fd;
328         }
329     }
330 
331     return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
332 }
333 
334 int kvm_init_vcpu(CPUState *cpu)
335 {
336     KVMState *s = kvm_state;
337     long mmap_size;
338     int ret;
339 
340     DPRINTF("kvm_init_vcpu\n");
341 
342     ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
343     if (ret < 0) {
344         DPRINTF("kvm_create_vcpu failed\n");
345         goto err;
346     }
347 
348     cpu->kvm_fd = ret;
349     cpu->kvm_state = s;
350     cpu->vcpu_dirty = true;
351 
352     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
353     if (mmap_size < 0) {
354         ret = mmap_size;
355         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
356         goto err;
357     }
358 
359     cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
360                         cpu->kvm_fd, 0);
361     if (cpu->kvm_run == MAP_FAILED) {
362         ret = -errno;
363         DPRINTF("mmap'ing vcpu state failed\n");
364         goto err;
365     }
366 
367     if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
368         s->coalesced_mmio_ring =
369             (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
370     }
371 
372     ret = kvm_arch_init_vcpu(cpu);
373 err:
374     return ret;
375 }
376 
377 /*
378  * dirty pages logging control
379  */
380 
381 static int kvm_mem_flags(MemoryRegion *mr)
382 {
383     bool readonly = mr->readonly || memory_region_is_romd(mr);
384     int flags = 0;
385 
386     if (memory_region_get_dirty_log_mask(mr) != 0) {
387         flags |= KVM_MEM_LOG_DIRTY_PAGES;
388     }
389     if (readonly && kvm_readonly_mem_allowed) {
390         flags |= KVM_MEM_READONLY;
391     }
392     return flags;
393 }
394 
395 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
396                                  MemoryRegion *mr)
397 {
398     int old_flags;
399 
400     old_flags = mem->flags;
401     mem->flags = kvm_mem_flags(mr);
402 
403     /* If nothing changed effectively, no need to issue ioctl */
404     if (mem->flags == old_flags) {
405         return 0;
406     }
407 
408     return kvm_set_user_memory_region(kml, mem);
409 }
410 
411 static int kvm_section_update_flags(KVMMemoryListener *kml,
412                                     MemoryRegionSection *section)
413 {
414     hwaddr phys_addr = section->offset_within_address_space;
415     ram_addr_t size = int128_get64(section->size);
416     KVMSlot *mem = kvm_lookup_matching_slot(kml, phys_addr, size);
417 
418     if (mem == NULL)  {
419         return 0;
420     } else {
421         return kvm_slot_update_flags(kml, mem, section->mr);
422     }
423 }
424 
425 static void kvm_log_start(MemoryListener *listener,
426                           MemoryRegionSection *section,
427                           int old, int new)
428 {
429     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
430     int r;
431 
432     if (old != 0) {
433         return;
434     }
435 
436     r = kvm_section_update_flags(kml, section);
437     if (r < 0) {
438         abort();
439     }
440 }
441 
442 static void kvm_log_stop(MemoryListener *listener,
443                           MemoryRegionSection *section,
444                           int old, int new)
445 {
446     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
447     int r;
448 
449     if (new != 0) {
450         return;
451     }
452 
453     r = kvm_section_update_flags(kml, section);
454     if (r < 0) {
455         abort();
456     }
457 }
458 
459 /* get kvm's dirty pages bitmap and update qemu's */
460 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
461                                          unsigned long *bitmap)
462 {
463     ram_addr_t start = section->offset_within_region +
464                        memory_region_get_ram_addr(section->mr);
465     ram_addr_t pages = int128_get64(section->size) / getpagesize();
466 
467     cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
468     return 0;
469 }
470 
/* Round x up to a multiple of y; the mask form is only correct when y is a
 * power of two.  Both arguments are evaluated more than once. */
471 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
472 
473 /**
474  * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
475  * This function updates qemu's dirty bitmap using
476  * memory_region_set_dirty().  This means all bits are set
477  * to dirty.
478  *
479  * @start_add: start of logged region.
480  * @end_addr: end of logged region.
481  */
482 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
483                                           MemoryRegionSection *section)
484 {
485     KVMState *s = kvm_state;
486     unsigned long size, allocated_size = 0;
487     struct kvm_dirty_log d = {};
488     KVMSlot *mem;
489     int ret = 0;
490     hwaddr start_addr = section->offset_within_address_space;
491     hwaddr end_addr = start_addr + int128_get64(section->size);
492 
493     d.dirty_bitmap = NULL;
494     while (start_addr < end_addr) {
495         mem = kvm_lookup_overlapping_slot(kml, start_addr, end_addr);
496         if (mem == NULL) {
497             break;
498         }
499 
500         /* XXX bad kernel interface alert
501          * For dirty bitmap, kernel allocates array of size aligned to
502          * bits-per-long.  But for case when the kernel is 64bits and
503          * the userspace is 32bits, userspace can't align to the same
504          * bits-per-long, since sizeof(long) is different between kernel
505          * and user space.  This way, userspace will provide buffer which
506          * may be 4 bytes less than the kernel will use, resulting in
507          * userspace memory corruption (which is not detectable by valgrind
508          * too, in most cases).
509          * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
510          * a hope that sizeof(long) won't become >8 any time soon.
511          */
512         size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
513                      /*HOST_LONG_BITS*/ 64) / 8;
514         if (!d.dirty_bitmap) {
515             d.dirty_bitmap = g_malloc(size);
516         } else if (size > allocated_size) {
517             d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
518         }
519         allocated_size = size;
520         memset(d.dirty_bitmap, 0, allocated_size);
521 
522         d.slot = mem->slot | (kml->as_id << 16);
523         if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
524             DPRINTF("ioctl failed %d\n", errno);
525             ret = -1;
526             break;
527         }
528 
529         kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
530         start_addr = mem->start_addr + mem->memory_size;
531     }
532     g_free(d.dirty_bitmap);
533 
534     return ret;
535 }
536 
537 static void kvm_coalesce_mmio_region(MemoryListener *listener,
538                                      MemoryRegionSection *secion,
539                                      hwaddr start, hwaddr size)
540 {
541     KVMState *s = kvm_state;
542 
543     if (s->coalesced_mmio) {
544         struct kvm_coalesced_mmio_zone zone;
545 
546         zone.addr = start;
547         zone.size = size;
548         zone.pad = 0;
549 
550         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
551     }
552 }
553 
554 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
555                                        MemoryRegionSection *secion,
556                                        hwaddr start, hwaddr size)
557 {
558     KVMState *s = kvm_state;
559 
560     if (s->coalesced_mmio) {
561         struct kvm_coalesced_mmio_zone zone;
562 
563         zone.addr = start;
564         zone.size = size;
565         zone.pad = 0;
566 
567         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
568     }
569 }
570 
571 int kvm_check_extension(KVMState *s, unsigned int extension)
572 {
573     int ret;
574 
575     ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
576     if (ret < 0) {
577         ret = 0;
578     }
579 
580     return ret;
581 }
582 
583 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
584 {
585     int ret;
586 
587     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
588     if (ret < 0) {
589         /* VM wide version not implemented, use global one instead */
590         ret = kvm_check_extension(s, extension);
591     }
592 
593     return ret;
594 }
595 
/*
 * Convert an ioeventfd match value of @size bytes from target to host
 * endianness.  The value is returned unchanged when host and target agree,
 * or when @size is neither 2 nor 4.
 */
static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    if (size == 2) {
        return bswap16(val);
    }
    if (size == 4) {
        return bswap32(val);
    }
#endif
    return val;
}
615 
616 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
617                                   bool assign, uint32_t size, bool datamatch)
618 {
619     int ret;
620     struct kvm_ioeventfd iofd = {
621         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
622         .addr = addr,
623         .len = size,
624         .flags = 0,
625         .fd = fd,
626     };
627 
628     if (!kvm_enabled()) {
629         return -ENOSYS;
630     }
631 
632     if (datamatch) {
633         iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
634     }
635     if (!assign) {
636         iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
637     }
638 
639     ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
640 
641     if (ret < 0) {
642         return -errno;
643     }
644 
645     return 0;
646 }
647 
648 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
649                                  bool assign, uint32_t size, bool datamatch)
650 {
651     struct kvm_ioeventfd kick = {
652         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
653         .addr = addr,
654         .flags = KVM_IOEVENTFD_FLAG_PIO,
655         .len = size,
656         .fd = fd,
657     };
658     int r;
659     if (!kvm_enabled()) {
660         return -ENOSYS;
661     }
662     if (datamatch) {
663         kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
664     }
665     if (!assign) {
666         kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
667     }
668     r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
669     if (r < 0) {
670         return r;
671     }
672     return 0;
673 }
674 
675 
/*
 * Probe whether the host accepts more than six ioeventfds.
 *
 * Returns 1 when seven PIO ioeventfds could be registered at once, 0
 * otherwise (including builds without CONFIG_EVENTFD).  Every probe
 * registration is undone before returning.
 */
static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD)
    int probe_fds[7];
    int created = 0;
    int many;

    while (created < ARRAY_SIZE(probe_fds)) {
        probe_fds[created] = eventfd(0, EFD_CLOEXEC);
        if (probe_fds[created] < 0) {
            break;
        }
        if (kvm_set_ioeventfd_pio(probe_fds[created], 0, created,
                                  true, 2, true) < 0) {
            close(probe_fds[created]);
            break;
        }
        created++;
    }

    /* Decide whether many devices are supported or not */
    many = (created == ARRAY_SIZE(probe_fds));

    /* Undo the successful registrations in reverse order. */
    while (created-- > 0) {
        kvm_set_ioeventfd_pio(probe_fds[created], 0, created, false, 2, true);
        close(probe_fds[created]);
    }

    return many;
#else
    return 0;
#endif
}
712 
713 static const KVMCapabilityInfo *
714 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
715 {
716     while (list->name) {
717         if (!kvm_check_extension(s, list->value)) {
718             return list;
719         }
720         list++;
721     }
722     return NULL;
723 }
724 
725 static void kvm_set_phys_mem(KVMMemoryListener *kml,
726                              MemoryRegionSection *section, bool add)
727 {
728     KVMSlot *mem;
729     int err;
730     MemoryRegion *mr = section->mr;
731     bool writeable = !mr->readonly && !mr->rom_device;
732     hwaddr start_addr, size;
733     void *ram;
734 
735     if (!memory_region_is_ram(mr)) {
736         if (writeable || !kvm_readonly_mem_allowed) {
737             return;
738         } else if (!mr->romd_mode) {
739             /* If the memory device is not in romd_mode, then we actually want
740              * to remove the kvm memory slot so all accesses will trap. */
741             add = false;
742         }
743     }
744 
745     size = kvm_align_section(section, &start_addr);
746     if (!size) {
747         return;
748     }
749 
750     ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
751           (section->offset_within_address_space - start_addr);
752 
753     mem = kvm_lookup_matching_slot(kml, start_addr, size);
754     if (!add) {
755         if (!mem) {
756             g_assert(!memory_region_is_ram(mr) && !writeable && !mr->romd_mode);
757             return;
758         }
759         if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
760             kvm_physical_sync_dirty_bitmap(kml, section);
761         }
762 
763         /* unregister the slot */
764         mem->memory_size = 0;
765         err = kvm_set_user_memory_region(kml, mem);
766         if (err) {
767             fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
768                     __func__, strerror(-err));
769             abort();
770         }
771         return;
772     }
773 
774     if (mem) {
775         /* update the slot */
776         kvm_slot_update_flags(kml, mem, mr);
777         return;
778     }
779 
780     /* register the new slot */
781     mem = kvm_alloc_slot(kml);
782     mem->memory_size = size;
783     mem->start_addr = start_addr;
784     mem->ram = ram;
785     mem->flags = kvm_mem_flags(mr);
786 
787     err = kvm_set_user_memory_region(kml, mem);
788     if (err) {
789         fprintf(stderr, "%s: error registering slot: %s\n", __func__,
790                 strerror(-err));
791         abort();
792     }
793 }
794 
795 static void kvm_region_add(MemoryListener *listener,
796                            MemoryRegionSection *section)
797 {
798     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
799 
800     memory_region_ref(section->mr);
801     kvm_set_phys_mem(kml, section, true);
802 }
803 
804 static void kvm_region_del(MemoryListener *listener,
805                            MemoryRegionSection *section)
806 {
807     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
808 
809     kvm_set_phys_mem(kml, section, false);
810     memory_region_unref(section->mr);
811 }
812 
813 static void kvm_log_sync(MemoryListener *listener,
814                          MemoryRegionSection *section)
815 {
816     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
817     int r;
818 
819     r = kvm_physical_sync_dirty_bitmap(kml, section);
820     if (r < 0) {
821         abort();
822     }
823 }
824 
825 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
826                                   MemoryRegionSection *section,
827                                   bool match_data, uint64_t data,
828                                   EventNotifier *e)
829 {
830     int fd = event_notifier_get_fd(e);
831     int r;
832 
833     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
834                                data, true, int128_get64(section->size),
835                                match_data);
836     if (r < 0) {
837         fprintf(stderr, "%s: error adding ioeventfd: %s\n",
838                 __func__, strerror(-r));
839         abort();
840     }
841 }
842 
843 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
844                                   MemoryRegionSection *section,
845                                   bool match_data, uint64_t data,
846                                   EventNotifier *e)
847 {
848     int fd = event_notifier_get_fd(e);
849     int r;
850 
851     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
852                                data, false, int128_get64(section->size),
853                                match_data);
854     if (r < 0) {
855         abort();
856     }
857 }
858 
859 static void kvm_io_ioeventfd_add(MemoryListener *listener,
860                                  MemoryRegionSection *section,
861                                  bool match_data, uint64_t data,
862                                  EventNotifier *e)
863 {
864     int fd = event_notifier_get_fd(e);
865     int r;
866 
867     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
868                               data, true, int128_get64(section->size),
869                               match_data);
870     if (r < 0) {
871         fprintf(stderr, "%s: error adding ioeventfd: %s\n",
872                 __func__, strerror(-r));
873         abort();
874     }
875 }
876 
877 static void kvm_io_ioeventfd_del(MemoryListener *listener,
878                                  MemoryRegionSection *section,
879                                  bool match_data, uint64_t data,
880                                  EventNotifier *e)
881 
882 {
883     int fd = event_notifier_get_fd(e);
884     int r;
885 
886     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
887                               data, false, int128_get64(section->size),
888                               match_data);
889     if (r < 0) {
890         abort();
891     }
892 }
893 
894 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
895                                   AddressSpace *as, int as_id)
896 {
897     int i;
898 
899     kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
900     kml->as_id = as_id;
901 
902     for (i = 0; i < s->nr_slots; i++) {
903         kml->slots[i].slot = i;
904     }
905 
906     kml->listener.region_add = kvm_region_add;
907     kml->listener.region_del = kvm_region_del;
908     kml->listener.log_start = kvm_log_start;
909     kml->listener.log_stop = kvm_log_stop;
910     kml->listener.log_sync = kvm_log_sync;
911     kml->listener.priority = 10;
912 
913     memory_listener_register(&kml->listener, as);
914 }
915 
916 static MemoryListener kvm_io_listener = {
917     .eventfd_add = kvm_io_ioeventfd_add,
918     .eventfd_del = kvm_io_ioeventfd_del,
919     .priority = 10,
920 };
921 
922 int kvm_set_irq(KVMState *s, int irq, int level)
923 {
924     struct kvm_irq_level event;
925     int ret;
926 
927     assert(kvm_async_interrupts_enabled());
928 
929     event.level = level;
930     event.irq = irq;
931     ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
932     if (ret < 0) {
933         perror("kvm_set_irq");
934         abort();
935     }
936 
937     return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
938 }
939 
940 #ifdef KVM_CAP_IRQ_ROUTING
/* One routing entry kept in a KVMState.msi_hashtab bucket;
 * kvm_flush_dynamic_msi_routes() releases kroute.gsi and frees the node. */
941 typedef struct KVMMSIRoute {
942     struct kvm_irq_routing_entry kroute;
943     QTAILQ_ENTRY(KVMMSIRoute) entry;
944 } KVMMSIRoute;
945 
946 static void set_gsi(KVMState *s, unsigned int gsi)
947 {
948     set_bit(gsi, s->used_gsi_bitmap);
949 }
950 
951 static void clear_gsi(KVMState *s, unsigned int gsi)
952 {
953     clear_bit(gsi, s->used_gsi_bitmap);
954 }
955 
956 void kvm_init_irq_routing(KVMState *s)
957 {
958     int gsi_count, i;
959 
960     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
961     if (gsi_count > 0) {
962         /* Round up so we can search ints using ffs */
963         s->used_gsi_bitmap = bitmap_new(gsi_count);
964         s->gsi_count = gsi_count;
965     }
966 
967     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
968     s->nr_allocated_irq_routes = 0;
969 
970     if (!kvm_direct_msi_allowed) {
971         for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
972             QTAILQ_INIT(&s->msi_hashtab[i]);
973         }
974     }
975 
976     kvm_arch_init_irq_routing(s);
977 }
978 
979 void kvm_irqchip_commit_routes(KVMState *s)
980 {
981     int ret;
982 
983     if (kvm_gsi_direct_mapping()) {
984         return;
985     }
986 
987     if (!kvm_gsi_routing_enabled()) {
988         return;
989     }
990 
991     s->irq_routes->flags = 0;
992     trace_kvm_irqchip_commit_routes();
993     ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
994     assert(ret == 0);
995 }
996 
997 static void kvm_add_routing_entry(KVMState *s,
998                                   struct kvm_irq_routing_entry *entry)
999 {
1000     struct kvm_irq_routing_entry *new;
1001     int n, size;
1002 
1003     if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1004         n = s->nr_allocated_irq_routes * 2;
1005         if (n < 64) {
1006             n = 64;
1007         }
1008         size = sizeof(struct kvm_irq_routing);
1009         size += n * sizeof(*new);
1010         s->irq_routes = g_realloc(s->irq_routes, size);
1011         s->nr_allocated_irq_routes = n;
1012     }
1013     n = s->irq_routes->nr++;
1014     new = &s->irq_routes->entries[n];
1015 
1016     *new = *entry;
1017 
1018     set_gsi(s, entry->gsi);
1019 }
1020 
1021 static int kvm_update_routing_entry(KVMState *s,
1022                                     struct kvm_irq_routing_entry *new_entry)
1023 {
1024     struct kvm_irq_routing_entry *entry;
1025     int n;
1026 
1027     for (n = 0; n < s->irq_routes->nr; n++) {
1028         entry = &s->irq_routes->entries[n];
1029         if (entry->gsi != new_entry->gsi) {
1030             continue;
1031         }
1032 
1033         if(!memcmp(entry, new_entry, sizeof *entry)) {
1034             return 0;
1035         }
1036 
1037         *entry = *new_entry;
1038 
1039         return 0;
1040     }
1041 
1042     return -ESRCH;
1043 }
1044 
1045 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1046 {
1047     struct kvm_irq_routing_entry e = {};
1048 
1049     assert(pin < s->gsi_count);
1050 
1051     e.gsi = irq;
1052     e.type = KVM_IRQ_ROUTING_IRQCHIP;
1053     e.flags = 0;
1054     e.u.irqchip.irqchip = irqchip;
1055     e.u.irqchip.pin = pin;
1056     kvm_add_routing_entry(s, &e);
1057 }
1058 
1059 void kvm_irqchip_release_virq(KVMState *s, int virq)
1060 {
1061     struct kvm_irq_routing_entry *e;
1062     int i;
1063 
1064     if (kvm_gsi_direct_mapping()) {
1065         return;
1066     }
1067 
1068     for (i = 0; i < s->irq_routes->nr; i++) {
1069         e = &s->irq_routes->entries[i];
1070         if (e->gsi == virq) {
1071             s->irq_routes->nr--;
1072             *e = s->irq_routes->entries[s->irq_routes->nr];
1073         }
1074     }
1075     clear_gsi(s, virq);
1076     kvm_arch_release_virq_post(virq);
1077     trace_kvm_irqchip_release_virq(virq);
1078 }
1079 
/*
 * Hash an MSI data word into a bucket index by keeping its low 8 bits.
 *
 * This is optimized for the IA32 MSI layout.  However, no other arch
 * shall repeat the mistake of not providing a direct MSI injection API.
 */
static unsigned int kvm_hash_msi(uint32_t data)
{
    const uint32_t low_byte_mask = 0xff;

    return data & low_byte_mask;
}
1086 
1087 static void kvm_flush_dynamic_msi_routes(KVMState *s)
1088 {
1089     KVMMSIRoute *route, *next;
1090     unsigned int hash;
1091 
1092     for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1093         QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1094             kvm_irqchip_release_virq(s, route->kroute.gsi);
1095             QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1096             g_free(route);
1097         }
1098     }
1099 }
1100 
1101 static int kvm_irqchip_get_virq(KVMState *s)
1102 {
1103     int next_virq;
1104 
1105     /*
1106      * PIC and IOAPIC share the first 16 GSI numbers, thus the available
1107      * GSI numbers are more than the number of IRQ route. Allocating a GSI
1108      * number can succeed even though a new route entry cannot be added.
1109      * When this happens, flush dynamic MSI entries to free IRQ route entries.
1110      */
1111     if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1112         kvm_flush_dynamic_msi_routes(s);
1113     }
1114 
1115     /* Return the lowest unused GSI in the bitmap */
1116     next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
1117     if (next_virq >= s->gsi_count) {
1118         return -ENOSPC;
1119     } else {
1120         return next_virq;
1121     }
1122 }
1123 
1124 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1125 {
1126     unsigned int hash = kvm_hash_msi(msg.data);
1127     KVMMSIRoute *route;
1128 
1129     QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1130         if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1131             route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1132             route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1133             return route;
1134         }
1135     }
1136     return NULL;
1137 }
1138 
1139 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1140 {
1141     struct kvm_msi msi;
1142     KVMMSIRoute *route;
1143 
1144     if (kvm_direct_msi_allowed) {
1145         msi.address_lo = (uint32_t)msg.address;
1146         msi.address_hi = msg.address >> 32;
1147         msi.data = le32_to_cpu(msg.data);
1148         msi.flags = 0;
1149         memset(msi.pad, 0, sizeof(msi.pad));
1150 
1151         return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1152     }
1153 
1154     route = kvm_lookup_msi_route(s, msg);
1155     if (!route) {
1156         int virq;
1157 
1158         virq = kvm_irqchip_get_virq(s);
1159         if (virq < 0) {
1160             return virq;
1161         }
1162 
1163         route = g_malloc0(sizeof(KVMMSIRoute));
1164         route->kroute.gsi = virq;
1165         route->kroute.type = KVM_IRQ_ROUTING_MSI;
1166         route->kroute.flags = 0;
1167         route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1168         route->kroute.u.msi.address_hi = msg.address >> 32;
1169         route->kroute.u.msi.data = le32_to_cpu(msg.data);
1170 
1171         kvm_add_routing_entry(s, &route->kroute);
1172         kvm_irqchip_commit_routes(s);
1173 
1174         QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1175                            entry);
1176     }
1177 
1178     assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1179 
1180     return kvm_set_irq(s, route->kroute.gsi, 1);
1181 }
1182 
1183 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1184 {
1185     struct kvm_irq_routing_entry kroute = {};
1186     int virq;
1187     MSIMessage msg = {0, 0};
1188 
1189     if (pci_available && dev) {
1190         msg = pci_get_msi_message(dev, vector);
1191     }
1192 
1193     if (kvm_gsi_direct_mapping()) {
1194         return kvm_arch_msi_data_to_gsi(msg.data);
1195     }
1196 
1197     if (!kvm_gsi_routing_enabled()) {
1198         return -ENOSYS;
1199     }
1200 
1201     virq = kvm_irqchip_get_virq(s);
1202     if (virq < 0) {
1203         return virq;
1204     }
1205 
1206     kroute.gsi = virq;
1207     kroute.type = KVM_IRQ_ROUTING_MSI;
1208     kroute.flags = 0;
1209     kroute.u.msi.address_lo = (uint32_t)msg.address;
1210     kroute.u.msi.address_hi = msg.address >> 32;
1211     kroute.u.msi.data = le32_to_cpu(msg.data);
1212     if (pci_available && kvm_msi_devid_required()) {
1213         kroute.flags = KVM_MSI_VALID_DEVID;
1214         kroute.u.msi.devid = pci_requester_id(dev);
1215     }
1216     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1217         kvm_irqchip_release_virq(s, virq);
1218         return -EINVAL;
1219     }
1220 
1221     trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
1222                                     vector, virq);
1223 
1224     kvm_add_routing_entry(s, &kroute);
1225     kvm_arch_add_msi_route_post(&kroute, vector, dev);
1226     kvm_irqchip_commit_routes(s);
1227 
1228     return virq;
1229 }
1230 
1231 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
1232                                  PCIDevice *dev)
1233 {
1234     struct kvm_irq_routing_entry kroute = {};
1235 
1236     if (kvm_gsi_direct_mapping()) {
1237         return 0;
1238     }
1239 
1240     if (!kvm_irqchip_in_kernel()) {
1241         return -ENOSYS;
1242     }
1243 
1244     kroute.gsi = virq;
1245     kroute.type = KVM_IRQ_ROUTING_MSI;
1246     kroute.flags = 0;
1247     kroute.u.msi.address_lo = (uint32_t)msg.address;
1248     kroute.u.msi.address_hi = msg.address >> 32;
1249     kroute.u.msi.data = le32_to_cpu(msg.data);
1250     if (pci_available && kvm_msi_devid_required()) {
1251         kroute.flags = KVM_MSI_VALID_DEVID;
1252         kroute.u.msi.devid = pci_requester_id(dev);
1253     }
1254     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1255         return -EINVAL;
1256     }
1257 
1258     trace_kvm_irqchip_update_msi_route(virq);
1259 
1260     return kvm_update_routing_entry(s, &kroute);
1261 }
1262 
1263 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
1264                                     bool assign)
1265 {
1266     struct kvm_irqfd irqfd = {
1267         .fd = fd,
1268         .gsi = virq,
1269         .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
1270     };
1271 
1272     if (rfd != -1) {
1273         irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
1274         irqfd.resamplefd = rfd;
1275     }
1276 
1277     if (!kvm_irqfds_enabled()) {
1278         return -ENOSYS;
1279     }
1280 
1281     return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
1282 }
1283 
1284 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1285 {
1286     struct kvm_irq_routing_entry kroute = {};
1287     int virq;
1288 
1289     if (!kvm_gsi_routing_enabled()) {
1290         return -ENOSYS;
1291     }
1292 
1293     virq = kvm_irqchip_get_virq(s);
1294     if (virq < 0) {
1295         return virq;
1296     }
1297 
1298     kroute.gsi = virq;
1299     kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
1300     kroute.flags = 0;
1301     kroute.u.adapter.summary_addr = adapter->summary_addr;
1302     kroute.u.adapter.ind_addr = adapter->ind_addr;
1303     kroute.u.adapter.summary_offset = adapter->summary_offset;
1304     kroute.u.adapter.ind_offset = adapter->ind_offset;
1305     kroute.u.adapter.adapter_id = adapter->adapter_id;
1306 
1307     kvm_add_routing_entry(s, &kroute);
1308 
1309     return virq;
1310 }
1311 
1312 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1313 {
1314     struct kvm_irq_routing_entry kroute = {};
1315     int virq;
1316 
1317     if (!kvm_gsi_routing_enabled()) {
1318         return -ENOSYS;
1319     }
1320     if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
1321         return -ENOSYS;
1322     }
1323     virq = kvm_irqchip_get_virq(s);
1324     if (virq < 0) {
1325         return virq;
1326     }
1327 
1328     kroute.gsi = virq;
1329     kroute.type = KVM_IRQ_ROUTING_HV_SINT;
1330     kroute.flags = 0;
1331     kroute.u.hv_sint.vcpu = vcpu;
1332     kroute.u.hv_sint.sint = sint;
1333 
1334     kvm_add_routing_entry(s, &kroute);
1335     kvm_irqchip_commit_routes(s);
1336 
1337     return virq;
1338 }
1339 
1340 #else /* !KVM_CAP_IRQ_ROUTING */
1341 
1342 void kvm_init_irq_routing(KVMState *s)
1343 {
1344 }
1345 
1346 void kvm_irqchip_release_virq(KVMState *s, int virq)
1347 {
1348 }
1349 
1350 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1351 {
1352     abort();
1353 }
1354 
1355 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1356 {
1357     return -ENOSYS;
1358 }
1359 
1360 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1361 {
1362     return -ENOSYS;
1363 }
1364 
1365 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1366 {
1367     return -ENOSYS;
1368 }
1369 
1370 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
1371 {
1372     abort();
1373 }
1374 
1375 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1376 {
1377     return -ENOSYS;
1378 }
1379 #endif /* !KVM_CAP_IRQ_ROUTING */
1380 
1381 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1382                                        EventNotifier *rn, int virq)
1383 {
1384     return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
1385            rn ? event_notifier_get_fd(rn) : -1, virq, true);
1386 }
1387 
1388 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1389                                           int virq)
1390 {
1391     return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
1392            false);
1393 }
1394 
1395 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
1396                                    EventNotifier *rn, qemu_irq irq)
1397 {
1398     gpointer key, gsi;
1399     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1400 
1401     if (!found) {
1402         return -ENXIO;
1403     }
1404     return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
1405 }
1406 
1407 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
1408                                       qemu_irq irq)
1409 {
1410     gpointer key, gsi;
1411     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1412 
1413     if (!found) {
1414         return -ENXIO;
1415     }
1416     return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
1417 }
1418 
1419 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
1420 {
1421     g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
1422 }
1423 
1424 static void kvm_irqchip_create(MachineState *machine, KVMState *s)
1425 {
1426     int ret;
1427 
1428     if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
1429         ;
1430     } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
1431         ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
1432         if (ret < 0) {
1433             fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
1434             exit(1);
1435         }
1436     } else {
1437         return;
1438     }
1439 
1440     /* First probe and see if there's a arch-specific hook to create the
1441      * in-kernel irqchip for us */
1442     ret = kvm_arch_irqchip_create(machine, s);
1443     if (ret == 0) {
1444         if (machine_kernel_irqchip_split(machine)) {
1445             perror("Split IRQ chip mode not supported.");
1446             exit(1);
1447         } else {
1448             ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
1449         }
1450     }
1451     if (ret < 0) {
1452         fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
1453         exit(1);
1454     }
1455 
1456     kvm_kernel_irqchip = true;
1457     /* If we have an in-kernel IRQ chip then we must have asynchronous
1458      * interrupt delivery (though the reverse is not necessarily true)
1459      */
1460     kvm_async_interrupts_allowed = true;
1461     kvm_halt_in_kernel_allowed = true;
1462 
1463     kvm_init_irq_routing(s);
1464 
1465     s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
1466 }
1467 
1468 /* Find number of supported CPUs using the recommended
1469  * procedure from the kernel API documentation to cope with
1470  * older kernels that may be missing capabilities.
1471  */
1472 static int kvm_recommended_vcpus(KVMState *s)
1473 {
1474     int ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
1475     return (ret) ? ret : 4;
1476 }
1477 
1478 static int kvm_max_vcpus(KVMState *s)
1479 {
1480     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
1481     return (ret) ? ret : kvm_recommended_vcpus(s);
1482 }
1483 
1484 static int kvm_max_vcpu_id(KVMState *s)
1485 {
1486     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
1487     return (ret) ? ret : kvm_max_vcpus(s);
1488 }
1489 
1490 bool kvm_vcpu_id_is_valid(int vcpu_id)
1491 {
1492     KVMState *s = KVM_STATE(current_machine->accelerator);
1493     return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
1494 }
1495 
1496 static int kvm_init(MachineState *ms)
1497 {
1498     MachineClass *mc = MACHINE_GET_CLASS(ms);
1499     static const char upgrade_note[] =
1500         "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1501         "(see http://sourceforge.net/projects/kvm).\n";
1502     struct {
1503         const char *name;
1504         int num;
1505     } num_cpus[] = {
1506         { "SMP",          smp_cpus },
1507         { "hotpluggable", max_cpus },
1508         { NULL, }
1509     }, *nc = num_cpus;
1510     int soft_vcpus_limit, hard_vcpus_limit;
1511     KVMState *s;
1512     const KVMCapabilityInfo *missing_cap;
1513     int ret;
1514     int type = 0;
1515     const char *kvm_type;
1516 
1517     s = KVM_STATE(ms->accelerator);
1518 
1519     /*
1520      * On systems where the kernel can support different base page
1521      * sizes, host page size may be different from TARGET_PAGE_SIZE,
1522      * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
1523      * page size for the system though.
1524      */
1525     assert(TARGET_PAGE_SIZE <= getpagesize());
1526 
1527     s->sigmask_len = 8;
1528 
1529 #ifdef KVM_CAP_SET_GUEST_DEBUG
1530     QTAILQ_INIT(&s->kvm_sw_breakpoints);
1531 #endif
1532     QLIST_INIT(&s->kvm_parked_vcpus);
1533     s->vmfd = -1;
1534     s->fd = qemu_open("/dev/kvm", O_RDWR);
1535     if (s->fd == -1) {
1536         fprintf(stderr, "Could not access KVM kernel module: %m\n");
1537         ret = -errno;
1538         goto err;
1539     }
1540 
1541     ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
1542     if (ret < KVM_API_VERSION) {
1543         if (ret >= 0) {
1544             ret = -EINVAL;
1545         }
1546         fprintf(stderr, "kvm version too old\n");
1547         goto err;
1548     }
1549 
1550     if (ret > KVM_API_VERSION) {
1551         ret = -EINVAL;
1552         fprintf(stderr, "kvm version not supported\n");
1553         goto err;
1554     }
1555 
1556     kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
1557     s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
1558 
1559     /* If unspecified, use the default value */
1560     if (!s->nr_slots) {
1561         s->nr_slots = 32;
1562     }
1563 
1564     /* check the vcpu limits */
1565     soft_vcpus_limit = kvm_recommended_vcpus(s);
1566     hard_vcpus_limit = kvm_max_vcpus(s);
1567 
1568     while (nc->name) {
1569         if (nc->num > soft_vcpus_limit) {
1570             fprintf(stderr,
1571                     "Warning: Number of %s cpus requested (%d) exceeds "
1572                     "the recommended cpus supported by KVM (%d)\n",
1573                     nc->name, nc->num, soft_vcpus_limit);
1574 
1575             if (nc->num > hard_vcpus_limit) {
1576                 fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
1577                         "the maximum cpus supported by KVM (%d)\n",
1578                         nc->name, nc->num, hard_vcpus_limit);
1579                 exit(1);
1580             }
1581         }
1582         nc++;
1583     }
1584 
1585     kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
1586     if (mc->kvm_type) {
1587         type = mc->kvm_type(kvm_type);
1588     } else if (kvm_type) {
1589         ret = -EINVAL;
1590         fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
1591         goto err;
1592     }
1593 
1594     do {
1595         ret = kvm_ioctl(s, KVM_CREATE_VM, type);
1596     } while (ret == -EINTR);
1597 
1598     if (ret < 0) {
1599         fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
1600                 strerror(-ret));
1601 
1602 #ifdef TARGET_S390X
1603         if (ret == -EINVAL) {
1604             fprintf(stderr,
1605                     "Host kernel setup problem detected. Please verify:\n");
1606             fprintf(stderr, "- for kernels supporting the switch_amode or"
1607                     " user_mode parameters, whether\n");
1608             fprintf(stderr,
1609                     "  user space is running in primary address space\n");
1610             fprintf(stderr,
1611                     "- for kernels supporting the vm.allocate_pgste sysctl, "
1612                     "whether it is enabled\n");
1613         }
1614 #endif
1615         goto err;
1616     }
1617 
1618     s->vmfd = ret;
1619     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
1620     if (!missing_cap) {
1621         missing_cap =
1622             kvm_check_extension_list(s, kvm_arch_required_capabilities);
1623     }
1624     if (missing_cap) {
1625         ret = -EINVAL;
1626         fprintf(stderr, "kvm does not support %s\n%s",
1627                 missing_cap->name, upgrade_note);
1628         goto err;
1629     }
1630 
1631     s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
1632 
1633 #ifdef KVM_CAP_VCPU_EVENTS
1634     s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
1635 #endif
1636 
1637     s->robust_singlestep =
1638         kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
1639 
1640 #ifdef KVM_CAP_DEBUGREGS
1641     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
1642 #endif
1643 
1644 #ifdef KVM_CAP_IRQ_ROUTING
1645     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1646 #endif
1647 
1648     s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
1649 
1650     s->irq_set_ioctl = KVM_IRQ_LINE;
1651     if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1652         s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1653     }
1654 
1655 #ifdef KVM_CAP_READONLY_MEM
1656     kvm_readonly_mem_allowed =
1657         (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
1658 #endif
1659 
1660     kvm_eventfds_allowed =
1661         (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
1662 
1663     kvm_irqfds_allowed =
1664         (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
1665 
1666     kvm_resamplefds_allowed =
1667         (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
1668 
1669     kvm_vm_attributes_allowed =
1670         (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
1671 
1672     kvm_ioeventfd_any_length_allowed =
1673         (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
1674 
1675     kvm_state = s;
1676 
1677     ret = kvm_arch_init(ms, s);
1678     if (ret < 0) {
1679         goto err;
1680     }
1681 
1682     if (machine_kernel_irqchip_allowed(ms)) {
1683         kvm_irqchip_create(ms, s);
1684     }
1685 
1686     if (kvm_eventfds_allowed) {
1687         s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
1688         s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
1689     }
1690     s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region;
1691     s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region;
1692 
1693     kvm_memory_listener_register(s, &s->memory_listener,
1694                                  &address_space_memory, 0);
1695     memory_listener_register(&kvm_io_listener,
1696                              &address_space_io);
1697 
1698     s->many_ioeventfds = kvm_check_many_ioeventfds();
1699 
1700     return 0;
1701 
1702 err:
1703     assert(ret < 0);
1704     if (s->vmfd >= 0) {
1705         close(s->vmfd);
1706     }
1707     if (s->fd != -1) {
1708         close(s->fd);
1709     }
1710     g_free(s->memory_listener.slots);
1711 
1712     return ret;
1713 }
1714 
1715 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
1716 {
1717     s->sigmask_len = sigmask_len;
1718 }
1719 
1720 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
1721                           int size, uint32_t count)
1722 {
1723     int i;
1724     uint8_t *ptr = data;
1725 
1726     for (i = 0; i < count; i++) {
1727         address_space_rw(&address_space_io, port, attrs,
1728                          ptr, size,
1729                          direction == KVM_EXIT_IO_OUT);
1730         ptr += size;
1731     }
1732 }
1733 
1734 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
1735 {
1736     fprintf(stderr, "KVM internal error. Suberror: %d\n",
1737             run->internal.suberror);
1738 
1739     if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
1740         int i;
1741 
1742         for (i = 0; i < run->internal.ndata; ++i) {
1743             fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
1744                     i, (uint64_t)run->internal.data[i]);
1745         }
1746     }
1747     if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
1748         fprintf(stderr, "emulation failure\n");
1749         if (!kvm_arch_stop_on_emulation_error(cpu)) {
1750             cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
1751             return EXCP_INTERRUPT;
1752         }
1753     }
1754     /* FIXME: Should trigger a qmp message to let management know
1755      * something went wrong.
1756      */
1757     return -1;
1758 }
1759 
1760 void kvm_flush_coalesced_mmio_buffer(void)
1761 {
1762     KVMState *s = kvm_state;
1763 
1764     if (s->coalesced_flush_in_progress) {
1765         return;
1766     }
1767 
1768     s->coalesced_flush_in_progress = true;
1769 
1770     if (s->coalesced_mmio_ring) {
1771         struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
1772         while (ring->first != ring->last) {
1773             struct kvm_coalesced_mmio *ent;
1774 
1775             ent = &ring->coalesced_mmio[ring->first];
1776 
1777             cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
1778             smp_wmb();
1779             ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
1780         }
1781     }
1782 
1783     s->coalesced_flush_in_progress = false;
1784 }
1785 
1786 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
1787 {
1788     if (!cpu->vcpu_dirty) {
1789         kvm_arch_get_registers(cpu);
1790         cpu->vcpu_dirty = true;
1791     }
1792 }
1793 
1794 void kvm_cpu_synchronize_state(CPUState *cpu)
1795 {
1796     if (!cpu->vcpu_dirty) {
1797         run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
1798     }
1799 }
1800 
1801 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
1802 {
1803     kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
1804     cpu->vcpu_dirty = false;
1805 }
1806 
1807 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
1808 {
1809     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
1810 }
1811 
1812 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
1813 {
1814     kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
1815     cpu->vcpu_dirty = false;
1816 }
1817 
1818 void kvm_cpu_synchronize_post_init(CPUState *cpu)
1819 {
1820     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
1821 }
1822 
1823 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
1824 {
1825     cpu->vcpu_dirty = true;
1826 }
1827 
1828 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
1829 {
1830     run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
1831 }
1832 
#ifdef KVM_HAVE_MCE_INJECTION
/*
 * Per-thread record of a pending SIGBUS.  kvm_cpu_exec() checks
 * have_sigbus_pending after KVM_RUN and, when it is set, passes the code
 * and address to kvm_arch_on_sigbus_vcpu() and clears the flag.
 * NOTE(review): the code that fills these in is outside this part of the
 * file.
 */
static __thread bool have_sigbus_pending;
static __thread int pending_sigbus_code;
static __thread void *pending_sigbus_addr;
#endif
1838 
1839 static void kvm_cpu_kick(CPUState *cpu)
1840 {
1841     atomic_set(&cpu->kvm_run->immediate_exit, 1);
1842 }
1843 
1844 static void kvm_cpu_kick_self(void)
1845 {
1846     if (kvm_immediate_exit) {
1847         kvm_cpu_kick(current_cpu);
1848     } else {
1849         qemu_cpu_kick_self();
1850     }
1851 }
1852 
1853 static void kvm_eat_signals(CPUState *cpu)
1854 {
1855     struct timespec ts = { 0, 0 };
1856     siginfo_t siginfo;
1857     sigset_t waitset;
1858     sigset_t chkset;
1859     int r;
1860 
1861     if (kvm_immediate_exit) {
1862         atomic_set(&cpu->kvm_run->immediate_exit, 0);
1863         /* Write kvm_run->immediate_exit before the cpu->exit_request
1864          * write in kvm_cpu_exec.
1865          */
1866         smp_wmb();
1867         return;
1868     }
1869 
1870     sigemptyset(&waitset);
1871     sigaddset(&waitset, SIG_IPI);
1872 
1873     do {
1874         r = sigtimedwait(&waitset, &siginfo, &ts);
1875         if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
1876             perror("sigtimedwait");
1877             exit(1);
1878         }
1879 
1880         r = sigpending(&chkset);
1881         if (r == -1) {
1882             perror("sigpending");
1883             exit(1);
1884         }
1885     } while (sigismember(&chkset, SIG_IPI));
1886 }
1887 
1888 int kvm_cpu_exec(CPUState *cpu)
1889 {
1890     struct kvm_run *run = cpu->kvm_run;
1891     int ret, run_ret;
1892 
1893     DPRINTF("kvm_cpu_exec()\n");
1894 
1895     if (kvm_arch_process_async_events(cpu)) {
1896         atomic_set(&cpu->exit_request, 0);
1897         return EXCP_HLT;
1898     }
1899 
1900     qemu_mutex_unlock_iothread();
1901     cpu_exec_start(cpu);
1902 
1903     do {
1904         MemTxAttrs attrs;
1905 
1906         if (cpu->vcpu_dirty) {
1907             kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
1908             cpu->vcpu_dirty = false;
1909         }
1910 
1911         kvm_arch_pre_run(cpu, run);
1912         if (atomic_read(&cpu->exit_request)) {
1913             DPRINTF("interrupt exit requested\n");
1914             /*
1915              * KVM requires us to reenter the kernel after IO exits to complete
1916              * instruction emulation. This self-signal will ensure that we
1917              * leave ASAP again.
1918              */
1919             kvm_cpu_kick_self();
1920         }
1921 
1922         /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
1923          * Matching barrier in kvm_eat_signals.
1924          */
1925         smp_rmb();
1926 
1927         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
1928 
1929         attrs = kvm_arch_post_run(cpu, run);
1930 
1931 #ifdef KVM_HAVE_MCE_INJECTION
1932         if (unlikely(have_sigbus_pending)) {
1933             qemu_mutex_lock_iothread();
1934             kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
1935                                     pending_sigbus_addr);
1936             have_sigbus_pending = false;
1937             qemu_mutex_unlock_iothread();
1938         }
1939 #endif
1940 
1941         if (run_ret < 0) {
1942             if (run_ret == -EINTR || run_ret == -EAGAIN) {
1943                 DPRINTF("io window exit\n");
1944                 kvm_eat_signals(cpu);
1945                 ret = EXCP_INTERRUPT;
1946                 break;
1947             }
1948             fprintf(stderr, "error: kvm run failed %s\n",
1949                     strerror(-run_ret));
1950 #ifdef TARGET_PPC
1951             if (run_ret == -EBUSY) {
1952                 fprintf(stderr,
1953                         "This is probably because your SMT is enabled.\n"
1954                         "VCPU can only run on primary threads with all "
1955                         "secondary threads offline.\n");
1956             }
1957 #endif
1958             ret = -1;
1959             break;
1960         }
1961 
1962         trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
1963         switch (run->exit_reason) {
1964         case KVM_EXIT_IO:
1965             DPRINTF("handle_io\n");
1966             /* Called outside BQL */
1967             kvm_handle_io(run->io.port, attrs,
1968                           (uint8_t *)run + run->io.data_offset,
1969                           run->io.direction,
1970                           run->io.size,
1971                           run->io.count);
1972             ret = 0;
1973             break;
1974         case KVM_EXIT_MMIO:
1975             DPRINTF("handle_mmio\n");
1976             /* Called outside BQL */
1977             address_space_rw(&address_space_memory,
1978                              run->mmio.phys_addr, attrs,
1979                              run->mmio.data,
1980                              run->mmio.len,
1981                              run->mmio.is_write);
1982             ret = 0;
1983             break;
1984         case KVM_EXIT_IRQ_WINDOW_OPEN:
1985             DPRINTF("irq_window_open\n");
1986             ret = EXCP_INTERRUPT;
1987             break;
1988         case KVM_EXIT_SHUTDOWN:
1989             DPRINTF("shutdown\n");
1990             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1991             ret = EXCP_INTERRUPT;
1992             break;
1993         case KVM_EXIT_UNKNOWN:
1994             fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
1995                     (uint64_t)run->hw.hardware_exit_reason);
1996             ret = -1;
1997             break;
1998         case KVM_EXIT_INTERNAL_ERROR:
1999             ret = kvm_handle_internal_error(cpu, run);
2000             break;
2001         case KVM_EXIT_SYSTEM_EVENT:
2002             switch (run->system_event.type) {
2003             case KVM_SYSTEM_EVENT_SHUTDOWN:
2004                 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
2005                 ret = EXCP_INTERRUPT;
2006                 break;
2007             case KVM_SYSTEM_EVENT_RESET:
2008                 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2009                 ret = EXCP_INTERRUPT;
2010                 break;
2011             case KVM_SYSTEM_EVENT_CRASH:
2012                 kvm_cpu_synchronize_state(cpu);
2013                 qemu_mutex_lock_iothread();
2014                 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2015                 qemu_mutex_unlock_iothread();
2016                 ret = 0;
2017                 break;
2018             default:
2019                 DPRINTF("kvm_arch_handle_exit\n");
2020                 ret = kvm_arch_handle_exit(cpu, run);
2021                 break;
2022             }
2023             break;
2024         default:
2025             DPRINTF("kvm_arch_handle_exit\n");
2026             ret = kvm_arch_handle_exit(cpu, run);
2027             break;
2028         }
2029     } while (ret == 0);
2030 
2031     cpu_exec_end(cpu);
2032     qemu_mutex_lock_iothread();
2033 
2034     if (ret < 0) {
2035         cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
2036         vm_stop(RUN_STATE_INTERNAL_ERROR);
2037     }
2038 
2039     atomic_set(&cpu->exit_request, 0);
2040     return ret;
2041 }
2042 
2043 int kvm_ioctl(KVMState *s, int type, ...)
2044 {
2045     int ret;
2046     void *arg;
2047     va_list ap;
2048 
2049     va_start(ap, type);
2050     arg = va_arg(ap, void *);
2051     va_end(ap);
2052 
2053     trace_kvm_ioctl(type, arg);
2054     ret = ioctl(s->fd, type, arg);
2055     if (ret == -1) {
2056         ret = -errno;
2057     }
2058     return ret;
2059 }
2060 
2061 int kvm_vm_ioctl(KVMState *s, int type, ...)
2062 {
2063     int ret;
2064     void *arg;
2065     va_list ap;
2066 
2067     va_start(ap, type);
2068     arg = va_arg(ap, void *);
2069     va_end(ap);
2070 
2071     trace_kvm_vm_ioctl(type, arg);
2072     ret = ioctl(s->vmfd, type, arg);
2073     if (ret == -1) {
2074         ret = -errno;
2075     }
2076     return ret;
2077 }
2078 
2079 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
2080 {
2081     int ret;
2082     void *arg;
2083     va_list ap;
2084 
2085     va_start(ap, type);
2086     arg = va_arg(ap, void *);
2087     va_end(ap);
2088 
2089     trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
2090     ret = ioctl(cpu->kvm_fd, type, arg);
2091     if (ret == -1) {
2092         ret = -errno;
2093     }
2094     return ret;
2095 }
2096 
/*
 * Issue an ioctl on the caller-supplied descriptor @fd.
 *
 * Exactly one pointer-sized argument is pulled from the variadic list and
 * handed to ioctl(2) unchanged.
 *
 * Returns the ioctl() result, or -errno when ioctl() returned -1.
 */
int kvm_device_ioctl(int fd, int type, ...)
{
    va_list args;
    void *param;
    int rc;

    va_start(args, type);
    param = va_arg(args, void *);
    va_end(args);

    trace_kvm_device_ioctl(fd, type, param);

    rc = ioctl(fd, type, param);
    return rc == -1 ? -errno : rc;
}
2114 
2115 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
2116 {
2117     int ret;
2118     struct kvm_device_attr attribute = {
2119         .group = group,
2120         .attr = attr,
2121     };
2122 
2123     if (!kvm_vm_attributes_allowed) {
2124         return 0;
2125     }
2126 
2127     ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
2128     /* kvm returns 0 on success for HAS_DEVICE_ATTR */
2129     return ret ? 0 : 1;
2130 }
2131 
2132 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
2133 {
2134     struct kvm_device_attr attribute = {
2135         .group = group,
2136         .attr = attr,
2137         .flags = 0,
2138     };
2139 
2140     return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
2141 }
2142 
2143 int kvm_device_access(int fd, int group, uint64_t attr,
2144                       void *val, bool write, Error **errp)
2145 {
2146     struct kvm_device_attr kvmattr;
2147     int err;
2148 
2149     kvmattr.flags = 0;
2150     kvmattr.group = group;
2151     kvmattr.attr = attr;
2152     kvmattr.addr = (uintptr_t)val;
2153 
2154     err = kvm_device_ioctl(fd,
2155                            write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
2156                            &kvmattr);
2157     if (err < 0) {
2158         error_setg_errno(errp, -err,
2159                          "KVM_%s_DEVICE_ATTR failed: Group %d "
2160                          "attr 0x%016" PRIx64,
2161                          write ? "SET" : "GET", group, attr);
2162     }
2163     return err;
2164 }
2165 
2166 /* Return 1 on success, 0 on failure */
2167 int kvm_has_sync_mmu(void)
2168 {
2169     return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2170 }
2171 
2172 int kvm_has_vcpu_events(void)
2173 {
2174     return kvm_state->vcpu_events;
2175 }
2176 
2177 int kvm_has_robust_singlestep(void)
2178 {
2179     return kvm_state->robust_singlestep;
2180 }
2181 
2182 int kvm_has_debugregs(void)
2183 {
2184     return kvm_state->debugregs;
2185 }
2186 
2187 int kvm_has_many_ioeventfds(void)
2188 {
2189     if (!kvm_enabled()) {
2190         return 0;
2191     }
2192     return kvm_state->many_ioeventfds;
2193 }
2194 
/*
 * Report whether KVM_CAP_IRQ_ROUTING is available.
 *
 * When the build headers do not define the capability this is always 0;
 * otherwise the answer comes from kvm_check_extension() on kvm_state.
 */
int kvm_has_gsi_routing(void)
{
#ifndef KVM_CAP_IRQ_ROUTING
    return 0;
#else
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
}
2203 
2204 int kvm_has_intx_set_mask(void)
2205 {
2206     return kvm_state->intx_set_mask;
2207 }
2208 
2209 bool kvm_arm_supports_user_irq(void)
2210 {
2211     return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
2212 }
2213 
2214 #ifdef KVM_CAP_SET_GUEST_DEBUG
2215 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
2216                                                  target_ulong pc)
2217 {
2218     struct kvm_sw_breakpoint *bp;
2219 
2220     QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
2221         if (bp->pc == pc) {
2222             return bp;
2223         }
2224     }
2225     return NULL;
2226 }
2227 
2228 int kvm_sw_breakpoints_active(CPUState *cpu)
2229 {
2230     return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
2231 }
2232 
2233 struct kvm_set_guest_debug_data {
2234     struct kvm_guest_debug dbg;
2235     int err;
2236 };
2237 
2238 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
2239 {
2240     struct kvm_set_guest_debug_data *dbg_data =
2241         (struct kvm_set_guest_debug_data *) data.host_ptr;
2242 
2243     dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
2244                                    &dbg_data->dbg);
2245 }
2246 
2247 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2248 {
2249     struct kvm_set_guest_debug_data data;
2250 
2251     data.dbg.control = reinject_trap;
2252 
2253     if (cpu->singlestep_enabled) {
2254         data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2255     }
2256     kvm_arch_update_guest_debug(cpu, &data.dbg);
2257 
2258     run_on_cpu(cpu, kvm_invoke_set_guest_debug,
2259                RUN_ON_CPU_HOST_PTR(&data));
2260     return data.err;
2261 }
2262 
2263 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2264                           target_ulong len, int type)
2265 {
2266     struct kvm_sw_breakpoint *bp;
2267     int err;
2268 
2269     if (type == GDB_BREAKPOINT_SW) {
2270         bp = kvm_find_sw_breakpoint(cpu, addr);
2271         if (bp) {
2272             bp->use_count++;
2273             return 0;
2274         }
2275 
2276         bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
2277         bp->pc = addr;
2278         bp->use_count = 1;
2279         err = kvm_arch_insert_sw_breakpoint(cpu, bp);
2280         if (err) {
2281             g_free(bp);
2282             return err;
2283         }
2284 
2285         QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2286     } else {
2287         err = kvm_arch_insert_hw_breakpoint(addr, len, type);
2288         if (err) {
2289             return err;
2290         }
2291     }
2292 
2293     CPU_FOREACH(cpu) {
2294         err = kvm_update_guest_debug(cpu, 0);
2295         if (err) {
2296             return err;
2297         }
2298     }
2299     return 0;
2300 }
2301 
2302 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2303                           target_ulong len, int type)
2304 {
2305     struct kvm_sw_breakpoint *bp;
2306     int err;
2307 
2308     if (type == GDB_BREAKPOINT_SW) {
2309         bp = kvm_find_sw_breakpoint(cpu, addr);
2310         if (!bp) {
2311             return -ENOENT;
2312         }
2313 
2314         if (bp->use_count > 1) {
2315             bp->use_count--;
2316             return 0;
2317         }
2318 
2319         err = kvm_arch_remove_sw_breakpoint(cpu, bp);
2320         if (err) {
2321             return err;
2322         }
2323 
2324         QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2325         g_free(bp);
2326     } else {
2327         err = kvm_arch_remove_hw_breakpoint(addr, len, type);
2328         if (err) {
2329             return err;
2330         }
2331     }
2332 
2333     CPU_FOREACH(cpu) {
2334         err = kvm_update_guest_debug(cpu, 0);
2335         if (err) {
2336             return err;
2337         }
2338     }
2339     return 0;
2340 }
2341 
2342 void kvm_remove_all_breakpoints(CPUState *cpu)
2343 {
2344     struct kvm_sw_breakpoint *bp, *next;
2345     KVMState *s = cpu->kvm_state;
2346     CPUState *tmpcpu;
2347 
2348     QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
2349         if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
2350             /* Try harder to find a CPU that currently sees the breakpoint. */
2351             CPU_FOREACH(tmpcpu) {
2352                 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
2353                     break;
2354                 }
2355             }
2356         }
2357         QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
2358         g_free(bp);
2359     }
2360     kvm_arch_remove_all_hw_breakpoints();
2361 
2362     CPU_FOREACH(cpu) {
2363         kvm_update_guest_debug(cpu, 0);
2364     }
2365 }
2366 
2367 #else /* !KVM_CAP_SET_GUEST_DEBUG */
2368 
2369 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2370 {
2371     return -EINVAL;
2372 }
2373 
2374 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2375                           target_ulong len, int type)
2376 {
2377     return -EINVAL;
2378 }
2379 
2380 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2381                           target_ulong len, int type)
2382 {
2383     return -EINVAL;
2384 }
2385 
2386 void kvm_remove_all_breakpoints(CPUState *cpu)
2387 {
2388 }
2389 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
2390 
2391 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
2392 {
2393     KVMState *s = kvm_state;
2394     struct kvm_signal_mask *sigmask;
2395     int r;
2396 
2397     sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
2398 
2399     sigmask->len = s->sigmask_len;
2400     memcpy(sigmask->sigset, sigset, sizeof(*sigset));
2401     r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
2402     g_free(sigmask);
2403 
2404     return r;
2405 }
2406 
2407 static void kvm_ipi_signal(int sig)
2408 {
2409     if (current_cpu) {
2410         assert(kvm_immediate_exit);
2411         kvm_cpu_kick(current_cpu);
2412     }
2413 }
2414 
2415 void kvm_init_cpu_signals(CPUState *cpu)
2416 {
2417     int r;
2418     sigset_t set;
2419     struct sigaction sigact;
2420 
2421     memset(&sigact, 0, sizeof(sigact));
2422     sigact.sa_handler = kvm_ipi_signal;
2423     sigaction(SIG_IPI, &sigact, NULL);
2424 
2425     pthread_sigmask(SIG_BLOCK, NULL, &set);
2426 #if defined KVM_HAVE_MCE_INJECTION
2427     sigdelset(&set, SIGBUS);
2428     pthread_sigmask(SIG_SETMASK, &set, NULL);
2429 #endif
2430     sigdelset(&set, SIG_IPI);
2431     if (kvm_immediate_exit) {
2432         r = pthread_sigmask(SIG_SETMASK, &set, NULL);
2433     } else {
2434         r = kvm_set_signal_mask(cpu, &set);
2435     }
2436     if (r) {
2437         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
2438         exit(1);
2439     }
2440 }
2441 
2442 /* Called asynchronously in VCPU thread.  */
2443 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2444 {
2445 #ifdef KVM_HAVE_MCE_INJECTION
2446     if (have_sigbus_pending) {
2447         return 1;
2448     }
2449     have_sigbus_pending = true;
2450     pending_sigbus_addr = addr;
2451     pending_sigbus_code = code;
2452     atomic_set(&cpu->exit_request, 1);
2453     return 0;
2454 #else
2455     return 1;
2456 #endif
2457 }
2458 
/*
 * Called synchronously (via signalfd) in main thread.
 *
 * With KVM_HAVE_MCE_INJECTION the event is forwarded to
 * kvm_arch_on_sigbus_vcpu() for first_cpu and 0 is returned; without it
 * the function just returns 1.
 */
int kvm_on_sigbus(int code, void *addr)
{
#ifndef KVM_HAVE_MCE_INJECTION
    return 1;
#else
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#endif
}
2474 
2475 int kvm_create_device(KVMState *s, uint64_t type, bool test)
2476 {
2477     int ret;
2478     struct kvm_create_device create_dev;
2479 
2480     create_dev.type = type;
2481     create_dev.fd = -1;
2482     create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
2483 
2484     if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
2485         return -ENOTSUP;
2486     }
2487 
2488     ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
2489     if (ret) {
2490         return ret;
2491     }
2492 
2493     return test ? 0 : create_dev.fd;
2494 }
2495 
2496 bool kvm_device_supported(int vmfd, uint64_t type)
2497 {
2498     struct kvm_create_device create_dev = {
2499         .type = type,
2500         .fd = -1,
2501         .flags = KVM_CREATE_DEVICE_TEST,
2502     };
2503 
2504     if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
2505         return false;
2506     }
2507 
2508     return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
2509 }
2510 
2511 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
2512 {
2513     struct kvm_one_reg reg;
2514     int r;
2515 
2516     reg.id = id;
2517     reg.addr = (uintptr_t) source;
2518     r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
2519     if (r) {
2520         trace_kvm_failed_reg_set(id, strerror(-r));
2521     }
2522     return r;
2523 }
2524 
2525 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
2526 {
2527     struct kvm_one_reg reg;
2528     int r;
2529 
2530     reg.id = id;
2531     reg.addr = (uintptr_t) target;
2532     r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
2533     if (r) {
2534         trace_kvm_failed_reg_get(id, strerror(-r));
2535     }
2536     return r;
2537 }
2538 
2539 static void kvm_accel_class_init(ObjectClass *oc, void *data)
2540 {
2541     AccelClass *ac = ACCEL_CLASS(oc);
2542     ac->name = "KVM";
2543     ac->init_machine = kvm_init;
2544     ac->allowed = &kvm_allowed;
2545 }
2546 
2547 static const TypeInfo kvm_accel_type = {
2548     .name = TYPE_KVM_ACCEL,
2549     .parent = TYPE_ACCEL,
2550     .class_init = kvm_accel_class_init,
2551     .instance_size = sizeof(KVMState),
2552 };
2553 
2554 static void kvm_type_init(void)
2555 {
2556     type_register_static(&kvm_accel_type);
2557 }
2558 
2559 type_init(kvm_type_init);
2560