xref: /openbmc/qemu/accel/kvm/kvm-all.c (revision 997340f3)
1 /*
2  * QEMU KVM support
3  *
4  * Copyright IBM, Corp. 2008
5  *           Red Hat, Inc. 2008
6  *
7  * Authors:
8  *  Anthony Liguori   <aliguori@us.ibm.com>
9  *  Glauber Costa     <gcosta@redhat.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2 or later.
12  * See the COPYING file in the top-level directory.
13  *
14  */
15 
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18 #include <poll.h>
19 
20 #include <linux/kvm.h>
21 
22 #include "qemu/atomic.h"
23 #include "qemu/option.h"
24 #include "qemu/config-file.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/s390x/adapter.h"
30 #include "exec/gdbstub.h"
31 #include "sysemu/kvm_int.h"
32 #include "sysemu/runstate.h"
33 #include "sysemu/cpus.h"
34 #include "qemu/bswap.h"
35 #include "exec/memory.h"
36 #include "exec/ram_addr.h"
37 #include "qemu/event_notifier.h"
38 #include "qemu/main-loop.h"
39 #include "trace.h"
40 #include "hw/irq.h"
41 #include "qapi/visitor.h"
42 #include "qapi/qapi-types-common.h"
43 #include "qapi/qapi-visit-common.h"
44 #include "sysemu/reset.h"
45 #include "qemu/guest-random.h"
46 #include "sysemu/hw_accel.h"
47 #include "kvm-cpus.h"
48 
49 #include "hw/boards.h"
50 #include "monitor/stats.h"
51 
52 /* This check must be after config-host.h is included */
53 #ifdef CONFIG_EVENTFD
54 #include <sys/eventfd.h>
55 #endif
56 
57 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
58  * need to use the real host PAGE_SIZE, as that's what KVM will use.
59  */
60 #ifdef PAGE_SIZE
61 #undef PAGE_SIZE
62 #endif
63 #define PAGE_SIZE qemu_real_host_page_size()
64 
65 #ifndef KVM_GUESTDBG_BLOCKIRQ
66 #define KVM_GUESTDBG_BLOCKIRQ 0
67 #endif
68 
69 //#define DEBUG_KVM
70 
71 #ifdef DEBUG_KVM
72 #define DPRINTF(fmt, ...) \
73     do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
74 #else
75 #define DPRINTF(fmt, ...) \
76     do { } while (0)
77 #endif
78 
79 #define KVM_MSI_HASHTAB_SIZE    256
80 
81 struct KVMParkedVcpu {
82     unsigned long vcpu_id;
83     int kvm_fd;
84     QLIST_ENTRY(KVMParkedVcpu) node;
85 };
86 
87 enum KVMDirtyRingReaperState {
88     KVM_DIRTY_RING_REAPER_NONE = 0,
89     /* The reaper is sleeping */
90     KVM_DIRTY_RING_REAPER_WAIT,
91     /* The reaper is reaping for dirty pages */
92     KVM_DIRTY_RING_REAPER_REAPING,
93 };
94 
95 /*
96  * KVM reaper instance, responsible for collecting the KVM dirty bits
97  * via the dirty ring.
98  */
99 struct KVMDirtyRingReaper {
100     /* The reaper thread */
101     QemuThread reaper_thr;
102     volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
103     volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
104 };
105 
106 struct KVMState
107 {
108     AccelState parent_obj;
109 
110     int nr_slots;
111     int fd;
112     int vmfd;
113     int coalesced_mmio;
114     int coalesced_pio;
115     struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
116     bool coalesced_flush_in_progress;
117     int vcpu_events;
118     int robust_singlestep;
119     int debugregs;
120 #ifdef KVM_CAP_SET_GUEST_DEBUG
121     QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
122 #endif
123     int max_nested_state_len;
124     int many_ioeventfds;
125     int intx_set_mask;
126     int kvm_shadow_mem;
127     bool kernel_irqchip_allowed;
128     bool kernel_irqchip_required;
129     OnOffAuto kernel_irqchip_split;
130     bool sync_mmu;
131     uint64_t manual_dirty_log_protect;
132     /* The man page (and posix) say ioctl numbers are signed int, but
133      * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
134      * unsigned, and treating them as signed here can break things */
135     unsigned irq_set_ioctl;
136     unsigned int sigmask_len;
137     GHashTable *gsimap;
138 #ifdef KVM_CAP_IRQ_ROUTING
139     struct kvm_irq_routing *irq_routes;
140     int nr_allocated_irq_routes;
141     unsigned long *used_gsi_bitmap;
142     unsigned int gsi_count;
143     QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
144 #endif
145     KVMMemoryListener memory_listener;
146     QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
147 
148     /* For "info mtree -f" to tell if an MR is registered in KVM */
149     int nr_as;
150     struct KVMAs {
151         KVMMemoryListener *ml;
152         AddressSpace *as;
153     } *as;
154     uint64_t kvm_dirty_ring_bytes;  /* Size of the per-vcpu dirty ring */
155     uint32_t kvm_dirty_ring_size;   /* Number of dirty GFNs per ring */
156     struct KVMDirtyRingReaper reaper;
157 };
158 
159 KVMState *kvm_state;
160 bool kvm_kernel_irqchip;
161 bool kvm_split_irqchip;
162 bool kvm_async_interrupts_allowed;
163 bool kvm_halt_in_kernel_allowed;
164 bool kvm_eventfds_allowed;
165 bool kvm_irqfds_allowed;
166 bool kvm_resamplefds_allowed;
167 bool kvm_msi_via_irqfd_allowed;
168 bool kvm_gsi_routing_allowed;
169 bool kvm_gsi_direct_mapping;
170 bool kvm_allowed;
171 bool kvm_readonly_mem_allowed;
172 bool kvm_vm_attributes_allowed;
173 bool kvm_direct_msi_allowed;
174 bool kvm_ioeventfd_any_length_allowed;
175 bool kvm_msi_use_devid;
176 bool kvm_has_guest_debug;
177 int kvm_sstep_flags;
178 static bool kvm_immediate_exit;
179 static hwaddr kvm_max_slot_size = ~0;
180 
181 static const KVMCapabilityInfo kvm_required_capabilites[] = {
182     KVM_CAP_INFO(USER_MEMORY),
183     KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
184     KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
185     KVM_CAP_LAST_INFO
186 };
187 
188 static NotifierList kvm_irqchip_change_notifiers =
189     NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
190 
191 struct KVMResampleFd {
192     int gsi;
193     EventNotifier *resample_event;
194     QLIST_ENTRY(KVMResampleFd) node;
195 };
196 typedef struct KVMResampleFd KVMResampleFd;
197 
198 /*
199  * Only used with split irqchip where we need to do the resample fd
200  * kick for the kernel from userspace.
201  */
202 static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
203     QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
204 
205 static QemuMutex kml_slots_lock;
206 
207 #define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
208 #define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)
209 
210 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
211 
212 static inline void kvm_resample_fd_remove(int gsi)
213 {
214     KVMResampleFd *rfd;
215 
216     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
217         if (rfd->gsi == gsi) {
218             QLIST_REMOVE(rfd, node);
219             g_free(rfd);
220             break;
221         }
222     }
223 }
224 
225 static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
226 {
227     KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
228 
229     rfd->gsi = gsi;
230     rfd->resample_event = event;
231 
232     QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
233 }
234 
235 void kvm_resample_fd_notify(int gsi)
236 {
237     KVMResampleFd *rfd;
238 
239     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
240         if (rfd->gsi == gsi) {
241             event_notifier_set(rfd->resample_event);
242             trace_kvm_resample_fd_notify(gsi);
243             return;
244         }
245     }
246 }
247 
248 int kvm_get_max_memslots(void)
249 {
250     KVMState *s = KVM_STATE(current_accel());
251 
252     return s->nr_slots;
253 }
254 
255 /* Called with KVMMemoryListener.slots_lock held */
256 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
257 {
258     KVMState *s = kvm_state;
259     int i;
260 
261     for (i = 0; i < s->nr_slots; i++) {
262         if (kml->slots[i].memory_size == 0) {
263             return &kml->slots[i];
264         }
265     }
266 
267     return NULL;
268 }
269 
270 bool kvm_has_free_slot(MachineState *ms)
271 {
272     KVMState *s = KVM_STATE(ms->accelerator);
273     bool result;
274     KVMMemoryListener *kml = &s->memory_listener;
275 
276     kvm_slots_lock();
277     result = !!kvm_get_free_slot(kml);
278     kvm_slots_unlock();
279 
280     return result;
281 }
282 
283 /* Called with KVMMemoryListener.slots_lock held */
284 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
285 {
286     KVMSlot *slot = kvm_get_free_slot(kml);
287 
288     if (slot) {
289         return slot;
290     }
291 
292     fprintf(stderr, "%s: no free slot available\n", __func__);
293     abort();
294 }
295 
296 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
297                                          hwaddr start_addr,
298                                          hwaddr size)
299 {
300     KVMState *s = kvm_state;
301     int i;
302 
303     for (i = 0; i < s->nr_slots; i++) {
304         KVMSlot *mem = &kml->slots[i];
305 
306         if (start_addr == mem->start_addr && size == mem->memory_size) {
307             return mem;
308         }
309     }
310 
311     return NULL;
312 }
313 
314 /*
315  * Calculate and align the start address and the size of the section.
316  * Return the size. If the size is 0, the aligned section is empty.
317  */
318 static hwaddr kvm_align_section(MemoryRegionSection *section,
319                                 hwaddr *start)
320 {
321     hwaddr size = int128_get64(section->size);
322     hwaddr delta, aligned;
323 
324     /* KVM works in page-size chunks, but the function may be called with a
325        sub-page size and an unaligned start address.  Pad the start address
326        up to the next page boundary and truncate the size to the previous one. */
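    /*
     * Example (assuming a 4 KiB host page size): a section at address space
     * offset 0x1234 with size 0x3000 yields *start = 0x2000 (delta = 0xdcc)
     * and a returned size of (0x3000 - 0xdcc) & ~0xfff = 0x2000, i.e. the
     * largest page-aligned range fully contained in the section.
     */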
327     aligned = ROUND_UP(section->offset_within_address_space,
328                        qemu_real_host_page_size());
329     delta = aligned - section->offset_within_address_space;
330     *start = aligned;
331     if (delta > size) {
332         return 0;
333     }
334 
335     return (size - delta) & qemu_real_host_page_mask();
336 }
337 
338 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
339                                        hwaddr *phys_addr)
340 {
341     KVMMemoryListener *kml = &s->memory_listener;
342     int i, ret = 0;
343 
344     kvm_slots_lock();
345     for (i = 0; i < s->nr_slots; i++) {
346         KVMSlot *mem = &kml->slots[i];
347 
348         if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
349             *phys_addr = mem->start_addr + (ram - mem->ram);
350             ret = 1;
351             break;
352         }
353     }
354     kvm_slots_unlock();
355 
356     return ret;
357 }
358 
359 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
360 {
361     KVMState *s = kvm_state;
362     struct kvm_userspace_memory_region mem;
363     int ret;
364 
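    /*
     * The kernel identifies a slot by slot number plus address space id:
     * with KVM_CAP_MULTI_ADDRESS_SPACE, the address space id is encoded in
     * bits 16..31 of the slot field and the slot index in bits 0..15.
     */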
365     mem.slot = slot->slot | (kml->as_id << 16);
366     mem.guest_phys_addr = slot->start_addr;
367     mem.userspace_addr = (unsigned long)slot->ram;
368     mem.flags = slot->flags;
369 
370     if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
371         /* Set the slot size to 0 before setting the slot to the desired
372          * value, as required since KVM commit 75d61fbc. */
373         mem.memory_size = 0;
374         ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
375         if (ret < 0) {
376             goto err;
377         }
378     }
379     mem.memory_size = slot->memory_size;
380     ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
381     slot->old_flags = mem.flags;
382 err:
383     trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
384                               mem.memory_size, mem.userspace_addr, ret);
385     if (ret < 0) {
386         error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
387                      " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
388                      __func__, mem.slot, slot->start_addr,
389                      (uint64_t)mem.memory_size, strerror(errno));
390     }
391     return ret;
392 }
393 
394 static int do_kvm_destroy_vcpu(CPUState *cpu)
395 {
396     KVMState *s = kvm_state;
397     long mmap_size;
398     struct KVMParkedVcpu *vcpu = NULL;
399     int ret = 0;
400 
401     DPRINTF("kvm_destroy_vcpu\n");
402 
403     ret = kvm_arch_destroy_vcpu(cpu);
404     if (ret < 0) {
405         goto err;
406     }
407 
408     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
409     if (mmap_size < 0) {
410         ret = mmap_size;
411         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
412         goto err;
413     }
414 
415     ret = munmap(cpu->kvm_run, mmap_size);
416     if (ret < 0) {
417         goto err;
418     }
419 
420     if (cpu->kvm_dirty_gfns) {
421         ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
422         if (ret < 0) {
423             goto err;
424         }
425     }
426 
427     vcpu = g_malloc0(sizeof(*vcpu));
428     vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
429     vcpu->kvm_fd = cpu->kvm_fd;
430     QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
431 err:
432     return ret;
433 }
434 
435 void kvm_destroy_vcpu(CPUState *cpu)
436 {
437     if (do_kvm_destroy_vcpu(cpu) < 0) {
438         error_report("kvm_destroy_vcpu failed");
439         exit(EXIT_FAILURE);
440     }
441 }
442 
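/*
 * KVM provides no way to destroy a vCPU once it has been created, so a
 * "destroyed" vCPU's fd is parked on kvm_parked_vcpus instead and reused
 * here when a vCPU with the same id is created again (e.g. a CPU that is
 * hot-unplugged and later plugged back in).
 */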
443 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
444 {
445     struct KVMParkedVcpu *cpu;
446 
447     QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
448         if (cpu->vcpu_id == vcpu_id) {
449             int kvm_fd;
450 
451             QLIST_REMOVE(cpu, node);
452             kvm_fd = cpu->kvm_fd;
453             g_free(cpu);
454             return kvm_fd;
455         }
456     }
457 
458     return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
459 }
460 
461 int kvm_init_vcpu(CPUState *cpu, Error **errp)
462 {
463     KVMState *s = kvm_state;
464     long mmap_size;
465     int ret;
466 
467     trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
468 
469     ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
470     if (ret < 0) {
471         error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
472                          kvm_arch_vcpu_id(cpu));
473         goto err;
474     }
475 
476     cpu->kvm_fd = ret;
477     cpu->kvm_state = s;
478     cpu->vcpu_dirty = true;
479     cpu->dirty_pages = 0;
480 
481     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
482     if (mmap_size < 0) {
483         ret = mmap_size;
484         error_setg_errno(errp, -mmap_size,
485                          "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
486         goto err;
487     }
488 
489     cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
490                         cpu->kvm_fd, 0);
491     if (cpu->kvm_run == MAP_FAILED) {
492         ret = -errno;
493         error_setg_errno(errp, ret,
494                          "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
495                          kvm_arch_vcpu_id(cpu));
496         goto err;
497     }
498 
499     if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
500         s->coalesced_mmio_ring =
501             (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
502     }
503 
504     if (s->kvm_dirty_ring_size) {
505         /* Use MAP_SHARED to share pages with the kernel */
506         cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
507                                    PROT_READ | PROT_WRITE, MAP_SHARED,
508                                    cpu->kvm_fd,
509                                    PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
510         if (cpu->kvm_dirty_gfns == MAP_FAILED) {
511             ret = -errno;
512             DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
513             goto err;
514         }
515     }
516 
517     ret = kvm_arch_init_vcpu(cpu);
518     if (ret < 0) {
519         error_setg_errno(errp, -ret,
520                          "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
521                          kvm_arch_vcpu_id(cpu));
522     }
523 err:
524     return ret;
525 }
526 
527 /*
528  * dirty pages logging control
529  */
530 
531 static int kvm_mem_flags(MemoryRegion *mr)
532 {
533     bool readonly = mr->readonly || memory_region_is_romd(mr);
534     int flags = 0;
535 
536     if (memory_region_get_dirty_log_mask(mr) != 0) {
537         flags |= KVM_MEM_LOG_DIRTY_PAGES;
538     }
539     if (readonly && kvm_readonly_mem_allowed) {
540         flags |= KVM_MEM_READONLY;
541     }
542     return flags;
543 }
544 
545 /* Called with KVMMemoryListener.slots_lock held */
546 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
547                                  MemoryRegion *mr)
548 {
549     mem->flags = kvm_mem_flags(mr);
550 
551     /* If nothing changed effectively, no need to issue ioctl */
552     if (mem->flags == mem->old_flags) {
553         return 0;
554     }
555 
556     kvm_slot_init_dirty_bitmap(mem);
557     return kvm_set_user_memory_region(kml, mem, false);
558 }
559 
560 static int kvm_section_update_flags(KVMMemoryListener *kml,
561                                     MemoryRegionSection *section)
562 {
563     hwaddr start_addr, size, slot_size;
564     KVMSlot *mem;
565     int ret = 0;
566 
567     size = kvm_align_section(section, &start_addr);
568     if (!size) {
569         return 0;
570     }
571 
572     kvm_slots_lock();
573 
574     while (size && !ret) {
575         slot_size = MIN(kvm_max_slot_size, size);
576         mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
577         if (!mem) {
578             /* We don't have a slot if we want to trap every access. */
579             goto out;
580         }
581 
582         ret = kvm_slot_update_flags(kml, mem, section->mr);
583         start_addr += slot_size;
584         size -= slot_size;
585     }
586 
587 out:
588     kvm_slots_unlock();
589     return ret;
590 }
591 
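/*
 * For the log_start/log_stop hooks, "old" and "new" are the section's
 * dirty-log masks before and after the change; logging is started only on
 * the 0 -> nonzero transition and stopped only on the nonzero -> 0 one.
 */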
592 static void kvm_log_start(MemoryListener *listener,
593                           MemoryRegionSection *section,
594                           int old, int new)
595 {
596     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
597     int r;
598 
599     if (old != 0) {
600         return;
601     }
602 
603     r = kvm_section_update_flags(kml, section);
604     if (r < 0) {
605         abort();
606     }
607 }
608 
609 static void kvm_log_stop(MemoryListener *listener,
610                           MemoryRegionSection *section,
611                           int old, int new)
612 {
613     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
614     int r;
615 
616     if (new != 0) {
617         return;
618     }
619 
620     r = kvm_section_update_flags(kml, section);
621     if (r < 0) {
622         abort();
623     }
624 }
625 
626 /* get kvm's dirty pages bitmap and update qemu's */
627 static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
628 {
629     ram_addr_t start = slot->ram_start_offset;
630     ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
631 
632     cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
633 }
634 
635 static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
636 {
637     memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
638 }
639 
640 #define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
641 
642 /* Allocate the dirty bitmap for a slot  */
643 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
644 {
645     if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
646         return;
647     }
648 
649     /*
650      * XXX bad kernel interface alert
651      * For the dirty bitmap, the kernel allocates an array whose size is
652      * aligned to bits-per-long.  But when the kernel is 64-bit and
653      * userspace is 32-bit, userspace can't align to the same
654      * bits-per-long, since sizeof(long) differs between kernel and
655      * user space.  Userspace would then provide a buffer that may be
656      * 4 bytes smaller than what the kernel uses, resulting in
657      * userspace memory corruption (which, in most cases, valgrind
658      * cannot detect either).
659      * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
660      * the hope that sizeof(long) won't become >8 any time soon.
661      *
662      * Note: the granule of kvm dirty log is qemu_real_host_page_size.
663      * And mem->memory_size is aligned to it (otherwise this mem can't
664      * be registered to KVM).
665      */
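    /*
     * For example, with a 4 KiB host page size: a 1 MiB slot covers 256
     * pages, so bitmap_size = ALIGN(256, 64) / 8 = 32 bytes; a 4 KiB slot
     * covers a single page but still gets ALIGN(1, 64) / 8 = 8 bytes.
     */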
666     hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
667                                         /*HOST_LONG_BITS*/ 64) / 8;
668     mem->dirty_bmap = g_malloc0(bitmap_size);
669     mem->dirty_bmap_size = bitmap_size;
670 }
671 
672 /*
673  * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
674  * succeeded, false otherwise
675  */
676 static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
677 {
678     struct kvm_dirty_log d = {};
679     int ret;
680 
681     d.dirty_bitmap = slot->dirty_bmap;
682     d.slot = slot->slot | (slot->as_id << 16);
683     ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
684 
685     if (ret == -ENOENT) {
686         /* kernel does not have dirty bitmap in this slot */
687         ret = 0;
688     }
689     if (ret) {
690         error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
691                           __func__, ret);
692     }
693     return ret == 0;
694 }
695 
696 /* Should be called with all slots_lock held for the address spaces. */
697 static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
698                                      uint32_t slot_id, uint64_t offset)
699 {
700     KVMMemoryListener *kml;
701     KVMSlot *mem;
702 
703     if (as_id >= s->nr_as) {
704         return;
705     }
706 
707     kml = s->as[as_id].ml;
708     mem = &kml->slots[slot_id];
709 
710     if (!mem->memory_size || offset >=
711         (mem->memory_size / qemu_real_host_page_size())) {
712         return;
713     }
714 
715     set_bit(offset, mem->dirty_bmap);
716 }
717 
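/*
 * Roughly, per the kernel's dirty-ring protocol, each ring entry cycles
 * through three states: unused (flags == 0), dirty (the kernel publishes it
 * with KVM_DIRTY_GFN_F_DIRTY set), and collected (userspace sets
 * KVM_DIRTY_GFN_F_RESET after harvesting it; the kernel reclaims the entry
 * on the next KVM_RESET_DIRTY_RINGS ioctl).
 */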
718 static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
719 {
720     return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
721 }
722 
723 static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
724 {
725     gfn->flags = KVM_DIRTY_GFN_F_RESET;
726 }
727 
728 /*
729  * Should be called with all slots_lock held for the address spaces.  It
730  * returns the number of dirty pages collected from this vcpu's dirty ring.
731  */
732 static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
733 {
734     struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
735     uint32_t ring_size = s->kvm_dirty_ring_size;
736     uint32_t count = 0, fetch = cpu->kvm_fetch_index;
737 
738     assert(dirty_gfns && ring_size);
739     trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
740 
741     while (true) {
742         cur = &dirty_gfns[fetch % ring_size];
743         if (!dirty_gfn_is_dirtied(cur)) {
744             break;
745         }
746         kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
747                                  cur->offset);
748         dirty_gfn_set_collected(cur);
749         trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
750         fetch++;
751         count++;
752     }
753     cpu->kvm_fetch_index = fetch;
754     cpu->dirty_pages += count;
755 
756     return count;
757 }
758 
759 /* Must be called with slots_lock held */
760 static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
761 {
762     int ret;
763     CPUState *cpu;
764     uint64_t total = 0;
765     int64_t stamp;
766 
767     stamp = get_clock();
768 
769     CPU_FOREACH(cpu) {
770         total += kvm_dirty_ring_reap_one(s, cpu);
771     }
772 
773     if (total) {
774         ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
775         assert(ret == total);
776     }
777 
778     stamp = get_clock() - stamp;
779 
780     if (total) {
781         trace_kvm_dirty_ring_reap(total, stamp / 1000);
782     }
783 
784     return total;
785 }
786 
787 /*
788  * Currently, for simplicity, we must hold the BQL before calling this.  We
789  * could consider dropping the BQL once we're clear about all the race conditions.
790  */
791 static uint64_t kvm_dirty_ring_reap(KVMState *s)
792 {
793     uint64_t total;
794 
795     /*
796      * We need to lock all kvm slots for all address spaces here,
797      * because:
798      *
799      * (1) We need to mark pages dirty in the dirty bitmaps of multiple
800      *     slots, potentially for a huge number of pages, so it's better
801      *     to take the lock here once rather than once per page.  And
802      *     more importantly,
803      *
804      * (2) We must _NOT_ publish dirty bits to other threads (e.g., the
805      *     migration thread) via the kvm memory slot dirty bitmaps before
806      *     those dirtied pages have been correctly re-protected.
807      *     Otherwise we risk data corruption if the page data is read in
808      *     another thread before we do the reset below.
809      */
810     kvm_slots_lock();
811     total = kvm_dirty_ring_reap_locked(s);
812     kvm_slots_unlock();
813 
814     return total;
815 }
816 
817 static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
818 {
819     /* No need to do anything */
820 }
821 
822 /*
823  * Kick all vcpus out in a synchronized way.  When returned, we
824  * guarantee that every vcpu has been kicked and at least returned to
825  * userspace once.
826  */
827 static void kvm_cpu_synchronize_kick_all(void)
828 {
829     CPUState *cpu;
830 
831     CPU_FOREACH(cpu) {
832         run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
833     }
834 }
835 
836 /*
837  * Flush all the existing dirty pages to the KVM slot buffers.  When
838  * this call returns, we guarantee that all the touched dirty pages
839  * before calling this function have been put into the per-kvmslot
840  * dirty bitmap.
841  *
842  * This function must be called with BQL held.
843  */
844 static void kvm_dirty_ring_flush(void)
845 {
846     trace_kvm_dirty_ring_flush(0);
847     /*
848      * The function needs to be serialized.  Since this function
849      * should always be called with the BQL held, serialization is guaranteed.
850      * However, let's be sure of it.
851      */
852     assert(qemu_mutex_iothread_locked());
853     /*
854      * First make sure to flush the hardware buffers by kicking all
855      * vcpus out in a synchronous way.
856      */
857     kvm_cpu_synchronize_kick_all();
858     kvm_dirty_ring_reap(kvm_state);
859     trace_kvm_dirty_ring_flush(1);
860 }
861 
862 /**
863  * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
864  *
865  * This function will first try to fetch the dirty bitmap from the kernel,
866  * and then update QEMU's dirty bitmap.
867  *
868  * NOTE: caller must be with kml->slots_lock held.
869  *
870  * @kml: the KVM memory listener object
871  * @section: the memory section to sync the dirty bitmap with
872  */
873 static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
874                                            MemoryRegionSection *section)
875 {
876     KVMState *s = kvm_state;
877     KVMSlot *mem;
878     hwaddr start_addr, size;
879     hwaddr slot_size;
880 
881     size = kvm_align_section(section, &start_addr);
882     while (size) {
883         slot_size = MIN(kvm_max_slot_size, size);
884         mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
885         if (!mem) {
886             /* We don't have a slot if we want to trap every access. */
887             return;
888         }
889         if (kvm_slot_get_dirty_log(s, mem)) {
890             kvm_slot_sync_dirty_pages(mem);
891         }
892         start_addr += slot_size;
893         size -= slot_size;
894     }
895 }
896 
897 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
898 #define KVM_CLEAR_LOG_SHIFT  6
899 #define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
900 #define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
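/*
 * With a 4 KiB host page size, KVM_CLEAR_LOG_ALIGN is 4 KiB << 6 = 256 KiB,
 * i.e. KVM_CLEAR_DIRTY_LOG requests are aligned to 64-page blocks.
 */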
901 
902 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
903                                   uint64_t size)
904 {
905     KVMState *s = kvm_state;
906     uint64_t end, bmap_start, start_delta, bmap_npages;
907     struct kvm_clear_dirty_log d;
908     unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
909     int ret;
910 
911     /*
912      * We need to extend either the start or the size or both to
913      * satisfy the KVM interface requirement.  First, align the start
914      * address down to a 64-host-page boundary.
915      */
916     bmap_start = start & KVM_CLEAR_LOG_MASK;
917     start_delta = start - bmap_start;
918     bmap_start /= psize;
919 
920     /*
921      * The kernel interface has restriction on the size too, that either:
922      *
923      * (1) the size is 64 host pages aligned (just like the start), or
924      * (2) the size fills up until the end of the KVM memslot.
925      */
926     bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
927         << KVM_CLEAR_LOG_SHIFT;
928     end = mem->memory_size / psize;
929     if (bmap_npages > end - bmap_start) {
930         bmap_npages = end - bmap_start;
931     }
932     start_delta /= psize;
933 
934     /*
935      * Prepare the bitmap to clear dirty bits.  Here we must guarantee
936      * that we won't clear any unknown dirty bits, otherwise we might
937      * accidentally clear bits that have not yet been synced from the
938      * kernel into QEMU's bitmap; we would then lose track of the guest
939      * modifications to those pages (which can directly lead to guest
940      * data loss or a guest panic after migration).
941      *
942      * Layout of the KVMSlot.dirty_bmap:
943      *
944      *                   |<-------- bmap_npages -----------..>|
945      *                                                     [1]
946      *                     start_delta         size
947      *  |----------------|-------------|------------------|------------|
948      *  ^                ^             ^                               ^
949      *  |                |             |                               |
950      * start          bmap_start     (start)                         end
951      * of memslot                                             of memslot
952      *
953      * [1] bmap_npages can be aligned to either 64 pages or the end of slot
954      */
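    /*
     * Worked example (assuming a 4 KiB host page size, so KVM_CLEAR_LOG_ALIGN
     * is 256 KiB): clearing start = 0x11000, size = 0x4000 within a 1 MiB
     * slot gives bmap_start = page 0, start_delta = 17 pages and
     * bmap_npages = 64, so the slow path below builds a temporary bitmap in
     * which only bits 17..20 can remain set.
     */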
955 
956     assert(bmap_start % BITS_PER_LONG == 0);
957     /* We should never do log_clear before log_sync */
958     assert(mem->dirty_bmap);
959     if (start_delta || bmap_npages - size / psize) {
960         /* Slow path - we need to manipulate a temp bitmap */
961         bmap_clear = bitmap_new(bmap_npages);
962         bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
963                                     bmap_start, start_delta + size / psize);
964         /*
965          * We need to clear the leading bits because the caller did not
966          * ask for them; they are only covered because we extended the
967          * range for the 64-page alignment.
968          */
969         bitmap_clear(bmap_clear, 0, start_delta);
970         d.dirty_bitmap = bmap_clear;
971     } else {
972         /*
973          * Fast path - both start and size align well with BITS_PER_LONG
974          * (or the end of memory slot)
975          */
976         d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
977     }
978 
979     d.first_page = bmap_start;
980     /* It should never overflow.  If it happens, say something */
981     assert(bmap_npages <= UINT32_MAX);
982     d.num_pages = bmap_npages;
983     d.slot = mem->slot | (as_id << 16);
984 
985     ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
986     if (ret < 0 && ret != -ENOENT) {
987         error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
988                      "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
989                      __func__, d.slot, (uint64_t)d.first_page,
990                      (uint32_t)d.num_pages, ret);
991     } else {
992         ret = 0;
993         trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
994     }
995 
996     /*
997      * After we have updated the remote dirty bitmap, we also update the
998      * cached bitmap for the memslot, so that if another user clears the
999      * same region we know not to clear it again on the remote side,
1000      * which would likewise risk data loss.
1001      */
1002     bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
1003                  size / psize);
1004     /* This handles the NULL case well */
1005     g_free(bmap_clear);
1006     return ret;
1007 }
1008 
1009 
1010 /**
1011  * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
1012  *
1013  * NOTE: this will be a no-op if we haven't enabled manual dirty log
1014  * protection in the host kernel because in that case this operation
1015  * will be done within log_sync().
1016  *
1017  * @kml:     the kvm memory listener
1018  * @section: the memory range to clear dirty bitmap
1019  */
1020 static int kvm_physical_log_clear(KVMMemoryListener *kml,
1021                                   MemoryRegionSection *section)
1022 {
1023     KVMState *s = kvm_state;
1024     uint64_t start, size, offset, count;
1025     KVMSlot *mem;
1026     int ret = 0, i;
1027 
1028     if (!s->manual_dirty_log_protect) {
1029         /* No need to do explicit clear */
1030         return ret;
1031     }
1032 
1033     start = section->offset_within_address_space;
1034     size = int128_get64(section->size);
1035 
1036     if (!size) {
1037         /* Nothing more we can do... */
1038         return ret;
1039     }
1040 
1041     kvm_slots_lock();
1042 
1043     for (i = 0; i < s->nr_slots; i++) {
1044         mem = &kml->slots[i];
1045         /* Discard slots that are empty or do not overlap the section */
1046         if (!mem->memory_size ||
1047             mem->start_addr > start + size - 1 ||
1048             start > mem->start_addr + mem->memory_size - 1) {
1049             continue;
1050         }
1051 
1052         if (start >= mem->start_addr) {
1053             /* The slot starts before section or is aligned to it.  */
1054             offset = start - mem->start_addr;
1055             count = MIN(mem->memory_size - offset, size);
1056         } else {
1057             /* The slot starts after section.  */
1058             offset = 0;
1059             count = MIN(mem->memory_size, size - (mem->start_addr - start));
1060         }
1061         ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1062         if (ret < 0) {
1063             break;
1064         }
1065     }
1066 
1067     kvm_slots_unlock();
1068 
1069     return ret;
1070 }
1071 
1072 static void kvm_coalesce_mmio_region(MemoryListener *listener,
1073                                      MemoryRegionSection *section,
1074                                      hwaddr start, hwaddr size)
1075 {
1076     KVMState *s = kvm_state;
1077 
1078     if (s->coalesced_mmio) {
1079         struct kvm_coalesced_mmio_zone zone;
1080 
1081         zone.addr = start;
1082         zone.size = size;
1083         zone.pad = 0;
1084 
1085         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1086     }
1087 }
1088 
1089 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1090                                        MemoryRegionSection *section,
1091                                        hwaddr start, hwaddr size)
1092 {
1093     KVMState *s = kvm_state;
1094 
1095     if (s->coalesced_mmio) {
1096         struct kvm_coalesced_mmio_zone zone;
1097 
1098         zone.addr = start;
1099         zone.size = size;
1100         zone.pad = 0;
1101 
1102         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1103     }
1104 }
1105 
1106 static void kvm_coalesce_pio_add(MemoryListener *listener,
1107                                 MemoryRegionSection *section,
1108                                 hwaddr start, hwaddr size)
1109 {
1110     KVMState *s = kvm_state;
1111 
1112     if (s->coalesced_pio) {
1113         struct kvm_coalesced_mmio_zone zone;
1114 
1115         zone.addr = start;
1116         zone.size = size;
1117         zone.pio = 1;
1118 
1119         (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1120     }
1121 }
1122 
1123 static void kvm_coalesce_pio_del(MemoryListener *listener,
1124                                 MemoryRegionSection *section,
1125                                 hwaddr start, hwaddr size)
1126 {
1127     KVMState *s = kvm_state;
1128 
1129     if (s->coalesced_pio) {
1130         struct kvm_coalesced_mmio_zone zone;
1131 
1132         zone.addr = start;
1133         zone.size = size;
1134         zone.pio = 1;
1135 
1136         (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1137      }
1138 }
1139 
1140 static MemoryListener kvm_coalesced_pio_listener = {
1141     .name = "kvm-coalesced-pio",
1142     .coalesced_io_add = kvm_coalesce_pio_add,
1143     .coalesced_io_del = kvm_coalesce_pio_del,
1144 };
1145 
1146 int kvm_check_extension(KVMState *s, unsigned int extension)
1147 {
1148     int ret;
1149 
1150     ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1151     if (ret < 0) {
1152         ret = 0;
1153     }
1154 
1155     return ret;
1156 }
1157 
1158 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1159 {
1160     int ret;
1161 
1162     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1163     if (ret < 0) {
1164         /* VM wide version not implemented, use global one instead */
1165         ret = kvm_check_extension(s, extension);
1166     }
1167 
1168     return ret;
1169 }
1170 
1171 typedef struct HWPoisonPage {
1172     ram_addr_t ram_addr;
1173     QLIST_ENTRY(HWPoisonPage) list;
1174 } HWPoisonPage;
1175 
1176 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1177     QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1178 
1179 static void kvm_unpoison_all(void *param)
1180 {
1181     HWPoisonPage *page, *next_page;
1182 
1183     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1184         QLIST_REMOVE(page, list);
1185         qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
1186         g_free(page);
1187     }
1188 }
1189 
1190 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1191 {
1192     HWPoisonPage *page;
1193 
1194     QLIST_FOREACH(page, &hwpoison_page_list, list) {
1195         if (page->ram_addr == ram_addr) {
1196             return;
1197         }
1198     }
1199     page = g_new(HWPoisonPage, 1);
1200     page->ram_addr = ram_addr;
1201     QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1202 }
1203 
1204 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
1205 {
1206 #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
1207     /* The kernel expects ioeventfd values in host endianness,
1208      * but the memory core hands them in target endianness.
1209      * For example, PPC is always treated as big-endian even if running
1210      * on KVM and on PPC64LE.  Correct here.
1211      */
1212     switch (size) {
1213     case 2:
1214         val = bswap16(val);
1215         break;
1216     case 4:
1217         val = bswap32(val);
1218         break;
1219     }
1220 #endif
1221     return val;
1222 }
1223 
1224 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1225                                   bool assign, uint32_t size, bool datamatch)
1226 {
1227     int ret;
1228     struct kvm_ioeventfd iofd = {
1229         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1230         .addr = addr,
1231         .len = size,
1232         .flags = 0,
1233         .fd = fd,
1234     };
1235 
1236     trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1237                                  datamatch);
1238     if (!kvm_enabled()) {
1239         return -ENOSYS;
1240     }
1241 
1242     if (datamatch) {
1243         iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1244     }
1245     if (!assign) {
1246         iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1247     }
1248 
1249     ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1250 
1251     if (ret < 0) {
1252         return -errno;
1253     }
1254 
1255     return 0;
1256 }
1257 
1258 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1259                                  bool assign, uint32_t size, bool datamatch)
1260 {
1261     struct kvm_ioeventfd kick = {
1262         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1263         .addr = addr,
1264         .flags = KVM_IOEVENTFD_FLAG_PIO,
1265         .len = size,
1266         .fd = fd,
1267     };
1268     int r;
1269     trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1270     if (!kvm_enabled()) {
1271         return -ENOSYS;
1272     }
1273     if (datamatch) {
1274         kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1275     }
1276     if (!assign) {
1277         kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1278     }
1279     r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1280     if (r < 0) {
1281         return r;
1282     }
1283     return 0;
1284 }
1285 
1286 
1287 static int kvm_check_many_ioeventfds(void)
1288 {
1289     /* Userspace can use ioeventfd for io notification.  This requires a host
1290      * that supports eventfd(2) and an I/O thread; since eventfd does not
1291      * support SIGIO it cannot interrupt the vcpu.
1292      *
1293      * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
1294      * can avoid creating too many ioeventfds.
1295      */
1296 #if defined(CONFIG_EVENTFD)
1297     int ioeventfds[7];
1298     int i, ret = 0;
1299     for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
1300         ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
1301         if (ioeventfds[i] < 0) {
1302             break;
1303         }
1304         ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
1305         if (ret < 0) {
1306             close(ioeventfds[i]);
1307             break;
1308         }
1309     }
1310 
1311     /* Decide whether many devices are supported or not */
1312     ret = i == ARRAY_SIZE(ioeventfds);
1313 
1314     while (i-- > 0) {
1315         kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
1316         close(ioeventfds[i]);
1317     }
1318     return ret;
1319 #else
1320     return 0;
1321 #endif
1322 }
1323 
1324 static const KVMCapabilityInfo *
1325 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1326 {
1327     while (list->name) {
1328         if (!kvm_check_extension(s, list->value)) {
1329             return list;
1330         }
1331         list++;
1332     }
1333     return NULL;
1334 }
1335 
1336 void kvm_set_max_memslot_size(hwaddr max_slot_size)
1337 {
1338     g_assert(
1339         ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1340     );
1341     kvm_max_slot_size = max_slot_size;
1342 }
1343 
1344 static void kvm_set_phys_mem(KVMMemoryListener *kml,
1345                              MemoryRegionSection *section, bool add)
1346 {
1347     KVMSlot *mem;
1348     int err;
1349     MemoryRegion *mr = section->mr;
1350     bool writable = !mr->readonly && !mr->rom_device;
1351     hwaddr start_addr, size, slot_size, mr_offset;
1352     ram_addr_t ram_start_offset;
1353     void *ram;
1354 
1355     if (!memory_region_is_ram(mr)) {
1356         if (writable || !kvm_readonly_mem_allowed) {
1357             return;
1358         } else if (!mr->romd_mode) {
1359             /* If the memory device is not in romd_mode, then we actually want
1360              * to remove the kvm memory slot so all accesses will trap. */
1361             add = false;
1362         }
1363     }
1364 
1365     size = kvm_align_section(section, &start_addr);
1366     if (!size) {
1367         return;
1368     }
1369 
1370     /* The offset of the kvmslot within the memory region */
1371     mr_offset = section->offset_within_region + start_addr -
1372         section->offset_within_address_space;
1373 
1374     /* use aligned delta to align the ram address and offset */
1375     ram = memory_region_get_ram_ptr(mr) + mr_offset;
1376     ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1377 
1378     kvm_slots_lock();
1379 
1380     if (!add) {
1381         do {
1382             slot_size = MIN(kvm_max_slot_size, size);
1383             mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1384             if (!mem) {
1385                 goto out;
1386             }
1387             if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1388                 /*
1389                  * NOTE: Be aware that here we only make a best effort to
1390                  * sync dirty bits.  Whether we're using the dirty log or the
1391                  * dirty ring, we ignore two facts:
1392                  *
1393                  * (1) dirty bits can still reside in hardware buffers (PML)
1394                  *
1395                  * (2) after we collect dirty bits here, pages can be dirtied
1396                  * again before we do the final KVM_SET_USER_MEMORY_REGION to
1397                  * remove the slot.
1398                  *
1399                  * Not easy.  Let's cross our fingers until it's fixed.
1400                  */
1401                 if (kvm_state->kvm_dirty_ring_size) {
1402                     kvm_dirty_ring_reap_locked(kvm_state);
1403                 } else {
1404                     kvm_slot_get_dirty_log(kvm_state, mem);
1405                 }
1406                 kvm_slot_sync_dirty_pages(mem);
1407             }
1408 
1409             /* unregister the slot */
1410             g_free(mem->dirty_bmap);
1411             mem->dirty_bmap = NULL;
1412             mem->memory_size = 0;
1413             mem->flags = 0;
1414             err = kvm_set_user_memory_region(kml, mem, false);
1415             if (err) {
1416                 fprintf(stderr, "%s: error unregistering slot: %s\n",
1417                         __func__, strerror(-err));
1418                 abort();
1419             }
1420             start_addr += slot_size;
1421             size -= slot_size;
1422         } while (size);
1423         goto out;
1424     }
1425 
1426     /* register the new slot */
1427     do {
1428         slot_size = MIN(kvm_max_slot_size, size);
1429         mem = kvm_alloc_slot(kml);
1430         mem->as_id = kml->as_id;
1431         mem->memory_size = slot_size;
1432         mem->start_addr = start_addr;
1433         mem->ram_start_offset = ram_start_offset;
1434         mem->ram = ram;
1435         mem->flags = kvm_mem_flags(mr);
1436         kvm_slot_init_dirty_bitmap(mem);
1437         err = kvm_set_user_memory_region(kml, mem, true);
1438         if (err) {
1439             fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1440                     strerror(-err));
1441             abort();
1442         }
1443         start_addr += slot_size;
1444         ram_start_offset += slot_size;
1445         ram += slot_size;
1446         size -= slot_size;
1447     } while (size);
1448 
1449 out:
1450     kvm_slots_unlock();
1451 }
1452 
1453 static void *kvm_dirty_ring_reaper_thread(void *data)
1454 {
1455     KVMState *s = data;
1456     struct KVMDirtyRingReaper *r = &s->reaper;
1457 
1458     rcu_register_thread();
1459 
1460     trace_kvm_dirty_ring_reaper("init");
1461 
1462     while (true) {
1463         r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1464         trace_kvm_dirty_ring_reaper("wait");
1465         /*
1466          * TODO: provide a smarter timeout rather than a constant?
1467          */
1468         sleep(1);
1469 
1470         trace_kvm_dirty_ring_reaper("wakeup");
1471         r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1472 
1473         qemu_mutex_lock_iothread();
1474         kvm_dirty_ring_reap(s);
1475         qemu_mutex_unlock_iothread();
1476 
1477         r->reaper_iteration++;
1478     }
1479 
1480     trace_kvm_dirty_ring_reaper("exit");
1481 
1482     rcu_unregister_thread();
1483 
1484     return NULL;
1485 }
1486 
1487 static int kvm_dirty_ring_reaper_init(KVMState *s)
1488 {
1489     struct KVMDirtyRingReaper *r = &s->reaper;
1490 
1491     qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1492                        kvm_dirty_ring_reaper_thread,
1493                        s, QEMU_THREAD_JOINABLE);
1494 
1495     return 0;
1496 }
1497 
1498 static void kvm_region_add(MemoryListener *listener,
1499                            MemoryRegionSection *section)
1500 {
1501     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1502 
1503     memory_region_ref(section->mr);
1504     kvm_set_phys_mem(kml, section, true);
1505 }
1506 
1507 static void kvm_region_del(MemoryListener *listener,
1508                            MemoryRegionSection *section)
1509 {
1510     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1511 
1512     kvm_set_phys_mem(kml, section, false);
1513     memory_region_unref(section->mr);
1514 }
1515 
1516 static void kvm_log_sync(MemoryListener *listener,
1517                          MemoryRegionSection *section)
1518 {
1519     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1520 
1521     kvm_slots_lock();
1522     kvm_physical_sync_dirty_bitmap(kml, section);
1523     kvm_slots_unlock();
1524 }
1525 
1526 static void kvm_log_sync_global(MemoryListener *l)
1527 {
1528     KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1529     KVMState *s = kvm_state;
1530     KVMSlot *mem;
1531     int i;
1532 
1533     /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1534     kvm_dirty_ring_flush();
1535 
1536     /*
1537      * TODO: make this faster when nr_slots is big while there are
1538      * only a few used slots (small VMs).
1539      */
1540     kvm_slots_lock();
1541     for (i = 0; i < s->nr_slots; i++) {
1542         mem = &kml->slots[i];
1543         if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1544             kvm_slot_sync_dirty_pages(mem);
1545             /*
1546              * This is not needed by KVM_GET_DIRTY_LOG because the
1547              * ioctl will unconditionally overwrite the whole region.
1548              * However, the kvm dirty ring has no such side effect.
1549              */
1550             kvm_slot_reset_dirty_pages(mem);
1551         }
1552     }
1553     kvm_slots_unlock();
1554 }
1555 
1556 static void kvm_log_clear(MemoryListener *listener,
1557                           MemoryRegionSection *section)
1558 {
1559     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1560     int r;
1561 
1562     r = kvm_physical_log_clear(kml, section);
1563     if (r < 0) {
1564         error_report_once("%s: kvm log clear failed: mr=%s "
1565                           "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1566                           section->mr->name, section->offset_within_region,
1567                           int128_get64(section->size));
1568         abort();
1569     }
1570 }
1571 
1572 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1573                                   MemoryRegionSection *section,
1574                                   bool match_data, uint64_t data,
1575                                   EventNotifier *e)
1576 {
1577     int fd = event_notifier_get_fd(e);
1578     int r;
1579 
1580     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1581                                data, true, int128_get64(section->size),
1582                                match_data);
1583     if (r < 0) {
1584         fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1585                 __func__, strerror(-r), -r);
1586         abort();
1587     }
1588 }
1589 
1590 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1591                                   MemoryRegionSection *section,
1592                                   bool match_data, uint64_t data,
1593                                   EventNotifier *e)
1594 {
1595     int fd = event_notifier_get_fd(e);
1596     int r;
1597 
1598     r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1599                                data, false, int128_get64(section->size),
1600                                match_data);
1601     if (r < 0) {
1602         fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1603                 __func__, strerror(-r), -r);
1604         abort();
1605     }
1606 }
1607 
1608 static void kvm_io_ioeventfd_add(MemoryListener *listener,
1609                                  MemoryRegionSection *section,
1610                                  bool match_data, uint64_t data,
1611                                  EventNotifier *e)
1612 {
1613     int fd = event_notifier_get_fd(e);
1614     int r;
1615 
1616     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1617                               data, true, int128_get64(section->size),
1618                               match_data);
1619     if (r < 0) {
1620         fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1621                 __func__, strerror(-r), -r);
1622         abort();
1623     }
1624 }
1625 
1626 static void kvm_io_ioeventfd_del(MemoryListener *listener,
1627                                  MemoryRegionSection *section,
1628                                  bool match_data, uint64_t data,
1629                                  EventNotifier *e)
1630 
1631 {
1632     int fd = event_notifier_get_fd(e);
1633     int r;
1634 
1635     r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1636                               data, false, int128_get64(section->size),
1637                               match_data);
1638     if (r < 0) {
1639         fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1640                 __func__, strerror(-r), -r);
1641         abort();
1642     }
1643 }
1644 
1645 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1646                                   AddressSpace *as, int as_id, const char *name)
1647 {
1648     int i;
1649 
1650     kml->slots = g_new0(KVMSlot, s->nr_slots);
1651     kml->as_id = as_id;
1652 
1653     for (i = 0; i < s->nr_slots; i++) {
1654         kml->slots[i].slot = i;
1655     }
1656 
1657     kml->listener.region_add = kvm_region_add;
1658     kml->listener.region_del = kvm_region_del;
1659     kml->listener.log_start = kvm_log_start;
1660     kml->listener.log_stop = kvm_log_stop;
1661     kml->listener.priority = 10;
1662     kml->listener.name = name;
1663 
1664     if (s->kvm_dirty_ring_size) {
1665         kml->listener.log_sync_global = kvm_log_sync_global;
1666     } else {
1667         kml->listener.log_sync = kvm_log_sync;
1668         kml->listener.log_clear = kvm_log_clear;
1669     }
1670 
1671     memory_listener_register(&kml->listener, as);
1672 
1673     for (i = 0; i < s->nr_as; ++i) {
1674         if (!s->as[i].as) {
1675             s->as[i].as = as;
1676             s->as[i].ml = kml;
1677             break;
1678         }
1679     }
1680 }
1681 
1682 static MemoryListener kvm_io_listener = {
1683     .name = "kvm-io",
1684     .eventfd_add = kvm_io_ioeventfd_add,
1685     .eventfd_del = kvm_io_ioeventfd_del,
1686     .priority = 10,
1687 };
1688 
1689 int kvm_set_irq(KVMState *s, int irq, int level)
1690 {
1691     struct kvm_irq_level event;
1692     int ret;
1693 
1694     assert(kvm_async_interrupts_enabled());
1695 
1696     event.level = level;
1697     event.irq = irq;
1698     ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1699     if (ret < 0) {
1700         perror("kvm_set_irq");
1701         abort();
1702     }
1703 
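    /*
     * KVM_IRQ_LINE does not report delivery status, so assume delivery (1);
     * with KVM_IRQ_LINE_STATUS the kernel reports in event.status whether
     * the interrupt was actually delivered or was coalesced.
     */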
1704     return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1705 }
1706 
1707 #ifdef KVM_CAP_IRQ_ROUTING
1708 typedef struct KVMMSIRoute {
1709     struct kvm_irq_routing_entry kroute;
1710     QTAILQ_ENTRY(KVMMSIRoute) entry;
1711 } KVMMSIRoute;
1712 
1713 static void set_gsi(KVMState *s, unsigned int gsi)
1714 {
1715     set_bit(gsi, s->used_gsi_bitmap);
1716 }
1717 
1718 static void clear_gsi(KVMState *s, unsigned int gsi)
1719 {
1720     clear_bit(gsi, s->used_gsi_bitmap);
1721 }
1722 
1723 void kvm_init_irq_routing(KVMState *s)
1724 {
1725     int gsi_count, i;
1726 
1727     gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1728     if (gsi_count > 0) {
1729         /* Round up so we can search ints using ffs */
1730         s->used_gsi_bitmap = bitmap_new(gsi_count);
1731         s->gsi_count = gsi_count;
1732     }
1733 
1734     s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1735     s->nr_allocated_irq_routes = 0;
1736 
1737     if (!kvm_direct_msi_allowed) {
1738         for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
1739             QTAILQ_INIT(&s->msi_hashtab[i]);
1740         }
1741     }
1742 
1743     kvm_arch_init_irq_routing(s);
1744 }
1745 
1746 void kvm_irqchip_commit_routes(KVMState *s)
1747 {
1748     int ret;
1749 
1750     if (kvm_gsi_direct_mapping()) {
1751         return;
1752     }
1753 
1754     if (!kvm_gsi_routing_enabled()) {
1755         return;
1756     }
1757 
1758     s->irq_routes->flags = 0;
1759     trace_kvm_irqchip_commit_routes();
1760     ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1761     assert(ret == 0);
1762 }
1763 
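     /*
      * Append a routing entry to s->irq_routes, growing the flexible array
      * geometrically (doubling, with a minimum of 64 entries) when it is
      * full, and mark the entry's GSI as used in the bitmap.
      */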
1764 static void kvm_add_routing_entry(KVMState *s,
1765                                   struct kvm_irq_routing_entry *entry)
1766 {
1767     struct kvm_irq_routing_entry *new;
1768     int n, size;
1769 
1770     if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1771         n = s->nr_allocated_irq_routes * 2;
1772         if (n < 64) {
1773             n = 64;
1774         }
1775         size = sizeof(struct kvm_irq_routing);
1776         size += n * sizeof(*new);
1777         s->irq_routes = g_realloc(s->irq_routes, size);
1778         s->nr_allocated_irq_routes = n;
1779     }
1780     n = s->irq_routes->nr++;
1781     new = &s->irq_routes->entries[n];
1782 
1783     *new = *entry;
1784 
1785     set_gsi(s, entry->gsi);
1786 }
1787 
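     /*
      * Replace the existing routing entry that matches new_entry->gsi.
      * Returns 0 if the entry was updated (or was already identical), or
      * -ESRCH if no entry with that GSI exists.
      */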
1788 static int kvm_update_routing_entry(KVMState *s,
1789                                     struct kvm_irq_routing_entry *new_entry)
1790 {
1791     struct kvm_irq_routing_entry *entry;
1792     int n;
1793 
1794     for (n = 0; n < s->irq_routes->nr; n++) {
1795         entry = &s->irq_routes->entries[n];
1796         if (entry->gsi != new_entry->gsi) {
1797             continue;
1798         }
1799 
1800         if (!memcmp(entry, new_entry, sizeof(*entry))) {
1801             return 0;
1802         }
1803 
1804         *entry = *new_entry;
1805 
1806         return 0;
1807     }
1808 
1809     return -ESRCH;
1810 }
1811 
1812 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1813 {
1814     struct kvm_irq_routing_entry e = {};
1815 
1816     assert(pin < s->gsi_count);
1817 
1818     e.gsi = irq;
1819     e.type = KVM_IRQ_ROUTING_IRQCHIP;
1820     e.flags = 0;
1821     e.u.irqchip.irqchip = irqchip;
1822     e.u.irqchip.pin = pin;
1823     kvm_add_routing_entry(s, &e);
1824 }
1825 
1826 void kvm_irqchip_release_virq(KVMState *s, int virq)
1827 {
1828     struct kvm_irq_routing_entry *e;
1829     int i;
1830 
1831     if (kvm_gsi_direct_mapping()) {
1832         return;
1833     }
1834 
1835     for (i = 0; i < s->irq_routes->nr; i++) {
1836         e = &s->irq_routes->entries[i];
1837         if (e->gsi == virq) {
1838             s->irq_routes->nr--;
1839             *e = s->irq_routes->entries[s->irq_routes->nr];
1840         }
1841     }
1842     clear_gsi(s, virq);
1843     kvm_arch_release_virq_post(virq);
1844     trace_kvm_irqchip_release_virq(virq);
1845 }
1846 
1847 void kvm_irqchip_add_change_notifier(Notifier *n)
1848 {
1849     notifier_list_add(&kvm_irqchip_change_notifiers, n);
1850 }
1851 
1852 void kvm_irqchip_remove_change_notifier(Notifier *n)
1853 {
1854     notifier_remove(n);
1855 }
1856 
1857 void kvm_irqchip_change_notify(void)
1858 {
1859     notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
1860 }
1861 
1862 static unsigned int kvm_hash_msi(uint32_t data)
1863 {
1864     /* This is optimized for IA32 MSI layout. However, no other arch shall
1865      * repeat the mistake of not providing a direct MSI injection API. */
1866     return data & 0xff;
1867 }
1868 
1869 static void kvm_flush_dynamic_msi_routes(KVMState *s)
1870 {
1871     KVMMSIRoute *route, *next;
1872     unsigned int hash;
1873 
1874     for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1875         QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1876             kvm_irqchip_release_virq(s, route->kroute.gsi);
1877             QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1878             g_free(route);
1879         }
1880     }
1881 }
1882 
1883 static int kvm_irqchip_get_virq(KVMState *s)
1884 {
1885     int next_virq;
1886 
1887     /*
1888      * PIC and IOAPIC share the first 16 GSI numbers, thus the available
1889      * GSI numbers are more than the number of IRQ routes. Allocating a GSI
1890      * number can succeed even though a new route entry cannot be added.
1891      * When this happens, flush dynamic MSI entries to free IRQ route entries.
1892      */
1893     if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1894         kvm_flush_dynamic_msi_routes(s);
1895     }
1896 
1897     /* Return the lowest unused GSI in the bitmap */
1898     next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
1899     if (next_virq >= s->gsi_count) {
1900         return -ENOSPC;
1901     } else {
1902         return next_virq;
1903     }
1904 }
1905 
1906 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1907 {
1908     unsigned int hash = kvm_hash_msi(msg.data);
1909     KVMMSIRoute *route;
1910 
1911     QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1912         if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1913             route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1914             route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1915             return route;
1916         }
1917     }
1918     return NULL;
1919 }
1920 
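     /*
      * Inject an MSI.  With KVM_CAP_SIGNAL_MSI the message is delivered
      * directly via KVM_SIGNAL_MSI; otherwise a GSI route is allocated (or
      * reused from the MSI hash table), committed, and raised with
      * kvm_set_irq().
      */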
1921 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1922 {
1923     struct kvm_msi msi;
1924     KVMMSIRoute *route;
1925 
1926     if (kvm_direct_msi_allowed) {
1927         msi.address_lo = (uint32_t)msg.address;
1928         msi.address_hi = msg.address >> 32;
1929         msi.data = le32_to_cpu(msg.data);
1930         msi.flags = 0;
1931         memset(msi.pad, 0, sizeof(msi.pad));
1932 
1933         return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1934     }
1935 
1936     route = kvm_lookup_msi_route(s, msg);
1937     if (!route) {
1938         int virq;
1939 
1940         virq = kvm_irqchip_get_virq(s);
1941         if (virq < 0) {
1942             return virq;
1943         }
1944 
1945         route = g_new0(KVMMSIRoute, 1);
1946         route->kroute.gsi = virq;
1947         route->kroute.type = KVM_IRQ_ROUTING_MSI;
1948         route->kroute.flags = 0;
1949         route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1950         route->kroute.u.msi.address_hi = msg.address >> 32;
1951         route->kroute.u.msi.data = le32_to_cpu(msg.data);
1952 
1953         kvm_add_routing_entry(s, &route->kroute);
1954         kvm_irqchip_commit_routes(s);
1955 
1956         QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1957                            entry);
1958     }
1959 
1960     assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1961 
1962     return kvm_set_irq(s, route->kroute.gsi, 1);
1963 }
1964 
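     /*
      * Reserve a GSI and queue an MSI routing entry for it (including the
      * device ID when the architecture requires one), after letting the
      * architecture fix up the route.  The entry is added to the pending
      * KVMRouteChange transaction; the caller is responsible for committing
      * the routes.  Returns the allocated virq (or the directly mapped GSI)
      * or a negative errno.
      */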
1965 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
1966 {
1967     struct kvm_irq_routing_entry kroute = {};
1968     int virq;
1969     KVMState *s = c->s;
1970     MSIMessage msg = {0, 0};
1971 
1972     if (pci_available && dev) {
1973         msg = pci_get_msi_message(dev, vector);
1974     }
1975 
1976     if (kvm_gsi_direct_mapping()) {
1977         return kvm_arch_msi_data_to_gsi(msg.data);
1978     }
1979 
1980     if (!kvm_gsi_routing_enabled()) {
1981         return -ENOSYS;
1982     }
1983 
1984     virq = kvm_irqchip_get_virq(s);
1985     if (virq < 0) {
1986         return virq;
1987     }
1988 
1989     kroute.gsi = virq;
1990     kroute.type = KVM_IRQ_ROUTING_MSI;
1991     kroute.flags = 0;
1992     kroute.u.msi.address_lo = (uint32_t)msg.address;
1993     kroute.u.msi.address_hi = msg.address >> 32;
1994     kroute.u.msi.data = le32_to_cpu(msg.data);
1995     if (pci_available && kvm_msi_devid_required()) {
1996         kroute.flags = KVM_MSI_VALID_DEVID;
1997         kroute.u.msi.devid = pci_requester_id(dev);
1998     }
1999     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2000         kvm_irqchip_release_virq(s, virq);
2001         return -EINVAL;
2002     }
2003 
2004     trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
2005                                     vector, virq);
2006 
2007     kvm_add_routing_entry(s, &kroute);
2008     kvm_arch_add_msi_route_post(&kroute, vector, dev);
2009     c->changes++;
2010 
2011     return virq;
2012 }
2013 
2014 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2015                                  PCIDevice *dev)
2016 {
2017     struct kvm_irq_routing_entry kroute = {};
2018 
2019     if (kvm_gsi_direct_mapping()) {
2020         return 0;
2021     }
2022 
2023     if (!kvm_irqchip_in_kernel()) {
2024         return -ENOSYS;
2025     }
2026 
2027     kroute.gsi = virq;
2028     kroute.type = KVM_IRQ_ROUTING_MSI;
2029     kroute.flags = 0;
2030     kroute.u.msi.address_lo = (uint32_t)msg.address;
2031     kroute.u.msi.address_hi = msg.address >> 32;
2032     kroute.u.msi.data = le32_to_cpu(msg.data);
2033     if (pci_available && kvm_msi_devid_required()) {
2034         kroute.flags = KVM_MSI_VALID_DEVID;
2035         kroute.u.msi.devid = pci_requester_id(dev);
2036     }
2037     if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2038         return -EINVAL;
2039     }
2040 
2041     trace_kvm_irqchip_update_msi_route(virq);
2042 
2043     return kvm_update_routing_entry(s, &kroute);
2044 }
2045 
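     /*
      * Assign or deassign an irqfd that triggers 'virq'.  With a split
      * irqchip the resamplefd is tracked in userspace (see the block
      * comment below); otherwise it is passed to the kernel with
      * KVM_IRQFD_FLAG_RESAMPLE.  Returns -ENOSYS when irqfds are not
      * available.
      */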
2046 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2047                                     EventNotifier *resample, int virq,
2048                                     bool assign)
2049 {
2050     int fd = event_notifier_get_fd(event);
2051     int rfd = resample ? event_notifier_get_fd(resample) : -1;
2052 
2053     struct kvm_irqfd irqfd = {
2054         .fd = fd,
2055         .gsi = virq,
2056         .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2057     };
2058 
2059     if (rfd != -1) {
2060         assert(assign);
2061         if (kvm_irqchip_is_split()) {
2062             /*
2063              * When the slow irqchip (e.g. IOAPIC) is in
2064              * userspace, the KVM kernel resamplefd will not work because
2065              * the EOI of the interrupt will be delivered to userspace
2066              * instead, so the KVM kernel resamplefd kick will be
2067              * skipped.  Userspace here mimics what the kernel
2068              * provides with resamplefd: remember the resamplefd and
2069              * kick it when we receive the EOI of this IRQ.
2070              *
2071              * This is hackery because IOAPIC is mostly bypassed
2072              * (except EOI broadcasts) when irqfd is used.  However
2073              * this can bring much performance back for split irqchip
2074              * with INTx IRQs (for VFIO, this gives 93% perf of the
2075              * full fast path, which is a 46% perf boost compared to
2076              * the INTx slow path).
2077              */
2078             kvm_resample_fd_insert(virq, resample);
2079         } else {
2080             irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2081             irqfd.resamplefd = rfd;
2082         }
2083     } else if (!assign) {
2084         if (kvm_irqchip_is_split()) {
2085             kvm_resample_fd_remove(virq);
2086         }
2087     }
2088 
2089     if (!kvm_irqfds_enabled()) {
2090         return -ENOSYS;
2091     }
2092 
2093     return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2094 }
2095 
2096 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2097 {
2098     struct kvm_irq_routing_entry kroute = {};
2099     int virq;
2100 
2101     if (!kvm_gsi_routing_enabled()) {
2102         return -ENOSYS;
2103     }
2104 
2105     virq = kvm_irqchip_get_virq(s);
2106     if (virq < 0) {
2107         return virq;
2108     }
2109 
2110     kroute.gsi = virq;
2111     kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
2112     kroute.flags = 0;
2113     kroute.u.adapter.summary_addr = adapter->summary_addr;
2114     kroute.u.adapter.ind_addr = adapter->ind_addr;
2115     kroute.u.adapter.summary_offset = adapter->summary_offset;
2116     kroute.u.adapter.ind_offset = adapter->ind_offset;
2117     kroute.u.adapter.adapter_id = adapter->adapter_id;
2118 
2119     kvm_add_routing_entry(s, &kroute);
2120 
2121     return virq;
2122 }
2123 
2124 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2125 {
2126     struct kvm_irq_routing_entry kroute = {};
2127     int virq;
2128 
2129     if (!kvm_gsi_routing_enabled()) {
2130         return -ENOSYS;
2131     }
2132     if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
2133         return -ENOSYS;
2134     }
2135     virq = kvm_irqchip_get_virq(s);
2136     if (virq < 0) {
2137         return virq;
2138     }
2139 
2140     kroute.gsi = virq;
2141     kroute.type = KVM_IRQ_ROUTING_HV_SINT;
2142     kroute.flags = 0;
2143     kroute.u.hv_sint.vcpu = vcpu;
2144     kroute.u.hv_sint.sint = sint;
2145 
2146     kvm_add_routing_entry(s, &kroute);
2147     kvm_irqchip_commit_routes(s);
2148 
2149     return virq;
2150 }
2151 
2152 #else /* !KVM_CAP_IRQ_ROUTING */
2153 
2154 void kvm_init_irq_routing(KVMState *s)
2155 {
2156 }
2157 
2158 void kvm_irqchip_release_virq(KVMState *s, int virq)
2159 {
2160 }
2161 
2162 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2163 {
2164     abort();
2165 }
2166 
2167 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2168 {
2169     return -ENOSYS;
2170 }
2171 
2172 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2173 {
2174     return -ENOSYS;
2175 }
2176 
2177 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2178 {
2179     return -ENOSYS;
2180 }
2181 
2182 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2183                                     EventNotifier *resample, int virq,
2184                                     bool assign)
2185 {
2186     abort();
2187 }
2188 
2189 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                      PCIDevice *dev)
2190 {
2191     return -ENOSYS;
2192 }
2193 #endif /* !KVM_CAP_IRQ_ROUTING */
2194 
2195 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2196                                        EventNotifier *rn, int virq)
2197 {
2198     return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2199 }
2200 
2201 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2202                                           int virq)
2203 {
2204     return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2205 }
2206 
2207 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2208                                    EventNotifier *rn, qemu_irq irq)
2209 {
2210     gpointer key, gsi;
2211     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2212 
2213     if (!found) {
2214         return -ENXIO;
2215     }
2216     return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2217 }
2218 
2219 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2220                                       qemu_irq irq)
2221 {
2222     gpointer key, gsi;
2223     gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2224 
2225     if (!found) {
2226         return -ENXIO;
2227     }
2228     return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2229 }
2230 
2231 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2232 {
2233     g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2234 }
2235 
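     /*
      * Create the in-kernel irqchip if KVM advertises KVM_CAP_IRQCHIP (or
      * the s390 KVM_CAP_S390_IRQCHIP variant).  An arch-specific hook gets
      * the first chance to create it (e.g. for split irqchip); otherwise
      * fall back to KVM_CREATE_IRQCHIP.  On success, enable asynchronous
      * interrupt delivery and initialize GSI routing and the qemu_irq->GSI
      * map.
      */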
2236 static void kvm_irqchip_create(KVMState *s)
2237 {
2238     int ret;
2239 
2240     assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2241     if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2242         ;
2243     } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2244         ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2245         if (ret < 0) {
2246             fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2247             exit(1);
2248         }
2249     } else {
2250         return;
2251     }
2252 
2253     /* First probe and see if there's an arch-specific hook to create the
2254      * in-kernel irqchip for us */
2255     ret = kvm_arch_irqchip_create(s);
2256     if (ret == 0) {
2257         if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2258             error_report("Split IRQ chip mode not supported");
2259             exit(1);
2260         } else {
2261             ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2262         }
2263     }
2264     if (ret < 0) {
2265         fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2266         exit(1);
2267     }
2268 
2269     kvm_kernel_irqchip = true;
2270     /* If we have an in-kernel IRQ chip then we must have asynchronous
2271      * interrupt delivery (though the reverse is not necessarily true)
2272      */
2273     kvm_async_interrupts_allowed = true;
2274     kvm_halt_in_kernel_allowed = true;
2275 
2276     kvm_init_irq_routing(s);
2277 
2278     s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2279 }
2280 
2281 /* Find number of supported CPUs using the recommended
2282  * procedure from the kernel API documentation to cope with
2283  * older kernels that may be missing capabilities.
2284  */
2285 static int kvm_recommended_vcpus(KVMState *s)
2286 {
2287     int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2288     return (ret) ? ret : 4;
2289 }
2290 
2291 static int kvm_max_vcpus(KVMState *s)
2292 {
2293     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
2294     return (ret) ? ret : kvm_recommended_vcpus(s);
2295 }
2296 
2297 static int kvm_max_vcpu_id(KVMState *s)
2298 {
2299     int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2300     return (ret) ? ret : kvm_max_vcpus(s);
2301 }
2302 
2303 bool kvm_vcpu_id_is_valid(int vcpu_id)
2304 {
2305     KVMState *s = KVM_STATE(current_accel());
2306     return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2307 }
2308 
2309 bool kvm_dirty_ring_enabled(void)
2310 {
2311     return kvm_state->kvm_dirty_ring_size ? true : false;
2312 }
2313 
2314 static void query_stats_cb(StatsResultList **result, StatsTarget target,
2315                            strList *names, strList *targets, Error **errp);
2316 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2317 
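     /*
      * Accelerator init: open /dev/kvm, verify the API version, create the
      * VM (retrying on EINTR), sanity-check vcpu limits and required
      * capabilities, pick the dirty-page tracking mode (dirty ring when
      * configured and supported, bitmap otherwise), probe optional
      * capabilities, then run kvm_arch_init(), create the irqchip if
      * allowed, and register the memory listeners.
      */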
2318 static int kvm_init(MachineState *ms)
2319 {
2320     MachineClass *mc = MACHINE_GET_CLASS(ms);
2321     static const char upgrade_note[] =
2322         "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2323         "(see http://sourceforge.net/projects/kvm).\n";
2324     struct {
2325         const char *name;
2326         int num;
2327     } num_cpus[] = {
2328         { "SMP",          ms->smp.cpus },
2329         { "hotpluggable", ms->smp.max_cpus },
2330         { NULL, }
2331     }, *nc = num_cpus;
2332     int soft_vcpus_limit, hard_vcpus_limit;
2333     KVMState *s;
2334     const KVMCapabilityInfo *missing_cap;
2335     int ret;
2336     int type = 0;
2337     uint64_t dirty_log_manual_caps;
2338 
2339     qemu_mutex_init(&kml_slots_lock);
2340 
2341     s = KVM_STATE(ms->accelerator);
2342 
2343     /*
2344      * On systems where the kernel can support different base page
2345      * sizes, host page size may be different from TARGET_PAGE_SIZE,
2346      * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
2347      * page size for the system though.
2348      */
2349     assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
2350 
2351     s->sigmask_len = 8;
2352 
2353 #ifdef KVM_CAP_SET_GUEST_DEBUG
2354     QTAILQ_INIT(&s->kvm_sw_breakpoints);
2355 #endif
2356     QLIST_INIT(&s->kvm_parked_vcpus);
2357     s->fd = qemu_open_old("/dev/kvm", O_RDWR);
2358     if (s->fd == -1) {
2359         fprintf(stderr, "Could not access KVM kernel module: %m\n");
2360         ret = -errno;
2361         goto err;
2362     }
2363 
2364     ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2365     if (ret < KVM_API_VERSION) {
2366         if (ret >= 0) {
2367             ret = -EINVAL;
2368         }
2369         fprintf(stderr, "kvm version too old\n");
2370         goto err;
2371     }
2372 
2373     if (ret > KVM_API_VERSION) {
2374         ret = -EINVAL;
2375         fprintf(stderr, "kvm version not supported\n");
2376         goto err;
2377     }
2378 
2379     kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2380     s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2381 
2382     /* If unspecified, use the default value */
2383     if (!s->nr_slots) {
2384         s->nr_slots = 32;
2385     }
2386 
2387     s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2388     if (s->nr_as <= 1) {
2389         s->nr_as = 1;
2390     }
2391     s->as = g_new0(struct KVMAs, s->nr_as);
2392 
2393     if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2394         g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
2395                                                             "kvm-type",
2396                                                             &error_abort);
2397         type = mc->kvm_type(ms, kvm_type);
2398     } else if (mc->kvm_type) {
2399         type = mc->kvm_type(ms, NULL);
2400     }
2401 
2402     do {
2403         ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2404     } while (ret == -EINTR);
2405 
2406     if (ret < 0) {
2407         fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
2408                 strerror(-ret));
2409 
2410 #ifdef TARGET_S390X
2411         if (ret == -EINVAL) {
2412             fprintf(stderr,
2413                     "Host kernel setup problem detected. Please verify:\n");
2414             fprintf(stderr, "- for kernels supporting the switch_amode or"
2415                     " user_mode parameters, whether\n");
2416             fprintf(stderr,
2417                     "  user space is running in primary address space\n");
2418             fprintf(stderr,
2419                     "- for kernels supporting the vm.allocate_pgste sysctl, "
2420                     "whether it is enabled\n");
2421         }
2422 #elif defined(TARGET_PPC)
2423         if (ret == -EINVAL) {
2424             fprintf(stderr,
2425                     "PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2426                     (type == 2) ? "pr" : "hv");
2427         }
2428 #endif
2429         goto err;
2430     }
2431 
2432     s->vmfd = ret;
2433 
2434     /* check the vcpu limits */
2435     soft_vcpus_limit = kvm_recommended_vcpus(s);
2436     hard_vcpus_limit = kvm_max_vcpus(s);
2437 
2438     while (nc->name) {
2439         if (nc->num > soft_vcpus_limit) {
2440             warn_report("Number of %s cpus requested (%d) exceeds "
2441                         "the recommended cpus supported by KVM (%d)",
2442                         nc->name, nc->num, soft_vcpus_limit);
2443 
2444             if (nc->num > hard_vcpus_limit) {
2445                 fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
2446                         "the maximum cpus supported by KVM (%d)\n",
2447                         nc->name, nc->num, hard_vcpus_limit);
2448                 exit(1);
2449             }
2450         }
2451         nc++;
2452     }
2453 
2454     missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2455     if (!missing_cap) {
2456         missing_cap =
2457             kvm_check_extension_list(s, kvm_arch_required_capabilities);
2458     }
2459     if (missing_cap) {
2460         ret = -EINVAL;
2461         fprintf(stderr, "kvm does not support %s\n%s",
2462                 missing_cap->name, upgrade_note);
2463         goto err;
2464     }
2465 
2466     s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2467     s->coalesced_pio = s->coalesced_mmio &&
2468                        kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2469 
2470     /*
2471      * Enable KVM dirty ring if supported, otherwise fall back to
2472      * dirty logging mode
2473      */
2474     if (s->kvm_dirty_ring_size > 0) {
2475         uint64_t ring_bytes;
2476 
2477         ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn);
2478 
2479         /* Read the max supported pages */
2480         ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING);
2481         if (ret > 0) {
2482             if (ring_bytes > ret) {
2483                 error_report("KVM dirty ring size %" PRIu32 " too big "
2484                              "(maximum is %ld).  Please use a smaller value.",
2485                              s->kvm_dirty_ring_size,
2486                              (long)ret / sizeof(struct kvm_dirty_gfn));
2487                 ret = -EINVAL;
2488                 goto err;
2489             }
2490 
2491             ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes);
2492             if (ret) {
2493                 error_report("Enabling of KVM dirty ring failed: %s. "
2494                              "Suggested minimum value is 1024.", strerror(-ret));
2495                 goto err;
2496             }
2497 
2498             s->kvm_dirty_ring_bytes = ring_bytes;
2499         } else {
2500             warn_report("KVM dirty ring not available, using bitmap method");
2501             s->kvm_dirty_ring_size = 0;
2502         }
2503     }
2504 
2505     /*
2506      * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2507      * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2508      * page is wr-protected initially, which is at odds with how the kvm dirty
2509      * ring works - the dirty ring requires all pages to be wr-protected from
2510      * the start.  Enabling this feature for dirty ring causes data corruption.
2511      *
2512      * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2513      * we may expect a higher stall time when starting the migration.  In the
2514      * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2515      * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2516      * guest pages.
2517      */
2518     if (!s->kvm_dirty_ring_size) {
2519         dirty_log_manual_caps =
2520             kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2521         dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2522                                   KVM_DIRTY_LOG_INITIALLY_SET);
2523         s->manual_dirty_log_protect = dirty_log_manual_caps;
2524         if (dirty_log_manual_caps) {
2525             ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2526                                     dirty_log_manual_caps);
2527             if (ret) {
2528                 warn_report("Failed to enable capability %"PRIu64" of "
2529                             "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. "
2530                             "Falling back to the legacy mode.",
2531                             dirty_log_manual_caps);
2532                 s->manual_dirty_log_protect = 0;
2533             }
2534         }
2535     }
2536 
2537 #ifdef KVM_CAP_VCPU_EVENTS
2538     s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2539 #endif
2540 
2541     s->robust_singlestep =
2542         kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
2543 
2544 #ifdef KVM_CAP_DEBUGREGS
2545     s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
2546 #endif
2547 
2548     s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2549 
2550 #ifdef KVM_CAP_IRQ_ROUTING
2551     kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
2552 #endif
2553 
2554     s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
2555 
2556     s->irq_set_ioctl = KVM_IRQ_LINE;
2557     if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2558         s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2559     }
2560 
2561     kvm_readonly_mem_allowed =
2562         (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2563 
2564     kvm_eventfds_allowed =
2565         (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
2566 
2567     kvm_irqfds_allowed =
2568         (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
2569 
2570     kvm_resamplefds_allowed =
2571         (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2572 
2573     kvm_vm_attributes_allowed =
2574         (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2575 
2576     kvm_ioeventfd_any_length_allowed =
2577         (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
2578 
2579 #ifdef KVM_CAP_SET_GUEST_DEBUG
2580     kvm_has_guest_debug =
2581         (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
2582 #endif
2583 
2584     kvm_sstep_flags = 0;
2585     if (kvm_has_guest_debug) {
2586         kvm_sstep_flags = SSTEP_ENABLE;
2587 
2588 #if defined KVM_CAP_SET_GUEST_DEBUG2
2589         int guest_debug_flags =
2590             kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2591 
2592         if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2593             kvm_sstep_flags |= SSTEP_NOIRQ;
2594         }
2595 #endif
2596     }
2597 
2598     kvm_state = s;
2599 
2600     ret = kvm_arch_init(ms, s);
2601     if (ret < 0) {
2602         goto err;
2603     }
2604 
2605     if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2606         s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2607     }
2608 
2609     qemu_register_reset(kvm_unpoison_all, NULL);
2610 
2611     if (s->kernel_irqchip_allowed) {
2612         kvm_irqchip_create(s);
2613     }
2614 
2615     if (kvm_eventfds_allowed) {
2616         s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2617         s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2618     }
2619     s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2620     s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2621 
2622     kvm_memory_listener_register(s, &s->memory_listener,
2623                                  &address_space_memory, 0, "kvm-memory");
2624     if (kvm_eventfds_allowed) {
2625         memory_listener_register(&kvm_io_listener,
2626                                  &address_space_io);
2627     }
2628     memory_listener_register(&kvm_coalesced_pio_listener,
2629                              &address_space_io);
2630 
2631     s->many_ioeventfds = kvm_check_many_ioeventfds();
2632 
2633     s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2634     if (!s->sync_mmu) {
2635         ret = ram_block_discard_disable(true);
2636         assert(!ret);
2637     }
2638 
2639     if (s->kvm_dirty_ring_size) {
2640         ret = kvm_dirty_ring_reaper_init(s);
2641         if (ret) {
2642             goto err;
2643         }
2644     }
2645 
2646     if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2647         add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2648                             query_stats_schemas_cb);
2649     }
2650 
2651     return 0;
2652 
2653 err:
2654     assert(ret < 0);
2655     if (s->vmfd >= 0) {
2656         close(s->vmfd);
2657     }
2658     if (s->fd != -1) {
2659         close(s->fd);
2660     }
2661     g_free(s->memory_listener.slots);
2662 
2663     return ret;
2664 }
2665 
2666 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2667 {
2668     s->sigmask_len = sigmask_len;
2669 }
2670 
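     /*
      * Replay the batch of programmed I/O accesses described by a
      * KVM_EXIT_IO exit: 'count' accesses of 'size' bytes each, read from
      * or written to 'port' using the data area that follows kvm_run.
      */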
2671 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2672                           int size, uint32_t count)
2673 {
2674     int i;
2675     uint8_t *ptr = data;
2676 
2677     for (i = 0; i < count; i++) {
2678         address_space_rw(&address_space_io, port, attrs,
2679                          ptr, size,
2680                          direction == KVM_EXIT_IO_OUT);
2681         ptr += size;
2682     }
2683 }
2684 
2685 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2686 {
2687     fprintf(stderr, "KVM internal error. Suberror: %d\n",
2688             run->internal.suberror);
2689 
2690     if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
2691         int i;
2692 
2693         for (i = 0; i < run->internal.ndata; ++i) {
2694             fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2695                     i, (uint64_t)run->internal.data[i]);
2696         }
2697     }
2698     if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2699         fprintf(stderr, "emulation failure\n");
2700         if (!kvm_arch_stop_on_emulation_error(cpu)) {
2701             cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2702             return EXCP_INTERRUPT;
2703         }
2704     }
2705     /* FIXME: Should trigger a qmp message to let management know
2706      * something went wrong.
2707      */
2708     return -1;
2709 }
2710 
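     /*
      * Drain the coalesced MMIO ring shared with the kernel, replaying each
      * pending entry as a PIO or MMIO write.  coalesced_flush_in_progress
      * guards against re-entering via the writes themselves.
      */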
2711 void kvm_flush_coalesced_mmio_buffer(void)
2712 {
2713     KVMState *s = kvm_state;
2714 
2715     if (s->coalesced_flush_in_progress) {
2716         return;
2717     }
2718 
2719     s->coalesced_flush_in_progress = true;
2720 
2721     if (s->coalesced_mmio_ring) {
2722         struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2723         while (ring->first != ring->last) {
2724             struct kvm_coalesced_mmio *ent;
2725 
2726             ent = &ring->coalesced_mmio[ring->first];
2727 
2728             if (ent->pio == 1) {
2729                 address_space_write(&address_space_io, ent->phys_addr,
2730                                     MEMTXATTRS_UNSPECIFIED, ent->data,
2731                                     ent->len);
2732             } else {
2733                 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2734             }
2735             smp_wmb();
2736             ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2737         }
2738     }
2739 
2740     s->coalesced_flush_in_progress = false;
2741 }
2742 
2743 bool kvm_cpu_check_are_resettable(void)
2744 {
2745     return kvm_arch_cpu_check_are_resettable();
2746 }
2747 
2748 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2749 {
2750     if (!cpu->vcpu_dirty) {
2751         kvm_arch_get_registers(cpu);
2752         cpu->vcpu_dirty = true;
2753     }
2754 }
2755 
2756 void kvm_cpu_synchronize_state(CPUState *cpu)
2757 {
2758     if (!cpu->vcpu_dirty) {
2759         run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2760     }
2761 }
2762 
2763 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2764 {
2765     kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
2766     cpu->vcpu_dirty = false;
2767 }
2768 
2769 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2770 {
2771     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2772 }
2773 
2774 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2775 {
2776     kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
2777     cpu->vcpu_dirty = false;
2778 }
2779 
2780 void kvm_cpu_synchronize_post_init(CPUState *cpu)
2781 {
2782     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2783 }
2784 
2785 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2786 {
2787     cpu->vcpu_dirty = true;
2788 }
2789 
2790 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2791 {
2792     run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2793 }
2794 
2795 #ifdef KVM_HAVE_MCE_INJECTION
2796 static __thread void *pending_sigbus_addr;
2797 static __thread int pending_sigbus_code;
2798 static __thread bool have_sigbus_pending;
2799 #endif
2800 
2801 static void kvm_cpu_kick(CPUState *cpu)
2802 {
2803     qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2804 }
2805 
2806 static void kvm_cpu_kick_self(void)
2807 {
2808     if (kvm_immediate_exit) {
2809         kvm_cpu_kick(current_cpu);
2810     } else {
2811         qemu_cpu_kick_self();
2812     }
2813 }
2814 
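     /*
      * Consume any pending SIG_IPI left over from interrupting KVM_RUN on
      * kernels without KVM_CAP_IMMEDIATE_EXIT.  With immediate_exit
      * available, just clear the flag again before the next KVM_RUN.
      */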
2815 static void kvm_eat_signals(CPUState *cpu)
2816 {
2817     struct timespec ts = { 0, 0 };
2818     siginfo_t siginfo;
2819     sigset_t waitset;
2820     sigset_t chkset;
2821     int r;
2822 
2823     if (kvm_immediate_exit) {
2824         qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2825         /* Write kvm_run->immediate_exit before the cpu->exit_request
2826          * write in kvm_cpu_exec.
2827          */
2828         smp_wmb();
2829         return;
2830     }
2831 
2832     sigemptyset(&waitset);
2833     sigaddset(&waitset, SIG_IPI);
2834 
2835     do {
2836         r = sigtimedwait(&waitset, &siginfo, &ts);
2837         if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
2838             perror("sigtimedwait");
2839             exit(1);
2840         }
2841 
2842         r = sigpending(&chkset);
2843         if (r == -1) {
2844             perror("sigpending");
2845             exit(1);
2846         }
2847     } while (sigismember(&chkset, SIG_IPI));
2848 }
2849 
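     /*
      * Main vcpu execution loop: flush dirty register state to KVM, call
      * KVM_RUN outside the BQL and dispatch each exit reason, looping until
      * an exit requires returning to the main loop.  Returns an EXCP_* code
      * or a negative value on fatal errors (which also stops the VM).
      */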
2850 int kvm_cpu_exec(CPUState *cpu)
2851 {
2852     struct kvm_run *run = cpu->kvm_run;
2853     int ret, run_ret;
2854 
2855     DPRINTF("kvm_cpu_exec()\n");
2856 
2857     if (kvm_arch_process_async_events(cpu)) {
2858         qatomic_set(&cpu->exit_request, 0);
2859         return EXCP_HLT;
2860     }
2861 
2862     qemu_mutex_unlock_iothread();
2863     cpu_exec_start(cpu);
2864 
2865     do {
2866         MemTxAttrs attrs;
2867 
2868         if (cpu->vcpu_dirty) {
2869             kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
2870             cpu->vcpu_dirty = false;
2871         }
2872 
2873         kvm_arch_pre_run(cpu, run);
2874         if (qatomic_read(&cpu->exit_request)) {
2875             DPRINTF("interrupt exit requested\n");
2876             /*
2877              * KVM requires us to reenter the kernel after IO exits to complete
2878              * instruction emulation. This self-signal will ensure that we
2879              * leave ASAP again.
2880              */
2881             kvm_cpu_kick_self();
2882         }
2883 
2884         /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
2885          * Matching barrier in kvm_eat_signals.
2886          */
2887         smp_rmb();
2888 
2889         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
2890 
2891         attrs = kvm_arch_post_run(cpu, run);
2892 
2893 #ifdef KVM_HAVE_MCE_INJECTION
2894         if (unlikely(have_sigbus_pending)) {
2895             qemu_mutex_lock_iothread();
2896             kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
2897                                     pending_sigbus_addr);
2898             have_sigbus_pending = false;
2899             qemu_mutex_unlock_iothread();
2900         }
2901 #endif
2902 
2903         if (run_ret < 0) {
2904             if (run_ret == -EINTR || run_ret == -EAGAIN) {
2905                 DPRINTF("io window exit\n");
2906                 kvm_eat_signals(cpu);
2907                 ret = EXCP_INTERRUPT;
2908                 break;
2909             }
2910             fprintf(stderr, "error: kvm run failed %s\n",
2911                     strerror(-run_ret));
2912 #ifdef TARGET_PPC
2913             if (run_ret == -EBUSY) {
2914                 fprintf(stderr,
2915                         "This is probably because your SMT is enabled.\n"
2916                         "VCPU can only run on primary threads with all "
2917                         "secondary threads offline.\n");
2918             }
2919 #endif
2920             ret = -1;
2921             break;
2922         }
2923 
2924         trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
2925         switch (run->exit_reason) {
2926         case KVM_EXIT_IO:
2927             DPRINTF("handle_io\n");
2928             /* Called outside BQL */
2929             kvm_handle_io(run->io.port, attrs,
2930                           (uint8_t *)run + run->io.data_offset,
2931                           run->io.direction,
2932                           run->io.size,
2933                           run->io.count);
2934             ret = 0;
2935             break;
2936         case KVM_EXIT_MMIO:
2937             DPRINTF("handle_mmio\n");
2938             /* Called outside BQL */
2939             address_space_rw(&address_space_memory,
2940                              run->mmio.phys_addr, attrs,
2941                              run->mmio.data,
2942                              run->mmio.len,
2943                              run->mmio.is_write);
2944             ret = 0;
2945             break;
2946         case KVM_EXIT_IRQ_WINDOW_OPEN:
2947             DPRINTF("irq_window_open\n");
2948             ret = EXCP_INTERRUPT;
2949             break;
2950         case KVM_EXIT_SHUTDOWN:
2951             DPRINTF("shutdown\n");
2952             qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2953             ret = EXCP_INTERRUPT;
2954             break;
2955         case KVM_EXIT_UNKNOWN:
2956             fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
2957                     (uint64_t)run->hw.hardware_exit_reason);
2958             ret = -1;
2959             break;
2960         case KVM_EXIT_INTERNAL_ERROR:
2961             ret = kvm_handle_internal_error(cpu, run);
2962             break;
2963         case KVM_EXIT_DIRTY_RING_FULL:
2964             /*
2965              * We shouldn't continue if the dirty ring of this vcpu is
2966              * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
2967              */
2968             trace_kvm_dirty_ring_full(cpu->cpu_index);
2969             qemu_mutex_lock_iothread();
2970             kvm_dirty_ring_reap(kvm_state);
2971             qemu_mutex_unlock_iothread();
2972             ret = 0;
2973             break;
2974         case KVM_EXIT_SYSTEM_EVENT:
2975             switch (run->system_event.type) {
2976             case KVM_SYSTEM_EVENT_SHUTDOWN:
2977                 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
2978                 ret = EXCP_INTERRUPT;
2979                 break;
2980             case KVM_SYSTEM_EVENT_RESET:
2981                 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2982                 ret = EXCP_INTERRUPT;
2983                 break;
2984             case KVM_SYSTEM_EVENT_CRASH:
2985                 kvm_cpu_synchronize_state(cpu);
2986                 qemu_mutex_lock_iothread();
2987                 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2988                 qemu_mutex_unlock_iothread();
2989                 ret = 0;
2990                 break;
2991             default:
2992                 DPRINTF("kvm_arch_handle_exit\n");
2993                 ret = kvm_arch_handle_exit(cpu, run);
2994                 break;
2995             }
2996             break;
2997         default:
2998             DPRINTF("kvm_arch_handle_exit\n");
2999             ret = kvm_arch_handle_exit(cpu, run);
3000             break;
3001         }
3002     } while (ret == 0);
3003 
3004     cpu_exec_end(cpu);
3005     qemu_mutex_lock_iothread();
3006 
3007     if (ret < 0) {
3008         cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
3009         vm_stop(RUN_STATE_INTERNAL_ERROR);
3010     }
3011 
3012     qatomic_set(&cpu->exit_request, 0);
3013     return ret;
3014 }
3015 
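     /*
      * Thin ioctl wrappers for the /dev/kvm, VM, vcpu and device file
      * descriptors: trace the call and convert a -1 return into -errno.
      */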
3016 int kvm_ioctl(KVMState *s, int type, ...)
3017 {
3018     int ret;
3019     void *arg;
3020     va_list ap;
3021 
3022     va_start(ap, type);
3023     arg = va_arg(ap, void *);
3024     va_end(ap);
3025 
3026     trace_kvm_ioctl(type, arg);
3027     ret = ioctl(s->fd, type, arg);
3028     if (ret == -1) {
3029         ret = -errno;
3030     }
3031     return ret;
3032 }
3033 
3034 int kvm_vm_ioctl(KVMState *s, int type, ...)
3035 {
3036     int ret;
3037     void *arg;
3038     va_list ap;
3039 
3040     va_start(ap, type);
3041     arg = va_arg(ap, void *);
3042     va_end(ap);
3043 
3044     trace_kvm_vm_ioctl(type, arg);
3045     ret = ioctl(s->vmfd, type, arg);
3046     if (ret == -1) {
3047         ret = -errno;
3048     }
3049     return ret;
3050 }
3051 
3052 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
3053 {
3054     int ret;
3055     void *arg;
3056     va_list ap;
3057 
3058     va_start(ap, type);
3059     arg = va_arg(ap, void *);
3060     va_end(ap);
3061 
3062     trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3063     ret = ioctl(cpu->kvm_fd, type, arg);
3064     if (ret == -1) {
3065         ret = -errno;
3066     }
3067     return ret;
3068 }
3069 
3070 int kvm_device_ioctl(int fd, int type, ...)
3071 {
3072     int ret;
3073     void *arg;
3074     va_list ap;
3075 
3076     va_start(ap, type);
3077     arg = va_arg(ap, void *);
3078     va_end(ap);
3079 
3080     trace_kvm_device_ioctl(fd, type, arg);
3081     ret = ioctl(fd, type, arg);
3082     if (ret == -1) {
3083         ret = -errno;
3084     }
3085     return ret;
3086 }
3087 
3088 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3089 {
3090     int ret;
3091     struct kvm_device_attr attribute = {
3092         .group = group,
3093         .attr = attr,
3094     };
3095 
3096     if (!kvm_vm_attributes_allowed) {
3097         return 0;
3098     }
3099 
3100     ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3101     /* kvm returns 0 on success for HAS_DEVICE_ATTR */
3102     return ret ? 0 : 1;
3103 }
3104 
3105 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3106 {
3107     struct kvm_device_attr attribute = {
3108         .group = group,
3109         .attr = attr,
3110         .flags = 0,
3111     };
3112 
3113     return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3114 }
3115 
3116 int kvm_device_access(int fd, int group, uint64_t attr,
3117                       void *val, bool write, Error **errp)
3118 {
3119     struct kvm_device_attr kvmattr;
3120     int err;
3121 
3122     kvmattr.flags = 0;
3123     kvmattr.group = group;
3124     kvmattr.attr = attr;
3125     kvmattr.addr = (uintptr_t)val;
3126 
3127     err = kvm_device_ioctl(fd,
3128                            write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3129                            &kvmattr);
3130     if (err < 0) {
3131         error_setg_errno(errp, -err,
3132                          "KVM_%s_DEVICE_ATTR failed: Group %d "
3133                          "attr 0x%016" PRIx64,
3134                          write ? "SET" : "GET", group, attr);
3135     }
3136     return err;
3137 }
3138 
3139 bool kvm_has_sync_mmu(void)
3140 {
3141     return kvm_state->sync_mmu;
3142 }
3143 
3144 int kvm_has_vcpu_events(void)
3145 {
3146     return kvm_state->vcpu_events;
3147 }
3148 
3149 int kvm_has_robust_singlestep(void)
3150 {
3151     return kvm_state->robust_singlestep;
3152 }
3153 
3154 int kvm_has_debugregs(void)
3155 {
3156     return kvm_state->debugregs;
3157 }
3158 
3159 int kvm_max_nested_state_length(void)
3160 {
3161     return kvm_state->max_nested_state_len;
3162 }
3163 
3164 int kvm_has_many_ioeventfds(void)
3165 {
3166     if (!kvm_enabled()) {
3167         return 0;
3168     }
3169     return kvm_state->many_ioeventfds;
3170 }
3171 
3172 int kvm_has_gsi_routing(void)
3173 {
3174 #ifdef KVM_CAP_IRQ_ROUTING
3175     return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
3176 #else
3177     return false;
3178 #endif
3179 }
3180 
3181 int kvm_has_intx_set_mask(void)
3182 {
3183     return kvm_state->intx_set_mask;
3184 }
3185 
3186 bool kvm_arm_supports_user_irq(void)
3187 {
3188     return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3189 }
3190 
3191 #ifdef KVM_CAP_SET_GUEST_DEBUG
3192 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
3193                                                  target_ulong pc)
3194 {
3195     struct kvm_sw_breakpoint *bp;
3196 
3197     QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3198         if (bp->pc == pc) {
3199             return bp;
3200         }
3201     }
3202     return NULL;
3203 }
3204 
3205 int kvm_sw_breakpoints_active(CPUState *cpu)
3206 {
3207     return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3208 }
3209 
3210 struct kvm_set_guest_debug_data {
3211     struct kvm_guest_debug dbg;
3212     int err;
3213 };
3214 
3215 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3216 {
3217     struct kvm_set_guest_debug_data *dbg_data =
3218         (struct kvm_set_guest_debug_data *) data.host_ptr;
3219 
3220     dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3221                                    &dbg_data->dbg);
3222 }
3223 
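     /*
      * Build the kvm_guest_debug control flags from the reinject request
      * and the vcpu's single-step state, let the architecture add its own
      * bits, then apply them on the vcpu thread via KVM_SET_GUEST_DEBUG.
      */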
3224 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3225 {
3226     struct kvm_set_guest_debug_data data;
3227 
3228     data.dbg.control = reinject_trap;
3229 
3230     if (cpu->singlestep_enabled) {
3231         data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3232 
3233         if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3234             data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3235         }
3236     }
3237     kvm_arch_update_guest_debug(cpu, &data.dbg);
3238 
3239     run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3240                RUN_ON_CPU_HOST_PTR(&data));
3241     return data.err;
3242 }
3243 
3244 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
3245                           target_ulong len, int type)
3246 {
3247     struct kvm_sw_breakpoint *bp;
3248     int err;
3249 
3250     if (type == GDB_BREAKPOINT_SW) {
3251         bp = kvm_find_sw_breakpoint(cpu, addr);
3252         if (bp) {
3253             bp->use_count++;
3254             return 0;
3255         }
3256 
3257         bp = g_new(struct kvm_sw_breakpoint, 1);
3258         bp->pc = addr;
3259         bp->use_count = 1;
3260         err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3261         if (err) {
3262             g_free(bp);
3263             return err;
3264         }
3265 
3266         QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3267     } else {
3268         err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3269         if (err) {
3270             return err;
3271         }
3272     }
3273 
3274     CPU_FOREACH(cpu) {
3275         err = kvm_update_guest_debug(cpu, 0);
3276         if (err) {
3277             return err;
3278         }
3279     }
3280     return 0;
3281 }
3282 
3283 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
3284                           target_ulong len, int type)
3285 {
3286     struct kvm_sw_breakpoint *bp;
3287     int err;
3288 
3289     if (type == GDB_BREAKPOINT_SW) {
3290         bp = kvm_find_sw_breakpoint(cpu, addr);
3291         if (!bp) {
3292             return -ENOENT;
3293         }
3294 
3295         if (bp->use_count > 1) {
3296             bp->use_count--;
3297             return 0;
3298         }
3299 
3300         err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3301         if (err) {
3302             return err;
3303         }
3304 
3305         QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3306         g_free(bp);
3307     } else {
3308         err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3309         if (err) {
3310             return err;
3311         }
3312     }
3313 
3314     CPU_FOREACH(cpu) {
3315         err = kvm_update_guest_debug(cpu, 0);
3316         if (err) {
3317             return err;
3318         }
3319     }
3320     return 0;
3321 }
3322 
3323 void kvm_remove_all_breakpoints(CPUState *cpu)
3324 {
3325     struct kvm_sw_breakpoint *bp, *next;
3326     KVMState *s = cpu->kvm_state;
3327     CPUState *tmpcpu;
3328 
3329     QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3330         if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3331             /* Try harder to find a CPU that currently sees the breakpoint. */
3332             CPU_FOREACH(tmpcpu) {
3333                 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3334                     break;
3335                 }
3336             }
3337         }
3338         QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3339         g_free(bp);
3340     }
3341     kvm_arch_remove_all_hw_breakpoints();
3342 
3343     CPU_FOREACH(cpu) {
3344         kvm_update_guest_debug(cpu, 0);
3345     }
3346 }
3347 
3348 #else /* !KVM_CAP_SET_GUEST_DEBUG */
3349 
3350 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3351 {
3352     return -EINVAL;
3353 }
3354 
3355 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
3356                           target_ulong len, int type)
3357 {
3358     return -EINVAL;
3359 }
3360 
3361 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
3362                           target_ulong len, int type)
3363 {
3364     return -EINVAL;
3365 }
3366 
3367 void kvm_remove_all_breakpoints(CPUState *cpu)
3368 {
3369 }
3370 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
3371 
3372 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3373 {
3374     KVMState *s = kvm_state;
3375     struct kvm_signal_mask *sigmask;
3376     int r;
3377 
3378     sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3379 
3380     sigmask->len = s->sigmask_len;
3381     memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3382     r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3383     g_free(sigmask);
3384 
3385     return r;
3386 }
3387 
3388 static void kvm_ipi_signal(int sig)
3389 {
3390     if (current_cpu) {
3391         assert(kvm_immediate_exit);
3392         kvm_cpu_kick(current_cpu);
3393     }
3394 }
3395 
3396 void kvm_init_cpu_signals(CPUState *cpu)
3397 {
3398     int r;
3399     sigset_t set;
3400     struct sigaction sigact;
3401 
3402     memset(&sigact, 0, sizeof(sigact));
3403     sigact.sa_handler = kvm_ipi_signal;
3404     sigaction(SIG_IPI, &sigact, NULL);
3405 
3406     pthread_sigmask(SIG_BLOCK, NULL, &set);
3407 #if defined KVM_HAVE_MCE_INJECTION
3408     sigdelset(&set, SIGBUS);
3409     pthread_sigmask(SIG_SETMASK, &set, NULL);
3410 #endif
3411     sigdelset(&set, SIG_IPI);
3412     if (kvm_immediate_exit) {
3413         r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3414     } else {
3415         r = kvm_set_signal_mask(cpu, &set);
3416     }
3417     if (r) {
3418         fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3419         exit(1);
3420     }
3421 }
3422 
3423 /* Called asynchronously in VCPU thread.  */
3424 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3425 {
3426 #ifdef KVM_HAVE_MCE_INJECTION
3427     if (have_sigbus_pending) {
3428         return 1;
3429     }
3430     have_sigbus_pending = true;
3431     pending_sigbus_addr = addr;
3432     pending_sigbus_code = code;
3433     qatomic_set(&cpu->exit_request, 1);
3434     return 0;
3435 #else
3436     return 1;
3437 #endif
3438 }
3439 
3440 /* Called synchronously (via signalfd) in main thread.  */
3441 int kvm_on_sigbus(int code, void *addr)
3442 {
3443 #ifdef KVM_HAVE_MCE_INJECTION
3444     /* Action required MCE kills the process if SIGBUS is blocked.  Because
3445      * that's what happens in the I/O thread, where we handle MCE via signalfd,
3446      * we can only get action optional here.
3447      */
3448     assert(code != BUS_MCEERR_AR);
3449     kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
3450     return 0;
3451 #else
3452     return 1;
3453 #endif
3454 }
3455 
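     /*
      * Create an in-kernel device via KVM_CREATE_DEVICE, or merely probe
      * for support when 'test' is true.  Returns the new device fd (0 in
      * test mode) or a negative errno.
      */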
3456 int kvm_create_device(KVMState *s, uint64_t type, bool test)
3457 {
3458     int ret;
3459     struct kvm_create_device create_dev;
3460 
3461     create_dev.type = type;
3462     create_dev.fd = -1;
3463     create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3464 
3465     if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3466         return -ENOTSUP;
3467     }
3468 
3469     ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3470     if (ret) {
3471         return ret;
3472     }
3473 
3474     return test ? 0 : create_dev.fd;
3475 }
3476 
3477 bool kvm_device_supported(int vmfd, uint64_t type)
3478 {
3479     struct kvm_create_device create_dev = {
3480         .type = type,
3481         .fd = -1,
3482         .flags = KVM_CREATE_DEVICE_TEST,
3483     };
3484 
3485     if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3486         return false;
3487     }
3488 
3489     return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3490 }
3491 
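/*
 * Thin wrappers around the KVM_SET_ONE_REG / KVM_GET_ONE_REG vcpu ioctls.
 * 'id' is an architecture-specific KVM_REG_* identifier that also encodes
 * the register size; 'source'/'target' point to a host buffer of that size.
 * Failures are reported through trace events and the negative errno from
 * the ioctl is returned.
 *
 * Sketch (the register id below is a placeholder, not a real encoding):
 *
 *     uint64_t val;
 *     int r = kvm_get_one_reg(cs, some_kvm_reg_id, &val);
 */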
3492 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3493 {
3494     struct kvm_one_reg reg;
3495     int r;
3496 
3497     reg.id = id;
3498     reg.addr = (uintptr_t) source;
3499     r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3500     if (r) {
3501         trace_kvm_failed_reg_set(id, strerror(-r));
3502     }
3503     return r;
3504 }
3505 
3506 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3507 {
3508     struct kvm_one_reg reg;
3509     int r;
3510 
3511     reg.id = id;
3512     reg.addr = (uintptr_t) target;
3513     r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3514     if (r) {
3515         trace_kvm_failed_reg_get(id, strerror(-r));
3516     }
3517     return r;
3518 }
3519 
3520 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3521                                  hwaddr start_addr, hwaddr size)
3522 {
3523     KVMState *kvm = KVM_STATE(ms->accelerator);
3524     int i;
3525 
3526     for (i = 0; i < kvm->nr_as; ++i) {
3527         if (kvm->as[i].as == as && kvm->as[i].ml) {
3528             size = MIN(kvm_max_slot_size, size);
3529             return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3530                                                     start_addr, size);
3531         }
3532     }
3533 
3534     return false;
3535 }
3536 
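/*
 * QOM property accessors for the "kvm" accelerator object; the properties
 * themselves are registered in kvm_accel_class_init() below.  The setters
 * bail out once s->fd is valid, i.e. after the accelerator has been
 * initialized, so the values can only be changed before kvm_init() runs.
 */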
3537 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3538                                    const char *name, void *opaque,
3539                                    Error **errp)
3540 {
3541     KVMState *s = KVM_STATE(obj);
3542     int64_t value = s->kvm_shadow_mem;
3543 
3544     visit_type_int(v, name, &value, errp);
3545 }
3546 
3547 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3548                                    const char *name, void *opaque,
3549                                    Error **errp)
3550 {
3551     KVMState *s = KVM_STATE(obj);
3552     int64_t value;
3553 
3554     if (s->fd != -1) {
3555         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3556         return;
3557     }
3558 
3559     if (!visit_type_int(v, name, &value, errp)) {
3560         return;
3561     }
3562 
3563     s->kvm_shadow_mem = value;
3564 }
3565 
3566 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3567                                    const char *name, void *opaque,
3568                                    Error **errp)
3569 {
3570     KVMState *s = KVM_STATE(obj);
3571     OnOffSplit mode;
3572 
3573     if (s->fd != -1) {
3574         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3575         return;
3576     }
3577 
3578     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
3579         return;
3580     }
3581     switch (mode) {
3582     case ON_OFF_SPLIT_ON:
3583         s->kernel_irqchip_allowed = true;
3584         s->kernel_irqchip_required = true;
3585         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3586         break;
3587     case ON_OFF_SPLIT_OFF:
3588         s->kernel_irqchip_allowed = false;
3589         s->kernel_irqchip_required = false;
3590         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3591         break;
3592     case ON_OFF_SPLIT_SPLIT:
3593         s->kernel_irqchip_allowed = true;
3594         s->kernel_irqchip_required = true;
3595         s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3596         break;
3597     default:
3598         /* The value was checked in visit_type_OnOffSplit() above. If
3599          * we get here, then something is wrong in QEMU.
3600          */
3601         abort();
3602     }
3603 }
3604 
3605 bool kvm_kernel_irqchip_allowed(void)
3606 {
3607     return kvm_state->kernel_irqchip_allowed;
3608 }
3609 
3610 bool kvm_kernel_irqchip_required(void)
3611 {
3612     return kvm_state->kernel_irqchip_required;
3613 }
3614 
3615 bool kvm_kernel_irqchip_split(void)
3616 {
3617     return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
3618 }
3619 
3620 static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
3621                                     const char *name, void *opaque,
3622                                     Error **errp)
3623 {
3624     KVMState *s = KVM_STATE(obj);
3625     uint32_t value = s->kvm_dirty_ring_size;
3626 
3627     visit_type_uint32(v, name, &value, errp);
3628 }
3629 
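/*
 * "dirty-ring-size" property: 0 (the default set in kvm_accel_instance_init()
 * below) keeps the dirty ring disabled and dirty logging falls back to the
 * bitmap; any other value must be a power of two.  On the command line this
 * is typically given as e.g. "-accel kvm,dirty-ring-size=4096".
 */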
3630 static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
3631                                     const char *name, void *opaque,
3632                                     Error **errp)
3633 {
3634     KVMState *s = KVM_STATE(obj);
3635     Error *error = NULL;
3636     uint32_t value;
3637 
3638     if (s->fd != -1) {
3639         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3640         return;
3641     }
3642 
3643     visit_type_uint32(v, name, &value, &error);
3644     if (error) {
3645         error_propagate(errp, error);
3646         return;
3647     }
3648     if (value & (value - 1)) {
3649         error_setg(errp, "dirty-ring-size must be a power of two");
3650         return;
3651     }
3652 
3653     s->kvm_dirty_ring_size = value;
3654 }
3655 
3656 static void kvm_accel_instance_init(Object *obj)
3657 {
3658     KVMState *s = KVM_STATE(obj);
3659 
3660     s->fd = -1;
3661     s->vmfd = -1;
3662     s->kvm_shadow_mem = -1;
3663     s->kernel_irqchip_allowed = true;
3664     s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
3665     /* KVM dirty ring is by default off */
3666     s->kvm_dirty_ring_size = 0;
3667 }
3668 
3669 static void kvm_accel_class_init(ObjectClass *oc, void *data)
3670 {
3671     AccelClass *ac = ACCEL_CLASS(oc);
3672     ac->name = "KVM";
3673     ac->init_machine = kvm_init;
3674     ac->has_memory = kvm_accel_has_memory;
3675     ac->allowed = &kvm_allowed;
3676 
3677     object_class_property_add(oc, "kernel-irqchip", "on|off|split",
3678         NULL, kvm_set_kernel_irqchip,
3679         NULL, NULL);
3680     object_class_property_set_description(oc, "kernel-irqchip",
3681         "Configure KVM in-kernel irqchip");
3682 
3683     object_class_property_add(oc, "kvm-shadow-mem", "int",
3684         kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
3685         NULL, NULL);
3686     object_class_property_set_description(oc, "kvm-shadow-mem",
3687         "KVM shadow MMU size");
3688 
3689     object_class_property_add(oc, "dirty-ring-size", "uint32",
3690         kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
3691         NULL, NULL);
3692     object_class_property_set_description(oc, "dirty-ring-size",
3693         "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
3694 }
3695 
3696 static const TypeInfo kvm_accel_type = {
3697     .name = TYPE_KVM_ACCEL,
3698     .parent = TYPE_ACCEL,
3699     .instance_init = kvm_accel_instance_init,
3700     .class_init = kvm_accel_class_init,
3701     .instance_size = sizeof(KVMState),
3702 };
3703 
3704 static void kvm_type_init(void)
3705 {
3706     type_register_static(&kvm_accel_type);
3707 }
3708 
3709 type_init(kvm_type_init);
3710 
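/*
 * Support for the KVM binary stats interface (KVM_GET_STATS_FD).  The fd
 * returned by the ioctl exposes a read-only file that starts with a
 * struct kvm_stats_header, followed by num_desc descriptors of
 * sizeof(struct kvm_stats_desc) + name_size bytes each at desc_offset, and
 * the raw 64-bit values at data_offset.  The helpers below parse that
 * layout into the QAPI Stats/StatsSchema representation.
 */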
3711 typedef struct StatsArgs {
3712     union StatsResultsType {
3713         StatsResultList **stats;
3714         StatsSchemaList **schema;
3715     } result;
3716     strList *names;
3717     Error **errp;
3718 } StatsArgs;
3719 
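/*
 * Convert one stats descriptor plus its raw values into a Stats entry and
 * prepend it to 'stats_list'.  Descriptors with a type, unit or base that
 * QEMU does not understand are skipped by returning the list unchanged;
 * single-element stats become scalars, larger ones a uint64 list.
 */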
3720 static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
3721                                     uint64_t *stats_data,
3722                                     StatsList *stats_list,
3723                                     Error **errp)
3724 {
3726     Stats *stats;
3727     uint64List *val_list = NULL;
3728 
3729     /* Only add stats that we understand.  */
3730     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
3731     case KVM_STATS_TYPE_CUMULATIVE:
3732     case KVM_STATS_TYPE_INSTANT:
3733     case KVM_STATS_TYPE_PEAK:
3734     case KVM_STATS_TYPE_LINEAR_HIST:
3735     case KVM_STATS_TYPE_LOG_HIST:
3736         break;
3737     default:
3738         return stats_list;
3739     }
3740 
3741     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
3742     case KVM_STATS_UNIT_NONE:
3743     case KVM_STATS_UNIT_BYTES:
3744     case KVM_STATS_UNIT_CYCLES:
3745     case KVM_STATS_UNIT_SECONDS:
3746         break;
3747     default:
3748         return stats_list;
3749     }
3750 
3751     switch (pdesc->flags & KVM_STATS_BASE_MASK) {
3752     case KVM_STATS_BASE_POW10:
3753     case KVM_STATS_BASE_POW2:
3754         break;
3755     default:
3756         return stats_list;
3757     }
3758 
3759     /* Alloc and populate data list */
3760     stats = g_new0(Stats, 1);
3761     stats->name = g_strdup(pdesc->name);
3762     stats->value = g_new0(StatsValue, 1);
3763 
3764     if (pdesc->size == 1) {
3765         stats->value->u.scalar = *stats_data;
3766         stats->value->type = QTYPE_QNUM;
3767     } else {
3768         int i;
3769         for (i = 0; i < pdesc->size; i++) {
3770             QAPI_LIST_PREPEND(val_list, stats_data[i]);
3771         }
3772         stats->value->u.list = val_list;
3773         stats->value->type = QTYPE_QLIST;
3774     }
3775 
3776     QAPI_LIST_PREPEND(stats_list, stats);
3777     return stats_list;
3778 }
3779 
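/*
 * Translate one stats descriptor into a StatsSchemaValue entry, again
 * skipping (returning 'list' unchanged) any type/unit/base combination
 * that is not understood.  Linear histograms carry their bucket size,
 * and a non-zero exponent is reported together with its base.
 */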
3780 static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
3781                                                  StatsSchemaValueList *list,
3782                                                  Error **errp)
3783 {
3784     StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
3785     schema_entry->value = g_new0(StatsSchemaValue, 1);
3786 
3787     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
3788     case KVM_STATS_TYPE_CUMULATIVE:
3789         schema_entry->value->type = STATS_TYPE_CUMULATIVE;
3790         break;
3791     case KVM_STATS_TYPE_INSTANT:
3792         schema_entry->value->type = STATS_TYPE_INSTANT;
3793         break;
3794     case KVM_STATS_TYPE_PEAK:
3795         schema_entry->value->type = STATS_TYPE_PEAK;
3796         break;
3797     case KVM_STATS_TYPE_LINEAR_HIST:
3798         schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
3799         schema_entry->value->bucket_size = pdesc->bucket_size;
3800         schema_entry->value->has_bucket_size = true;
3801         break;
3802     case KVM_STATS_TYPE_LOG_HIST:
3803         schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
3804         break;
3805     default:
3806         goto exit;
3807     }
3808 
3809     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
3810     case KVM_STATS_UNIT_NONE:
3811         break;
3812     case KVM_STATS_UNIT_BYTES:
3813         schema_entry->value->has_unit = true;
3814         schema_entry->value->unit = STATS_UNIT_BYTES;
3815         break;
3816     case KVM_STATS_UNIT_CYCLES:
3817         schema_entry->value->has_unit = true;
3818         schema_entry->value->unit = STATS_UNIT_CYCLES;
3819         break;
3820     case KVM_STATS_UNIT_SECONDS:
3821         schema_entry->value->has_unit = true;
3822         schema_entry->value->unit = STATS_UNIT_SECONDS;
3823         break;
3824     default:
3825         goto exit;
3826     }
3827 
3828     schema_entry->value->exponent = pdesc->exponent;
3829     if (pdesc->exponent) {
3830         switch (pdesc->flags & KVM_STATS_BASE_MASK) {
3831         case KVM_STATS_BASE_POW10:
3832             schema_entry->value->has_base = true;
3833             schema_entry->value->base = 10;
3834             break;
3835         case KVM_STATS_BASE_POW2:
3836             schema_entry->value->has_base = true;
3837             schema_entry->value->base = 2;
3838             break;
3839         default:
3840             goto exit;
3841         }
3842     }
3843 
3844     schema_entry->value->name = g_strdup(pdesc->name);
3845     schema_entry->next = list;
3846     return schema_entry;
3847 exit:
3848     g_free(schema_entry->value);
3849     g_free(schema_entry);
3850     return list;
3851 }
3852 
3853 /* Cached stats descriptors */
3854 typedef struct StatsDescriptors {
3855     const char *ident; /* cache key, currently the StatsTarget */
3856     struct kvm_stats_desc *kvm_stats_desc;
3857     struct kvm_stats_header *kvm_stats_header;
3858     QTAILQ_ENTRY(StatsDescriptors) next;
3859 } StatsDescriptors;
3860 
3861 static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
3862     QTAILQ_HEAD_INITIALIZER(stats_descriptors);
3863 
3864 /*
3865  * Return the descriptors for 'target', which either have already been read
3866  * or are retrieved from 'stats_fd'.
3867  */
3868 static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
3869                                                 Error **errp)
3870 {
3871     StatsDescriptors *descriptors;
3872     const char *ident;
3873     struct kvm_stats_desc *kvm_stats_desc;
3874     struct kvm_stats_header *kvm_stats_header;
3875     size_t size_desc;
3876     ssize_t ret;
3877 
3878     ident = StatsTarget_str(target);
3879     QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
3880         if (g_str_equal(descriptors->ident, ident)) {
3881             return descriptors;
3882         }
3883     }
3884 
3885     descriptors = g_new0(StatsDescriptors, 1);
3886 
3887     /* Read stats header */
3888     kvm_stats_header = g_malloc(sizeof(*kvm_stats_header));
3889     ret = read(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header));
3890     if (ret != sizeof(*kvm_stats_header)) {
3891         error_setg(errp, "KVM stats: failed to read stats header: "
3892                    "expected %zu actual %zd",
3893                    sizeof(*kvm_stats_header), ret);
             g_free(kvm_stats_header);
             g_free(descriptors);
3894         return NULL;
3895     }
3896     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
3897 
3898     /* Read stats descriptors */
3899     kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
3900     ret = pread(stats_fd, kvm_stats_desc,
3901                 size_desc * kvm_stats_header->num_desc,
3902                 kvm_stats_header->desc_offset);
3903 
3904     if (ret != size_desc * kvm_stats_header->num_desc) {
3905         error_setg(errp, "KVM stats: failed to read stats descriptors: "
3906                    "expected %zu actual %zd",
3907                    size_desc * kvm_stats_header->num_desc, ret);
3908         g_free(descriptors);
3909         g_free(kvm_stats_desc);
             g_free(kvm_stats_header);
3910         return NULL;
3911     }
3912     descriptors->kvm_stats_header = kvm_stats_header;
3913     descriptors->kvm_stats_desc = kvm_stats_desc;
3914     descriptors->ident = ident;
3915     QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
3916     return descriptors;
3917 }
3918 
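/*
 * Read every stat for 'target' from 'stats_fd' in a single pread() of the
 * data area, convert the descriptors that pass the 'names' filter with
 * add_kvmstat_entry(), and attach the resulting list to 'result' under the
 * KVM provider (using the vCPU's canonical path as the QOM path for
 * per-vCPU stats).
 */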
3919 static void query_stats(StatsResultList **result, StatsTarget target,
3920                         strList *names, int stats_fd, Error **errp)
3921 {
3922     struct kvm_stats_desc *kvm_stats_desc;
3923     struct kvm_stats_header *kvm_stats_header;
3924     StatsDescriptors *descriptors;
3925     g_autofree uint64_t *stats_data = NULL;
3926     struct kvm_stats_desc *pdesc;
3927     StatsList *stats_list = NULL;
3928     size_t size_desc, size_data = 0;
3929     ssize_t ret;
3930     int i;
3931 
3932     descriptors = find_stats_descriptors(target, stats_fd, errp);
3933     if (!descriptors) {
3934         return;
3935     }
3936 
3937     kvm_stats_header = descriptors->kvm_stats_header;
3938     kvm_stats_desc = descriptors->kvm_stats_desc;
3939     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
3940 
3941     /* Tally the total data size */
3942     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
3943         pdesc = (void *)kvm_stats_desc + i * size_desc;
3944         size_data += pdesc->size * sizeof(*stats_data);
3945     }
3946 
3947     stats_data = g_malloc0(size_data);
3948     ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);
3949 
3950     if (ret != size_data) {
3951         error_setg(errp, "KVM stats: failed to read data: "
3952                    "expected %zu actual %zd", size_data, ret);
3953         return;
3954     }
3955 
3956     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
3957         uint64_t *stats;
3958         pdesc = (void *)kvm_stats_desc + i * size_desc;
3959 
3960         /* Add entry to the list */
3961         stats = (void *)stats_data + pdesc->offset;
3962         if (!apply_str_list_filter(pdesc->name, names)) {
3963             continue;
3964         }
3965         stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
3966     }
3967 
3968     if (!stats_list) {
3969         return;
3970     }
3971 
3972     switch (target) {
3973     case STATS_TARGET_VM:
3974         add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
3975         break;
3976     case STATS_TARGET_VCPU:
3977         add_stats_entry(result, STATS_PROVIDER_KVM,
3978                         current_cpu->parent_obj.canonical_path,
3979                         stats_list);
3980         break;
3981     default:
3982         break;
3983     }
3984 }
3985 
3986 static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
3987                                int stats_fd, Error **errp)
3988 {
3989     struct kvm_stats_desc *kvm_stats_desc;
3990     struct kvm_stats_header *kvm_stats_header;
3991     StatsDescriptors *descriptors;
3992     struct kvm_stats_desc *pdesc;
3993     StatsSchemaValueList *stats_list = NULL;
3994     size_t size_desc;
3995     int i;
3996 
3997     descriptors = find_stats_descriptors(target, stats_fd, errp);
3998     if (!descriptors) {
3999         return;
4000     }
4001 
4002     kvm_stats_header = descriptors->kvm_stats_header;
4003     kvm_stats_desc = descriptors->kvm_stats_desc;
4004     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4005 
4006     /* Build a schema entry for each descriptor */
4007     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4008         pdesc = (void *)kvm_stats_desc + i * size_desc;
4009         stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
4010     }
4011 
4012     add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
4013 }
4014 
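/*
 * The *_vcpu helpers below are scheduled with run_on_cpu() so that the
 * KVM_GET_STATS_FD ioctl runs in the vCPU's own thread; the per-vCPU fd
 * is opened and closed for each query.
 */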
4015 static void query_stats_vcpu(CPUState *cpu, run_on_cpu_data data)
4016 {
4017     StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr;
4018     int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
4019     Error *local_err = NULL;
4020 
4021     if (stats_fd < 0) {
4022         error_setg_errno(&local_err, -stats_fd, "KVM stats: ioctl failed");
4023         error_propagate(kvm_stats_args->errp, local_err);
4024         return;
4025     }
4026     query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
4027                 kvm_stats_args->names, stats_fd, kvm_stats_args->errp);
4028     close(stats_fd);
4029 }
4030 
4031 static void query_stats_schema_vcpu(CPUState *cpu, run_on_cpu_data data)
4032 {
4033     StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr;
4034     int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
4035     Error *local_err = NULL;
4036 
4037     if (stats_fd < 0) {
4038         error_setg_errno(&local_err, -stats_fd, "KVM stats: ioctl failed");
4039         error_propagate(kvm_stats_args->errp, local_err);
4040         return;
4041     }
4042     query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
4043                        kvm_stats_args->errp);
4044     close(stats_fd);
4045 }
4046 
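/*
 * Stats-provider callback: for the VM target a single VM-wide stats fd is
 * queried; for the vCPU target each vCPU that matches the 'targets' filter
 * is queried on its own thread via query_stats_vcpu().
 */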
4047 static void query_stats_cb(StatsResultList **result, StatsTarget target,
4048                            strList *names, strList *targets, Error **errp)
4049 {
4050     KVMState *s = kvm_state;
4051     CPUState *cpu;
4052     int stats_fd;
4053 
4054     switch (target) {
4055     case STATS_TARGET_VM:
4056     {
4057         stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4058         if (stats_fd < 0) {
4059             error_setg_errno(errp, -stats_fd, "KVM stats: ioctl failed");
4060             return;
4061         }
4062         query_stats(result, target, names, stats_fd, errp);
4063         close(stats_fd);
4064         break;
4065     }
4066     case STATS_TARGET_VCPU:
4067     {
4068         StatsArgs stats_args;
4069         stats_args.result.stats = result;
4070         stats_args.names = names;
4071         stats_args.errp = errp;
4072         CPU_FOREACH(cpu) {
4073             if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
4074                 continue;
4075             }
4076             run_on_cpu(cpu, query_stats_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args));
4077         }
4078         break;
4079     }
4080     default:
4081         break;
4082     }
4083 }
4084 
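/*
 * Schema callback: report the VM schema and then the vCPU schema taken
 * from first_cpu only, presumably because every vCPU exposes the same
 * schema layout.
 */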
4085 void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
4086 {
4087     StatsArgs stats_args;
4088     KVMState *s = kvm_state;
4089     int stats_fd;
4090 
4091     stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4092     if (stats_fd < 0) {
4093         error_setg_errno(errp, -stats_fd, "KVM stats: ioctl failed");
4094         return;
4095     }
4096     query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
4097     close(stats_fd);
4098 
4099     stats_args.result.schema = result;
4100     stats_args.errp = errp;
4101     run_on_cpu(first_cpu, query_stats_schema_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args));
4102 }
4103