xref: /openbmc/qemu/accel/kvm/kvm-all.c (revision 2dc652961d6a9508d5db140765a0b22238165d88)
192229a57SYang Zhong /*
292229a57SYang Zhong  * QEMU KVM support
392229a57SYang Zhong  *
492229a57SYang Zhong  * Copyright IBM, Corp. 2008
592229a57SYang Zhong  *           Red Hat, Inc. 2008
692229a57SYang Zhong  *
792229a57SYang Zhong  * Authors:
892229a57SYang Zhong  *  Anthony Liguori   <aliguori@us.ibm.com>
992229a57SYang Zhong  *  Glauber Costa     <gcosta@redhat.com>
1092229a57SYang Zhong  *
1192229a57SYang Zhong  * This work is licensed under the terms of the GNU GPL, version 2 or later.
1292229a57SYang Zhong  * See the COPYING file in the top-level directory.
1392229a57SYang Zhong  *
1492229a57SYang Zhong  */
1592229a57SYang Zhong 
1692229a57SYang Zhong #include "qemu/osdep.h"
1792229a57SYang Zhong #include <sys/ioctl.h>
18b4420f19SPeter Xu #include <poll.h>
1992229a57SYang Zhong 
2092229a57SYang Zhong #include <linux/kvm.h>
2192229a57SYang Zhong 
2292229a57SYang Zhong #include "qemu/atomic.h"
2392229a57SYang Zhong #include "qemu/option.h"
2492229a57SYang Zhong #include "qemu/config-file.h"
2592229a57SYang Zhong #include "qemu/error-report.h"
2692229a57SYang Zhong #include "qapi/error.h"
2792229a57SYang Zhong #include "hw/pci/msi.h"
2892229a57SYang Zhong #include "hw/pci/msix.h"
2992229a57SYang Zhong #include "hw/s390x/adapter.h"
305b7d54d4SAlex Bennée #include "gdbstub/enums.h"
3192229a57SYang Zhong #include "sysemu/kvm_int.h"
3254d31236SMarkus Armbruster #include "sysemu/runstate.h"
3392229a57SYang Zhong #include "sysemu/cpus.h"
34f39b7d2bSDavid Hildenbrand #include "sysemu/accel-blocker.h"
3592229a57SYang Zhong #include "qemu/bswap.h"
3692229a57SYang Zhong #include "exec/memory.h"
3792229a57SYang Zhong #include "exec/ram_addr.h"
3892229a57SYang Zhong #include "qemu/event_notifier.h"
39db725815SMarkus Armbruster #include "qemu/main-loop.h"
4092229a57SYang Zhong #include "trace.h"
4192229a57SYang Zhong #include "hw/irq.h"
4223b0898eSPaolo Bonzini #include "qapi/visitor.h"
4311bc4a13SPaolo Bonzini #include "qapi/qapi-types-common.h"
4411bc4a13SPaolo Bonzini #include "qapi/qapi-visit-common.h"
456b552b9bSDongjiu Geng #include "sysemu/reset.h"
4657038a92SClaudio Fontana #include "qemu/guest-random.h"
4757038a92SClaudio Fontana #include "sysemu/hw_accel.h"
4857038a92SClaudio Fontana #include "kvm-cpus.h"
49baa60983SHyman Huang(黄勇) #include "sysemu/dirtylimit.h"
50f39b7d2bSDavid Hildenbrand #include "qemu/range.h"
5192229a57SYang Zhong 
5292229a57SYang Zhong #include "hw/boards.h"
53aa09b3d5SMarkus Armbruster #include "sysemu/stats.h"
5492229a57SYang Zhong 
5592229a57SYang Zhong /* This check must be after config-host.h is included */
5692229a57SYang Zhong #ifdef CONFIG_EVENTFD
5792229a57SYang Zhong #include <sys/eventfd.h>
5892229a57SYang Zhong #endif
5992229a57SYang Zhong 
6092229a57SYang Zhong /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
6192229a57SYang Zhong  * need to use the real host PAGE_SIZE, as that's what KVM will use.
6292229a57SYang Zhong  */
63eb8b1a79SJiaxun Yang #ifdef PAGE_SIZE
64eb8b1a79SJiaxun Yang #undef PAGE_SIZE
65eb8b1a79SJiaxun Yang #endif
668e3b0cbbSMarc-André Lureau #define PAGE_SIZE qemu_real_host_page_size()
6792229a57SYang Zhong 
68fd2ddd16SMaxim Levitsky #ifndef KVM_GUESTDBG_BLOCKIRQ
69fd2ddd16SMaxim Levitsky #define KVM_GUESTDBG_BLOCKIRQ 0
70fd2ddd16SMaxim Levitsky #endif
71fd2ddd16SMaxim Levitsky 
725504a812SPeter Xu /* Default num of memslots to be allocated when VM starts */
735504a812SPeter Xu #define  KVM_MEMSLOTS_NR_ALLOC_DEFAULT                      16
74b34a908cSPeter Xu /* Default max allowed memslots if kernel reported nothing */
75b34a908cSPeter Xu #define  KVM_MEMSLOTS_NR_MAX_DEFAULT                        32
765504a812SPeter Xu 
7792229a57SYang Zhong struct KVMParkedVcpu {
7892229a57SYang Zhong     unsigned long vcpu_id;
7992229a57SYang Zhong     int kvm_fd;
8092229a57SYang Zhong     QLIST_ENTRY(KVMParkedVcpu) node;
8192229a57SYang Zhong };
8292229a57SYang Zhong 
8392229a57SYang Zhong KVMState *kvm_state;
8492229a57SYang Zhong bool kvm_kernel_irqchip;
8592229a57SYang Zhong bool kvm_split_irqchip;
8692229a57SYang Zhong bool kvm_async_interrupts_allowed;
8792229a57SYang Zhong bool kvm_halt_in_kernel_allowed;
8892229a57SYang Zhong bool kvm_resamplefds_allowed;
8992229a57SYang Zhong bool kvm_msi_via_irqfd_allowed;
9092229a57SYang Zhong bool kvm_gsi_routing_allowed;
9192229a57SYang Zhong bool kvm_gsi_direct_mapping;
9292229a57SYang Zhong bool kvm_allowed;
9392229a57SYang Zhong bool kvm_readonly_mem_allowed;
9492229a57SYang Zhong bool kvm_vm_attributes_allowed;
9592229a57SYang Zhong bool kvm_msi_use_devid;
9616617c3cSRichard Henderson static bool kvm_has_guest_debug;
973b7a9388SAlex Bennée static int kvm_sstep_flags;
9892229a57SYang Zhong static bool kvm_immediate_exit;
990811baedSXiaoyao Li static uint64_t kvm_supported_memory_attributes;
10015f7a80cSXiaoyao Li static bool kvm_guest_memfd_supported;
101023ae9a8SIgor Mammedov static hwaddr kvm_max_slot_size = ~0;
10292229a57SYang Zhong 
10392229a57SYang Zhong static const KVMCapabilityInfo kvm_required_capabilites[] = {
10492229a57SYang Zhong     KVM_CAP_INFO(USER_MEMORY),
10592229a57SYang Zhong     KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
10689de4b91SDavid Hildenbrand     KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
107aacec9aeSPaolo Bonzini     KVM_CAP_INFO(INTERNAL_ERROR_DATA),
108126e7f78SPaolo Bonzini     KVM_CAP_INFO(IOEVENTFD),
109126e7f78SPaolo Bonzini     KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
11092229a57SYang Zhong     KVM_CAP_LAST_INFO
11192229a57SYang Zhong };
11292229a57SYang Zhong 
1133607715aSDavid Gibson static NotifierList kvm_irqchip_change_notifiers =
1143607715aSDavid Gibson     NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
1153607715aSDavid Gibson 
116c82d9d43SPeter Xu struct KVMResampleFd {
117c82d9d43SPeter Xu     int gsi;
118c82d9d43SPeter Xu     EventNotifier *resample_event;
119c82d9d43SPeter Xu     QLIST_ENTRY(KVMResampleFd) node;
120c82d9d43SPeter Xu };
121c82d9d43SPeter Xu typedef struct KVMResampleFd KVMResampleFd;
122c82d9d43SPeter Xu 
123c82d9d43SPeter Xu /*
124c82d9d43SPeter Xu  * Only used with split irqchip where we need to do the resample fd
125c82d9d43SPeter Xu  * kick for the kernel from userspace.
126c82d9d43SPeter Xu  */
127c82d9d43SPeter Xu static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
128c82d9d43SPeter Xu     QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
129c82d9d43SPeter Xu 
130a2f77862SPeter Xu static QemuMutex kml_slots_lock;
131a2f77862SPeter Xu 
132a2f77862SPeter Xu #define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
133a2f77862SPeter Xu #define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)
13436adac49SPeter Xu 
135ea776d15SPeter Xu static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
136ea776d15SPeter Xu 
kvm_resample_fd_remove(int gsi)137c82d9d43SPeter Xu static inline void kvm_resample_fd_remove(int gsi)
138c82d9d43SPeter Xu {
139c82d9d43SPeter Xu     KVMResampleFd *rfd;
140c82d9d43SPeter Xu 
141c82d9d43SPeter Xu     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
142c82d9d43SPeter Xu         if (rfd->gsi == gsi) {
143c82d9d43SPeter Xu             QLIST_REMOVE(rfd, node);
144c82d9d43SPeter Xu             g_free(rfd);
145c82d9d43SPeter Xu             break;
146c82d9d43SPeter Xu         }
147c82d9d43SPeter Xu     }
148c82d9d43SPeter Xu }
149c82d9d43SPeter Xu 
/*
 * Register the resample EventNotifier for @gsi so it can later be
 * kicked from userspace via kvm_resample_fd_notify().
 */
static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
{
    KVMResampleFd *entry = g_new0(KVMResampleFd, 1);

    entry->gsi = gsi;
    entry->resample_event = event;
    QLIST_INSERT_HEAD(&kvm_resample_fd_list, entry, node);
}
159c82d9d43SPeter Xu 
kvm_resample_fd_notify(int gsi)160c82d9d43SPeter Xu void kvm_resample_fd_notify(int gsi)
161c82d9d43SPeter Xu {
162c82d9d43SPeter Xu     KVMResampleFd *rfd;
163c82d9d43SPeter Xu 
164c82d9d43SPeter Xu     QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
165c82d9d43SPeter Xu         if (rfd->gsi == gsi) {
166c82d9d43SPeter Xu             event_notifier_set(rfd->resample_event);
167c82d9d43SPeter Xu             trace_kvm_resample_fd_notify(gsi);
168c82d9d43SPeter Xu             return;
169c82d9d43SPeter Xu         }
170c82d9d43SPeter Xu     }
171c82d9d43SPeter Xu }
172c82d9d43SPeter Xu 
1735504a812SPeter Xu /**
1745504a812SPeter Xu  * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
1755504a812SPeter Xu  *
1765504a812SPeter Xu  * @kml: The KVMMemoryListener* to grow the slots[] array
1775504a812SPeter Xu  * @nr_slots_new: The new size of slots[] array
1785504a812SPeter Xu  *
1795504a812SPeter Xu  * Returns: True if the array grows larger, false otherwise.
1805504a812SPeter Xu  */
kvm_slots_grow(KVMMemoryListener * kml,unsigned int nr_slots_new)1815504a812SPeter Xu static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
1825504a812SPeter Xu {
1835504a812SPeter Xu     unsigned int i, cur = kml->nr_slots_allocated;
1845504a812SPeter Xu     KVMSlot *slots;
1855504a812SPeter Xu 
186943c7428SPeter Xu     if (nr_slots_new > kvm_state->nr_slots_max) {
187943c7428SPeter Xu         nr_slots_new = kvm_state->nr_slots_max;
1885504a812SPeter Xu     }
1895504a812SPeter Xu 
1905504a812SPeter Xu     if (cur >= nr_slots_new) {
1915504a812SPeter Xu         /* Big enough, no need to grow, or we reached max */
1925504a812SPeter Xu         return false;
1935504a812SPeter Xu     }
1945504a812SPeter Xu 
1955504a812SPeter Xu     if (cur == 0) {
1965504a812SPeter Xu         slots = g_new0(KVMSlot, nr_slots_new);
1975504a812SPeter Xu     } else {
1985504a812SPeter Xu         assert(kml->slots);
1995504a812SPeter Xu         slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
2005504a812SPeter Xu         /*
2015504a812SPeter Xu          * g_renew() doesn't initialize extended buffers, however kvm
2025504a812SPeter Xu          * memslots require fields to be zero-initialized. E.g. pointers,
2035504a812SPeter Xu          * memory_size field, etc.
2045504a812SPeter Xu          */
2055504a812SPeter Xu         memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
2065504a812SPeter Xu     }
2075504a812SPeter Xu 
2085504a812SPeter Xu     for (i = cur; i < nr_slots_new; i++) {
2095504a812SPeter Xu         slots[i].slot = i;
2105504a812SPeter Xu     }
2115504a812SPeter Xu 
2125504a812SPeter Xu     kml->slots = slots;
2135504a812SPeter Xu     kml->nr_slots_allocated = nr_slots_new;
2145504a812SPeter Xu     trace_kvm_slots_grow(cur, nr_slots_new);
2155504a812SPeter Xu 
2165504a812SPeter Xu     return true;
2175504a812SPeter Xu }
2185504a812SPeter Xu 
/* Try to double the currently allocated number of memslots */
static bool kvm_slots_double(KVMMemoryListener *kml)
{
    unsigned int target = kml->nr_slots_allocated * 2;

    return kvm_slots_grow(kml, target);
}
2235504a812SPeter Xu 
kvm_get_max_memslots(void)22416ab2edaSDavid Hildenbrand unsigned int kvm_get_max_memslots(void)
22592229a57SYang Zhong {
2264f7f5893SPhilippe Mathieu-Daudé     KVMState *s = KVM_STATE(current_accel());
22792229a57SYang Zhong 
228943c7428SPeter Xu     return s->nr_slots_max;
22992229a57SYang Zhong }
23092229a57SYang Zhong 
kvm_get_free_memslots(void)2315b23186aSDavid Hildenbrand unsigned int kvm_get_free_memslots(void)
2325b23186aSDavid Hildenbrand {
2335b23186aSDavid Hildenbrand     unsigned int used_slots = 0;
2345b23186aSDavid Hildenbrand     KVMState *s = kvm_state;
2355b23186aSDavid Hildenbrand     int i;
2365b23186aSDavid Hildenbrand 
2375b23186aSDavid Hildenbrand     kvm_slots_lock();
2385b23186aSDavid Hildenbrand     for (i = 0; i < s->nr_as; i++) {
2395b23186aSDavid Hildenbrand         if (!s->as[i].ml) {
2405b23186aSDavid Hildenbrand             continue;
2415b23186aSDavid Hildenbrand         }
242dbdc00baSPeter Xu         used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
2435b23186aSDavid Hildenbrand     }
2445b23186aSDavid Hildenbrand     kvm_slots_unlock();
2455b23186aSDavid Hildenbrand 
246943c7428SPeter Xu     return s->nr_slots_max - used_slots;
2475b23186aSDavid Hildenbrand }
2485b23186aSDavid Hildenbrand 
24936adac49SPeter Xu /* Called with KVMMemoryListener.slots_lock held */
kvm_get_free_slot(KVMMemoryListener * kml)25092229a57SYang Zhong static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
25192229a57SYang Zhong {
2525504a812SPeter Xu     unsigned int n;
25392229a57SYang Zhong     int i;
25492229a57SYang Zhong 
2555504a812SPeter Xu     for (i = 0; i < kml->nr_slots_allocated; i++) {
25692229a57SYang Zhong         if (kml->slots[i].memory_size == 0) {
25792229a57SYang Zhong             return &kml->slots[i];
25892229a57SYang Zhong         }
25992229a57SYang Zhong     }
26092229a57SYang Zhong 
2615504a812SPeter Xu     /*
2625504a812SPeter Xu      * If no free slots, try to grow first by doubling.  Cache the old size
2635504a812SPeter Xu      * here to avoid another round of search: if the grow succeeded, it
2645504a812SPeter Xu      * means slots[] now must have the existing "n" slots occupied,
2655504a812SPeter Xu      * followed by one or more free slots starting from slots[n].
2665504a812SPeter Xu      */
2675504a812SPeter Xu     n = kml->nr_slots_allocated;
2685504a812SPeter Xu     if (kvm_slots_double(kml)) {
2695504a812SPeter Xu         return &kml->slots[n];
2705504a812SPeter Xu     }
2715504a812SPeter Xu 
27292229a57SYang Zhong     return NULL;
27392229a57SYang Zhong }
27492229a57SYang Zhong 
27536adac49SPeter Xu /* Called with KVMMemoryListener.slots_lock held */
kvm_alloc_slot(KVMMemoryListener * kml)27692229a57SYang Zhong static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
27792229a57SYang Zhong {
27892229a57SYang Zhong     KVMSlot *slot = kvm_get_free_slot(kml);
27992229a57SYang Zhong 
28092229a57SYang Zhong     if (slot) {
28192229a57SYang Zhong         return slot;
28292229a57SYang Zhong     }
28392229a57SYang Zhong 
28492229a57SYang Zhong     fprintf(stderr, "%s: no free slot available\n", __func__);
28592229a57SYang Zhong     abort();
28692229a57SYang Zhong }
28792229a57SYang Zhong 
kvm_lookup_matching_slot(KVMMemoryListener * kml,hwaddr start_addr,hwaddr size)28892229a57SYang Zhong static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
28992229a57SYang Zhong                                          hwaddr start_addr,
2902747e716SDavid Hildenbrand                                          hwaddr size)
29192229a57SYang Zhong {
29292229a57SYang Zhong     int i;
29392229a57SYang Zhong 
2945504a812SPeter Xu     for (i = 0; i < kml->nr_slots_allocated; i++) {
29592229a57SYang Zhong         KVMSlot *mem = &kml->slots[i];
29692229a57SYang Zhong 
2972747e716SDavid Hildenbrand         if (start_addr == mem->start_addr && size == mem->memory_size) {
29892229a57SYang Zhong             return mem;
29992229a57SYang Zhong         }
30092229a57SYang Zhong     }
30192229a57SYang Zhong 
30292229a57SYang Zhong     return NULL;
30392229a57SYang Zhong }
30492229a57SYang Zhong 
/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary. */
    aligned = ROUND_UP(section->offset_within_address_space,
                       qemu_real_host_page_size());
    /* delta = bytes skipped by rounding the start address up */
    delta = aligned - section->offset_within_address_space;
    *start = aligned;
    if (delta > size) {
        /* The whole section lies within a single page: nothing left */
        return 0;
    }

    /* Truncate the remaining length down to a page boundary */
    return (size - delta) & qemu_real_host_page_mask();
}
3285ea69c2eSDavid Hildenbrand 
/*
 * Translate a host userspace address back to a guest physical address.
 *
 * Returns 1 and fills in @phys_addr when @ram falls inside one of the
 * registered memslots, 0 otherwise.
 */
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int found = 0;
    int i;

    kvm_slots_lock();
    for (i = 0; i < kml->nr_slots_allocated && !found; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            found = 1;
        }
    }
    kvm_slots_unlock();

    return found;
}
34992229a57SYang Zhong 
/*
 * Program one memslot into the kernel.
 *
 * @kml:  the listener owning the slot; its as_id is encoded into the
 *        upper 16 bits of the kernel slot number
 * @slot: the QEMU-side slot descriptor to sync into the kernel
 * @new:  true if this slot has never been registered with the kernel
 *
 * Uses KVM_SET_USER_MEMORY_REGION2 when guest_memfd is supported by the
 * kernel, the legacy KVM_SET_USER_MEMORY_REGION otherwise.  Returns the
 * ioctl result (negative errno on failure); slot->old_flags is updated
 * to the flags that were just programmed.
 */
static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region2 mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;
    mem.guest_memfd = slot->guest_memfd;
    mem.guest_memfd_offset = slot->guest_memfd_offset;

    /*
     * Toggling KVM_MEM_READONLY on an existing, non-empty slot needs a
     * delete + re-create cycle in the kernel.
     */
    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;

        if (kvm_guest_memfd_supported) {
            ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
        } else {
            ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
        }
        if (ret < 0) {
            goto err;
        }
    }
    mem.memory_size = slot->memory_size;
    if (kvm_guest_memfd_supported) {
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
    } else {
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    }
    slot->old_flags = mem.flags;
err:
    /* Trace both success and failure; ret carries the ioctl result */
    trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
                              mem.guest_phys_addr, mem.memory_size,
                              mem.userspace_addr, mem.guest_memfd,
                              mem.guest_memfd_offset, ret);
    if (ret < 0) {
        if (kvm_guest_memfd_supported) {
                error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d,"
                        " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
                        " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
                        " guest_memfd_offset=0x%" PRIx64 ": %s",
                        __func__, mem.slot, slot->start_addr,
                        (uint64_t)mem.memory_size, mem.flags,
                        mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
                        strerror(errno));
        } else {
                error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
                            " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
                            __func__, mem.slot, slot->start_addr,
                            (uint64_t)mem.memory_size, strerror(errno));
        }
    }
    return ret;
}
40892229a57SYang Zhong 
/*
 * Stash the kvm_fd of @cpu on the parked-vCPU list so a vCPU with the
 * same id can later be re-created without a new KVM_CREATE_VCPU.
 */
void kvm_park_vcpu(CPUState *cpu)
{
    struct KVMParkedVcpu *parked;

    trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    parked = g_malloc0(sizeof(*parked));
    parked->vcpu_id = kvm_arch_vcpu_id(cpu);
    parked->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, parked, node);
}
42008c32868SSalil Mehta 
/*
 * Remove @vcpu_id from the parked-vCPU list and return its kvm_fd,
 * or -ENOENT when no vCPU with that id is parked.
 */
int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *parked;
    int kvm_fd = -ENOENT;

    QLIST_FOREACH(parked, &s->kvm_parked_vcpus, node) {
        if (parked->vcpu_id != vcpu_id) {
            continue;
        }
        QLIST_REMOVE(parked, node);
        kvm_fd = parked->kvm_fd;
        g_free(parked);
        break;
    }

    trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");

    return kvm_fd;
}
43908c32868SSalil Mehta 
kvm_reset_parked_vcpus(void * param)440*2dc65296SMaciej S. Szmigiero static void kvm_reset_parked_vcpus(void *param)
441*2dc65296SMaciej S. Szmigiero {
442*2dc65296SMaciej S. Szmigiero     KVMState *s = param;
443*2dc65296SMaciej S. Szmigiero     struct KVMParkedVcpu *cpu;
444*2dc65296SMaciej S. Szmigiero 
445*2dc65296SMaciej S. Szmigiero     QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
446*2dc65296SMaciej S. Szmigiero         kvm_arch_reset_parked_vcpu(cpu->vcpu_id, cpu->kvm_fd);
447*2dc65296SMaciej S. Szmigiero     }
448*2dc65296SMaciej S. Szmigiero }
449*2dc65296SMaciej S. Szmigiero 
/*
 * Obtain a KVM vCPU fd for @cpu: reuse a parked fd with the matching
 * vcpu id when available, otherwise issue KVM_CREATE_VCPU.
 *
 * Returns 0 on success, a negative error code otherwise.
 */
int kvm_create_vcpu(CPUState *cpu)
{
    unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
    KVMState *s = kvm_state;
    int kvm_fd;

    /* check if the KVM vCPU already exist but is parked */
    kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
    if (kvm_fd < 0) {
        /* Nothing parked under this id: ask the kernel for a new vCPU */
        kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
        if (kvm_fd < 0) {
            error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
            return kvm_fd;
        }
    }

    /* Hook the new fd up and reset the per-vCPU bookkeeping */
    cpu->kvm_fd = kvm_fd;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;
    cpu->dirty_pages = 0;
    cpu->throttle_us_per_full = 0;

    trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);

    return 0;
}
47708c32868SSalil Mehta 
/*
 * Create a KVM vCPU for @cpu and immediately place it on the parked
 * list.  Returns kvm_create_vcpu()'s error code on failure.
 */
int kvm_create_and_park_vcpu(CPUState *cpu)
{
    int ret = kvm_create_vcpu(cpu);

    if (ret == 0) {
        kvm_park_vcpu(cpu);
    }

    return ret;
}
489c6a3d7bcSHarsh Prateek Bora 
/*
 * Tear down the QEMU side of a vCPU: arch cleanup, unmap of the shared
 * kvm_run area and (if enabled) the dirty ring, then park the kvm_fd
 * for potential reuse.  Returns 0 on success, negative on error; on
 * error the teardown stops at the failing step.
 */
static int do_kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    int mmap_size;
    int ret = 0;

    trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_arch_destroy_vcpu(cpu);
    if (ret < 0) {
        goto err;
    }

    /* The kvm_run mapping length must be re-queried from the kernel */
    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        trace_kvm_failed_get_vcpu_mmap_size();
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    /* The dirty ring mapping only exists when the feature is enabled */
    if (cpu->kvm_dirty_gfns) {
        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
        if (ret < 0) {
            goto err;
        }
    }

    /* Keep the fd around instead of closing it */
    kvm_park_vcpu(cpu);
err:
    return ret;
}
52692229a57SYang Zhong 
/* Destroy a vCPU; a failure here is fatal and terminates QEMU */
void kvm_destroy_vcpu(CPUState *cpu)
{
    int ret = do_kvm_destroy_vcpu(cpu);

    if (ret < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}
53457038a92SClaudio Fontana 
/*
 * Bring up the KVM side of @cpu: create (or unpark) the vCPU fd, mmap
 * the shared kvm_run area, hook up the coalesced MMIO ring and the
 * dirty ring (when enabled), then run arch-specific init.
 *
 * Returns 0 on success; on failure returns a negative error code and
 * sets @errp (except for the dirty-ring mmap failure, which only
 * returns -errno).
 */
int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
    KVMState *s = kvm_state;
    int mmap_size;
    int ret;

    trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_create_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        error_setg_errno(errp, -mmap_size,
                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
        goto err;
    }

    /* Map the kernel-shared kvm_run structure for this vCPU */
    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        ret = -errno;
        error_setg_errno(errp, ret,
                         "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    /* The coalesced MMIO ring lives at a page offset inside kvm_run */
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    if (s->kvm_dirty_ring_size) {
        /* Use MAP_SHARED to share pages with the kernel */
        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                   cpu->kvm_fd,
                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
            ret = -errno;
            goto err;
        }
    }

    ret = kvm_arch_init_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
    }
    /* Fetched unconditionally; callers use it for vCPU stats queries */
    cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);

err:
    return ret;
}
59792229a57SYang Zhong 
59892229a57SYang Zhong /*
59992229a57SYang Zhong  * dirty pages logging control
60092229a57SYang Zhong  */
60192229a57SYang Zhong 
/* Compute the KVM_MEM_* flags a memory region should be mapped with */
static int kvm_mem_flags(MemoryRegion *mr)
{
    bool want_readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (want_readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    if (memory_region_has_guest_memfd(mr)) {
        assert(kvm_guest_memfd_supported);
        flags |= KVM_MEM_GUEST_MEMFD;
    }

    return flags;
}
61992229a57SYang Zhong 
62036adac49SPeter Xu /* Called with KVMMemoryListener.slots_lock held */
kvm_slot_update_flags(KVMMemoryListener * kml,KVMSlot * mem,MemoryRegion * mr)62192229a57SYang Zhong static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
62292229a57SYang Zhong                                  MemoryRegion *mr)
62392229a57SYang Zhong {
62492229a57SYang Zhong     mem->flags = kvm_mem_flags(mr);
62592229a57SYang Zhong 
62692229a57SYang Zhong     /* If nothing changed effectively, no need to issue ioctl */
6276c090d4aSShannon Zhao     if (mem->flags == mem->old_flags) {
62892229a57SYang Zhong         return 0;
62992229a57SYang Zhong     }
63092229a57SYang Zhong 
631ea776d15SPeter Xu     kvm_slot_init_dirty_bitmap(mem);
6326c090d4aSShannon Zhao     return kvm_set_user_memory_region(kml, mem, false);
63392229a57SYang Zhong }
63492229a57SYang Zhong 
/*
 * Re-derive and push the memslot flags for every kvm slot covering
 * @section.  Returns 0 on success or the first ioctl error.
 */
static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size, slot_size;
    int ret = 0;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    kvm_slots_lock();

    /* A section may span several slots when it exceeds kvm_max_slot_size */
    do {
        KVMSlot *mem;

        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            break;
        }

        ret = kvm_slot_update_flags(kml, mem, section->mr);
        start_addr += slot_size;
        size -= slot_size;
    } while (size && !ret);

    kvm_slots_unlock();
    return ret;
}
66692229a57SYang Zhong 
/*
 * MemoryListener log_start hook: enable dirty logging on @section's
 * slots when the dirty log mask transitions from 0 to non-zero.
 */
static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    /* Only react to the 0 -> nonzero transition of the log mask */
    if (old != 0) {
        return;
    }

    if (kvm_section_update_flags(kml, section) < 0) {
        /* Failure to toggle dirty logging is unrecoverable here */
        abort();
    }
}
68392229a57SYang Zhong 
/*
 * MemoryListener log_stop hook: disable dirty logging on @section's
 * slots when the dirty log mask drops to 0.
 */
static void kvm_log_stop(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    /* Only react when the new log mask is zero */
    if (new != 0) {
        return;
    }

    if (kvm_section_update_flags(kml, section) < 0) {
        /* Failure to toggle dirty logging is unrecoverable here */
        abort();
    }
}
70092229a57SYang Zhong 
70192229a57SYang Zhong /* get kvm's dirty pages bitmap and update qemu's */
kvm_slot_sync_dirty_pages(KVMSlot * slot)7022c20b27eSPeter Xu static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
70392229a57SYang Zhong {
7042c20b27eSPeter Xu     ram_addr_t start = slot->ram_start_offset;
7058e3b0cbbSMarc-André Lureau     ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
70692229a57SYang Zhong 
7072c20b27eSPeter Xu     cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
70892229a57SYang Zhong }
70992229a57SYang Zhong 
/* Drop every bit collected so far in the slot's local dirty bitmap */
static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
{
    unsigned long *bmap = slot->dirty_bmap;

    memset(bmap, 0, slot->dirty_bmap_size);
}
714b4420f19SPeter Xu 
/* Round @x up to the next multiple of @y (assumes y is a power of two) */
#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
71692229a57SYang Zhong 
7179b3a31c7SDr. David Alan Gilbert /* Allocate the dirty bitmap for a slot  */
kvm_slot_init_dirty_bitmap(KVMSlot * mem)718ea776d15SPeter Xu static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
7199b3a31c7SDr. David Alan Gilbert {
720ea776d15SPeter Xu     if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
721ea776d15SPeter Xu         return;
722ea776d15SPeter Xu     }
723ea776d15SPeter Xu 
7249b3a31c7SDr. David Alan Gilbert     /*
7259b3a31c7SDr. David Alan Gilbert      * XXX bad kernel interface alert
7269b3a31c7SDr. David Alan Gilbert      * For dirty bitmap, kernel allocates array of size aligned to
7279b3a31c7SDr. David Alan Gilbert      * bits-per-long.  But for case when the kernel is 64bits and
7289b3a31c7SDr. David Alan Gilbert      * the userspace is 32bits, userspace can't align to the same
7299b3a31c7SDr. David Alan Gilbert      * bits-per-long, since sizeof(long) is different between kernel
7309b3a31c7SDr. David Alan Gilbert      * and user space.  This way, userspace will provide buffer which
7319b3a31c7SDr. David Alan Gilbert      * may be 4 bytes less than the kernel will use, resulting in
7329b3a31c7SDr. David Alan Gilbert      * userspace memory corruption (which is not detectable by valgrind
7339b3a31c7SDr. David Alan Gilbert      * too, in most cases).
7349b3a31c7SDr. David Alan Gilbert      * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
7359b3a31c7SDr. David Alan Gilbert      * a hope that sizeof(long) won't become >8 any time soon.
736e0a8f993SKeqian Zhu      *
737e0a8f993SKeqian Zhu      * Note: the granule of kvm dirty log is qemu_real_host_page_size.
738e0a8f993SKeqian Zhu      * And mem->memory_size is aligned to it (otherwise this mem can't
739e0a8f993SKeqian Zhu      * be registered to KVM).
7409b3a31c7SDr. David Alan Gilbert      */
7418e3b0cbbSMarc-André Lureau     hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
7429b3a31c7SDr. David Alan Gilbert                                         /*HOST_LONG_BITS*/ 64) / 8;
7439b3a31c7SDr. David Alan Gilbert     mem->dirty_bmap = g_malloc0(bitmap_size);
744563d32baSPeter Xu     mem->dirty_bmap_size = bitmap_size;
7459b3a31c7SDr. David Alan Gilbert }
7469b3a31c7SDr. David Alan Gilbert 
747e65e5f50SPeter Xu /*
748e65e5f50SPeter Xu  * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
749e65e5f50SPeter Xu  * succeeded, false otherwise
750e65e5f50SPeter Xu  */
kvm_slot_get_dirty_log(KVMState * s,KVMSlot * slot)751e65e5f50SPeter Xu static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
752e65e5f50SPeter Xu {
753e65e5f50SPeter Xu     struct kvm_dirty_log d = {};
754e65e5f50SPeter Xu     int ret;
755e65e5f50SPeter Xu 
756e65e5f50SPeter Xu     d.dirty_bitmap = slot->dirty_bmap;
757e65e5f50SPeter Xu     d.slot = slot->slot | (slot->as_id << 16);
758e65e5f50SPeter Xu     ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
759e65e5f50SPeter Xu 
760e65e5f50SPeter Xu     if (ret == -ENOENT) {
761e65e5f50SPeter Xu         /* kernel does not have dirty bitmap in this slot */
762e65e5f50SPeter Xu         ret = 0;
763e65e5f50SPeter Xu     }
764e65e5f50SPeter Xu     if (ret) {
765e65e5f50SPeter Xu         error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
766e65e5f50SPeter Xu                           __func__, ret);
767e65e5f50SPeter Xu     }
768e65e5f50SPeter Xu     return ret == 0;
769e65e5f50SPeter Xu }
770e65e5f50SPeter Xu 
771b4420f19SPeter Xu /* Should be with all slots_lock held for the address spaces. */
kvm_dirty_ring_mark_page(KVMState * s,uint32_t as_id,uint32_t slot_id,uint64_t offset)772b4420f19SPeter Xu static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
773b4420f19SPeter Xu                                      uint32_t slot_id, uint64_t offset)
774b4420f19SPeter Xu {
775b4420f19SPeter Xu     KVMMemoryListener *kml;
776b4420f19SPeter Xu     KVMSlot *mem;
777b4420f19SPeter Xu 
778b4420f19SPeter Xu     if (as_id >= s->nr_as) {
779b4420f19SPeter Xu         return;
780b4420f19SPeter Xu     }
781b4420f19SPeter Xu 
782b4420f19SPeter Xu     kml = s->as[as_id].ml;
783b4420f19SPeter Xu     mem = &kml->slots[slot_id];
784b4420f19SPeter Xu 
785b4420f19SPeter Xu     if (!mem->memory_size || offset >=
7868e3b0cbbSMarc-André Lureau         (mem->memory_size / qemu_real_host_page_size())) {
787b4420f19SPeter Xu         return;
788b4420f19SPeter Xu     }
789b4420f19SPeter Xu 
790b4420f19SPeter Xu     set_bit(offset, mem->dirty_bmap);
791b4420f19SPeter Xu }
792b4420f19SPeter Xu 
/* True when the kernel has published @gfn as dirty and not yet reset it */
static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
    /*
     * Read the flags before the value.  Pairs with barrier in
     * KVM's kvm_dirty_ring_push() function.
     */
    return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
}
801b4420f19SPeter Xu 
/* Hand @gfn back to the kernel by flagging it for reset */
static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
    /*
     * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
     * sees the full content of the ring:
     *
     * CPU0                     CPU1                         CPU2
     * ------------------------------------------------------------------------------
     *                                                       fill gfn0
     *                                                       store-rel flags for gfn0
     * load-acq flags for gfn0
     * store-rel RESET for gfn0
     *                          ioctl(RESET_RINGS)
     *                            load-acq flags for gfn0
     *                            check if flags have RESET
     *
     * The synchronization goes from CPU2 to CPU0 to CPU1.
     */
    qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
}
822b4420f19SPeter Xu 
823b4420f19SPeter Xu /*
824b4420f19SPeter Xu  * Should be with all slots_lock held for the address spaces.  It returns the
825b4420f19SPeter Xu  * dirty page we've collected on this dirty ring.
826b4420f19SPeter Xu  */
kvm_dirty_ring_reap_one(KVMState * s,CPUState * cpu)827b4420f19SPeter Xu static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
828b4420f19SPeter Xu {
829b4420f19SPeter Xu     struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
830b4420f19SPeter Xu     uint32_t ring_size = s->kvm_dirty_ring_size;
831b4420f19SPeter Xu     uint32_t count = 0, fetch = cpu->kvm_fetch_index;
832b4420f19SPeter Xu 
83356adee40SPeter Xu     /*
83456adee40SPeter Xu      * It's possible that we race with vcpu creation code where the vcpu is
83556adee40SPeter Xu      * put onto the vcpus list but not yet initialized the dirty ring
83656adee40SPeter Xu      * structures.  If so, skip it.
83756adee40SPeter Xu      */
83856adee40SPeter Xu     if (!cpu->created) {
83956adee40SPeter Xu         return 0;
84056adee40SPeter Xu     }
84156adee40SPeter Xu 
842b4420f19SPeter Xu     assert(dirty_gfns && ring_size);
843b4420f19SPeter Xu     trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
844b4420f19SPeter Xu 
845b4420f19SPeter Xu     while (true) {
846b4420f19SPeter Xu         cur = &dirty_gfns[fetch % ring_size];
847b4420f19SPeter Xu         if (!dirty_gfn_is_dirtied(cur)) {
848b4420f19SPeter Xu             break;
849b4420f19SPeter Xu         }
850b4420f19SPeter Xu         kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
851b4420f19SPeter Xu                                  cur->offset);
852b4420f19SPeter Xu         dirty_gfn_set_collected(cur);
853b4420f19SPeter Xu         trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
854b4420f19SPeter Xu         fetch++;
855b4420f19SPeter Xu         count++;
856b4420f19SPeter Xu     }
857b4420f19SPeter Xu     cpu->kvm_fetch_index = fetch;
8587786ae40SHyman Huang(黄勇)     cpu->dirty_pages += count;
859b4420f19SPeter Xu 
860b4420f19SPeter Xu     return count;
861b4420f19SPeter Xu }
862b4420f19SPeter Xu 
863b4420f19SPeter Xu /* Must be with slots_lock held */
kvm_dirty_ring_reap_locked(KVMState * s,CPUState * cpu)8641667e2b9SHyman Huang(黄勇) static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
865b4420f19SPeter Xu {
866b4420f19SPeter Xu     int ret;
867b4420f19SPeter Xu     uint64_t total = 0;
868b4420f19SPeter Xu     int64_t stamp;
869b4420f19SPeter Xu 
870b4420f19SPeter Xu     stamp = get_clock();
871b4420f19SPeter Xu 
8721667e2b9SHyman Huang(黄勇)     if (cpu) {
8731667e2b9SHyman Huang(黄勇)         total = kvm_dirty_ring_reap_one(s, cpu);
8741667e2b9SHyman Huang(黄勇)     } else {
875b4420f19SPeter Xu         CPU_FOREACH(cpu) {
876b4420f19SPeter Xu             total += kvm_dirty_ring_reap_one(s, cpu);
877b4420f19SPeter Xu         }
8781667e2b9SHyman Huang(黄勇)     }
879b4420f19SPeter Xu 
880b4420f19SPeter Xu     if (total) {
881b4420f19SPeter Xu         ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
882b4420f19SPeter Xu         assert(ret == total);
883b4420f19SPeter Xu     }
884b4420f19SPeter Xu 
885b4420f19SPeter Xu     stamp = get_clock() - stamp;
886b4420f19SPeter Xu 
887b4420f19SPeter Xu     if (total) {
888b4420f19SPeter Xu         trace_kvm_dirty_ring_reap(total, stamp / 1000);
889b4420f19SPeter Xu     }
890b4420f19SPeter Xu 
891b4420f19SPeter Xu     return total;
892b4420f19SPeter Xu }
893b4420f19SPeter Xu 
894b4420f19SPeter Xu /*
895b4420f19SPeter Xu  * Currently for simplicity, we must hold BQL before calling this.  We can
896b4420f19SPeter Xu  * consider to drop the BQL if we're clear with all the race conditions.
897b4420f19SPeter Xu  */
kvm_dirty_ring_reap(KVMState * s,CPUState * cpu)8981667e2b9SHyman Huang(黄勇) static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
899b4420f19SPeter Xu {
900b4420f19SPeter Xu     uint64_t total;
901b4420f19SPeter Xu 
902b4420f19SPeter Xu     /*
903b4420f19SPeter Xu      * We need to lock all kvm slots for all address spaces here,
904b4420f19SPeter Xu      * because:
905b4420f19SPeter Xu      *
906b4420f19SPeter Xu      * (1) We need to mark dirty for dirty bitmaps in multiple slots
907b4420f19SPeter Xu      *     and for tons of pages, so it's better to take the lock here
908b4420f19SPeter Xu      *     once rather than once per page.  And more importantly,
909b4420f19SPeter Xu      *
910b4420f19SPeter Xu      * (2) We must _NOT_ publish dirty bits to the other threads
911b4420f19SPeter Xu      *     (e.g., the migration thread) via the kvm memory slot dirty
912b4420f19SPeter Xu      *     bitmaps before correctly re-protect those dirtied pages.
913b4420f19SPeter Xu      *     Otherwise we can have potential risk of data corruption if
914b4420f19SPeter Xu      *     the page data is read in the other thread before we do
915b4420f19SPeter Xu      *     reset below.
916b4420f19SPeter Xu      */
917b4420f19SPeter Xu     kvm_slots_lock();
9181667e2b9SHyman Huang(黄勇)     total = kvm_dirty_ring_reap_locked(s, cpu);
919b4420f19SPeter Xu     kvm_slots_unlock();
920b4420f19SPeter Xu 
921b4420f19SPeter Xu     return total;
922b4420f19SPeter Xu }
923b4420f19SPeter Xu 
static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
{
    /*
     * No need to do anything: scheduling this empty work item via
     * run_on_cpu() is enough to force the vcpu out to userspace once.
     */
}
928b4420f19SPeter Xu 
929b4420f19SPeter Xu /*
930b4420f19SPeter Xu  * Kick all vcpus out in a synchronized way.  When returned, we
931b4420f19SPeter Xu  * guarantee that every vcpu has been kicked and at least returned to
932b4420f19SPeter Xu  * userspace once.
933b4420f19SPeter Xu  */
kvm_cpu_synchronize_kick_all(void)934b4420f19SPeter Xu static void kvm_cpu_synchronize_kick_all(void)
935b4420f19SPeter Xu {
936b4420f19SPeter Xu     CPUState *cpu;
937b4420f19SPeter Xu 
938b4420f19SPeter Xu     CPU_FOREACH(cpu) {
939b4420f19SPeter Xu         run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
940b4420f19SPeter Xu     }
941b4420f19SPeter Xu }
942b4420f19SPeter Xu 
943b4420f19SPeter Xu /*
944b4420f19SPeter Xu  * Flush all the existing dirty pages to the KVM slot buffers.  When
945b4420f19SPeter Xu  * this call returns, we guarantee that all the touched dirty pages
946b4420f19SPeter Xu  * before calling this function have been put into the per-kvmslot
947b4420f19SPeter Xu  * dirty bitmap.
948b4420f19SPeter Xu  *
949b4420f19SPeter Xu  * This function must be called with BQL held.
950b4420f19SPeter Xu  */
kvm_dirty_ring_flush(void)951b4420f19SPeter Xu static void kvm_dirty_ring_flush(void)
952b4420f19SPeter Xu {
953b4420f19SPeter Xu     trace_kvm_dirty_ring_flush(0);
954b4420f19SPeter Xu     /*
955b4420f19SPeter Xu      * The function needs to be serialized.  Since this function
956b4420f19SPeter Xu      * should always be with BQL held, serialization is guaranteed.
957b4420f19SPeter Xu      * However, let's be sure of it.
958b4420f19SPeter Xu      */
959195801d7SStefan Hajnoczi     assert(bql_locked());
960b4420f19SPeter Xu     /*
961b4420f19SPeter Xu      * First make sure to flush the hardware buffers by kicking all
962b4420f19SPeter Xu      * vcpus out in a synchronous way.
963b4420f19SPeter Xu      */
964b4420f19SPeter Xu     kvm_cpu_synchronize_kick_all();
9651667e2b9SHyman Huang(黄勇)     kvm_dirty_ring_reap(kvm_state, NULL);
966b4420f19SPeter Xu     trace_kvm_dirty_ring_flush(1);
967b4420f19SPeter Xu }
968b4420f19SPeter Xu 
96992229a57SYang Zhong /**
9704a12a11aSPeter Xu  * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
97192229a57SYang Zhong  *
9724a12a11aSPeter Xu  * This function will first try to fetch dirty bitmap from the kernel,
9734a12a11aSPeter Xu  * and then updates qemu's dirty bitmap.
9744a12a11aSPeter Xu  *
97536adac49SPeter Xu  * NOTE: caller must be with kml->slots_lock held.
97636adac49SPeter Xu  *
9774a12a11aSPeter Xu  * @kml: the KVM memory listener object
9784a12a11aSPeter Xu  * @section: the memory section to sync the dirty bitmap with
97992229a57SYang Zhong  */
kvm_physical_sync_dirty_bitmap(KVMMemoryListener * kml,MemoryRegionSection * section)980e65e5f50SPeter Xu static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
98192229a57SYang Zhong                                            MemoryRegionSection *section)
98292229a57SYang Zhong {
98392229a57SYang Zhong     KVMState *s = kvm_state;
98492229a57SYang Zhong     KVMSlot *mem;
98567548f09SDavid Hildenbrand     hwaddr start_addr, size;
9862c20b27eSPeter Xu     hwaddr slot_size;
98792229a57SYang Zhong 
98867548f09SDavid Hildenbrand     size = kvm_align_section(section, &start_addr);
989023ae9a8SIgor Mammedov     while (size) {
990023ae9a8SIgor Mammedov         slot_size = MIN(kvm_max_slot_size, size);
991023ae9a8SIgor Mammedov         mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
99267548f09SDavid Hildenbrand         if (!mem) {
993e377e87cSDavid Hildenbrand             /* We don't have a slot if we want to trap every access. */
994e65e5f50SPeter Xu             return;
99592229a57SYang Zhong         }
996e65e5f50SPeter Xu         if (kvm_slot_get_dirty_log(s, mem)) {
9972c20b27eSPeter Xu             kvm_slot_sync_dirty_pages(mem);
99838e0b790SThomas Huth         }
999023ae9a8SIgor Mammedov         start_addr += slot_size;
1000023ae9a8SIgor Mammedov         size -= slot_size;
100167548f09SDavid Hildenbrand     }
100292229a57SYang Zhong }
100392229a57SYang Zhong 
/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
#define KVM_CLEAR_LOG_SHIFT  6
#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
/* Mask to round an address down to a KVM_CLEAR_LOG_ALIGN boundary */
#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
1008ff4aa114SPeter Xu 
/*
 * Clear the dirty-log state of the range [@start, @start + @size) inside
 * the single memslot @mem, both in the kernel (via KVM_CLEAR_DIRTY_LOG)
 * and in the slot's cached bitmap.  @start and @size are byte offsets
 * relative to the beginning of the memslot (see kvm_physical_log_clear()).
 * Returns 0 on success, or the negative error from the ioctl.
 */
static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
                                  uint64_t size)
{
    KVMState *s = kvm_state;
    uint64_t end, bmap_start, start_delta, bmap_npages;
    struct kvm_clear_dirty_log d;
    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
    int ret;

    /*
     * We need to extend either the start or the size or both to
     * satisfy the KVM interface requirement.  Firstly, do the start
     * page alignment on 64 host pages
     */
    bmap_start = start & KVM_CLEAR_LOG_MASK;
    start_delta = start - bmap_start;
    bmap_start /= psize;

    /*
     * The kernel interface has restriction on the size too, that either:
     *
     * (1) the size is 64 host pages aligned (just like the start), or
     * (2) the size fills up until the end of the KVM memslot.
     */
    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
        << KVM_CLEAR_LOG_SHIFT;
    end = mem->memory_size / psize;
    if (bmap_npages > end - bmap_start) {
        bmap_npages = end - bmap_start;
    }
    start_delta /= psize;

    /*
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits otherwise we might
     * accidentally clear some set bits which are not yet synced from
     * the kernel into QEMU's bitmap, then we'll lose track of the
     * guest modifications upon those pages (which can directly lead
     * to guest data loss or panic after migration).
     *
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                             of memslot
     *
     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
     */

    assert(bmap_start % BITS_PER_LONG == 0);
    /* We should never do log_clear before log_sync */
    assert(mem->dirty_bmap);
    if (start_delta || bmap_npages - size / psize) {
        /* Slow path - we need to manipulate a temp bitmap */
        bmap_clear = bitmap_new(bmap_npages);
        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
                                    bmap_start, start_delta + size / psize);
        /*
         * We need to fill the holes at start because that was not
         * specified by the caller and we extended the bitmap only for
         * 64 pages alignment
         */
        bitmap_clear(bmap_clear, 0, start_delta);
        d.dirty_bitmap = bmap_clear;
    } else {
        /*
         * Fast path - both start and size align well with BITS_PER_LONG
         * (or the end of memory slot)
         */
        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
    }

    d.first_page = bmap_start;
    /* It should never overflow.  If it happens, say something */
    assert(bmap_npages <= UINT32_MAX);
    d.num_pages = bmap_npages;
    d.slot = mem->slot | (as_id << 16);

    /* -ENOENT means the kernel has nothing logged for this slot; not fatal */
    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
    if (ret < 0 && ret != -ENOENT) {
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
    } else {
        ret = 0;
        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
    }

    /*
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap as well for the memslot, then if another user
     * clears the same region we know we shouldn't clear it again on
     * the remote otherwise it's data loss as well.
     */
    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
                 size / psize);
    /* This handles the NULL case well */
    g_free(bmap_clear);
    return ret;
}
11154222147dSPaolo Bonzini 
11164222147dSPaolo Bonzini 
/**
 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
 *
 * NOTE: this will be a no-op if we haven't enabled manual dirty log
 * protection in the host kernel because in that case this operation
 * will be done within log_sync().
 *
 * @kml:     the kvm memory listener
 * @section: the memory range to clear dirty bitmap
 *
 * Returns 0 on success, or the first error from kvm_log_clear_one_slot().
 */
static int kvm_physical_log_clear(KVMMemoryListener *kml,
                                  MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    uint64_t start, size, offset, count;
    KVMSlot *mem;
    int ret = 0, i;

    if (!s->manual_dirty_log_protect) {
        /* No need to do explicit clear */
        return ret;
    }

    start = section->offset_within_address_space;
    size = int128_get64(section->size);

    if (!size) {
        /* Nothing more we can do... */
        return ret;
    }

    kvm_slots_lock();

    /* Walk every allocated slot and clear the part overlapping @section */
    for (i = 0; i < kml->nr_slots_allocated; i++) {
        mem = &kml->slots[i];
        /* Discard slots that are empty or do not overlap the section */
        if (!mem->memory_size ||
            mem->start_addr > start + size - 1 ||
            start > mem->start_addr + mem->memory_size - 1) {
            continue;
        }

        if (start >= mem->start_addr) {
            /* The slot starts before section or is aligned to it.  */
            offset = start - mem->start_addr;
            count = MIN(mem->memory_size - offset, size);
        } else {
            /* The slot starts after section.  */
            offset = 0;
            count = MIN(mem->memory_size, size - (mem->start_addr - start));
        }
        /* offset/count are byte offsets relative to the slot's start */
        ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
        if (ret < 0) {
            break;
        }
    }

    kvm_slots_unlock();

    return ret;
}
1178ff4aa114SPeter Xu 
/*
 * MemoryListener hook: register [start, start + size) as a coalesced
 * MMIO zone with KVM, so writes there are batched instead of causing
 * an immediate vmexit per access.  No-op if the kernel lacks
 * KVM_CAP_COALESCED_MMIO support (s->coalesced_mmio unset).
 */
static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *secion,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone = {
        .addr = start,
        .size = size,
        .pad = 0,
    };

    if (!s->coalesced_mmio) {
        return;
    }

    /* Best effort: failure to register the zone is not fatal. */
    (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
119592229a57SYang Zhong 
/*
 * MemoryListener hook: drop the coalesced MMIO zone covering
 * [start, start + size).  Counterpart of kvm_coalesce_mmio_region().
 */
static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *secion,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone = {
        .addr = start,
        .size = size,
        .pad = 0,
    };

    if (!s->coalesced_mmio) {
        return;
    }

    /* Best effort: ignore errors when tearing the zone down. */
    (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
121292229a57SYang Zhong 
/*
 * MemoryListener hook: register a coalesced PIO zone for
 * [start, start + size).  Uses the same ioctl as MMIO coalescing but
 * with zone.pio set.  No-op without kernel coalesced-PIO support.
 */
static void kvm_coalesce_pio_add(MemoryListener *listener,
                                MemoryRegionSection *section,
                                hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone = {
        .addr = start,
        .size = size,
        .pio = 1,
    };

    if (!s->coalesced_pio) {
        return;
    }

    (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
}
1229e6d34aeeSPeng Hao 
/*
 * MemoryListener hook: remove the coalesced PIO zone covering
 * [start, start + size).  Counterpart of kvm_coalesce_pio_add().
 */
static void kvm_coalesce_pio_del(MemoryListener *listener,
                                MemoryRegionSection *section,
                                hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;
    struct kvm_coalesced_mmio_zone zone = {
        .addr = start,
        .size = size,
        .pio = 1,
    };

    if (!s->coalesced_pio) {
        return;
    }

    (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
}
1246e6d34aeeSPeng Hao 
/*
 * Query a KVM capability on the global /dev/kvm fd.
 *
 * Returns the (non-negative) value reported by KVM_CHECK_EXTENSION,
 * or 0 if the ioctl fails, so a failure reads as "not supported".
 */
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    return ret < 0 ? 0 : ret;
}
125892229a57SYang Zhong 
/*
 * Query a KVM capability on the VM fd, falling back to the global
 * /dev/kvm fd when the per-VM query is not implemented.
 *
 * Returns the capability value (0 means "not supported").
 */
int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);

    if (ret >= 0) {
        return ret;
    }

    /* VM wide version not implemented, use global one instead */
    return kvm_check_extension(s, extension);
}
127192229a57SYang Zhong 
/*
 * We track the poisoned pages to be able to:
 * - replace them on VM reset
 * - block a migration for a VM with a poisoned page
 */
typedef struct HWPoisonPage {
    /* guest RAM address of the poisoned page (page-granular tracking) */
    ram_addr_t ram_addr;
    /* linkage within hwpoison_page_list below */
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

/* List of all pages currently known to be hardware-poisoned. */
static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);
12846b552b9bSDongjiu Geng 
kvm_unpoison_all(void * param)12856b552b9bSDongjiu Geng static void kvm_unpoison_all(void *param)
12866b552b9bSDongjiu Geng {
12876b552b9bSDongjiu Geng     HWPoisonPage *page, *next_page;
12886b552b9bSDongjiu Geng 
12896b552b9bSDongjiu Geng     QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
12906b552b9bSDongjiu Geng         QLIST_REMOVE(page, list);
12916b552b9bSDongjiu Geng         qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
12926b552b9bSDongjiu Geng         g_free(page);
12936b552b9bSDongjiu Geng     }
12946b552b9bSDongjiu Geng }
12956b552b9bSDongjiu Geng 
/*
 * Record @ram_addr as a poisoned page.  Duplicates are ignored, since
 * the same page may be reported poisoned more than once.
 */
void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *entry;

    QLIST_FOREACH(entry, &hwpoison_page_list, list) {
        if (entry->ram_addr == ram_addr) {
            return;
        }
    }

    entry = g_new(HWPoisonPage, 1);
    entry->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, entry, list);
}
13096b552b9bSDongjiu Geng 
kvm_hwpoisoned_mem(void)131006152b89SWilliam Roche bool kvm_hwpoisoned_mem(void)
131106152b89SWilliam Roche {
131206152b89SWilliam Roche     return !QLIST_EMPTY(&hwpoison_page_list);
131306152b89SWilliam Roche }
131406152b89SWilliam Roche 
/*
 * Convert an ioeventfd datamatch value from target to host byte order.
 * Identity when host and target endianness agree; only 2- and 4-byte
 * values need swapping.
 */
static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
    /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    if (size == 2) {
        val = bswap16(val);
    } else if (size == 4) {
        val = bswap32(val);
    }
#endif
    return val;
}
133492229a57SYang Zhong 
/*
 * Wire (or unwire) an eventfd to an MMIO address via KVM_IOEVENTFD.
 *
 * @fd: eventfd to signal on a matching write
 * @addr/@size: guest-physical MMIO range and access width
 * @val: datamatch value (target byte order), used when @datamatch
 * @assign: true to install the binding, false to remove it
 *
 * Returns 0 on success, -ENOSYS without KVM, -errno on ioctl failure.
 */
static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = (datamatch ? KVM_IOEVENTFD_FLAG_DATAMATCH : 0) |
                 (assign ? 0 : KVM_IOEVENTFD_FLAG_DEASSIGN),
        .fd = fd,
    };
    int ret;

    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
                                 datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    return ret < 0 ? -errno : 0;
}
136892229a57SYang Zhong 
kvm_set_ioeventfd_pio(int fd,uint16_t addr,uint16_t val,bool assign,uint32_t size,bool datamatch)136992229a57SYang Zhong static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
137092229a57SYang Zhong                                  bool assign, uint32_t size, bool datamatch)
137192229a57SYang Zhong {
137292229a57SYang Zhong     struct kvm_ioeventfd kick = {
137392229a57SYang Zhong         .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
137492229a57SYang Zhong         .addr = addr,
137592229a57SYang Zhong         .flags = KVM_IOEVENTFD_FLAG_PIO,
137692229a57SYang Zhong         .len = size,
137792229a57SYang Zhong         .fd = fd,
137892229a57SYang Zhong     };
137992229a57SYang Zhong     int r;
1380876d16cdSDr. David Alan Gilbert     trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
138192229a57SYang Zhong     if (!kvm_enabled()) {
138292229a57SYang Zhong         return -ENOSYS;
138392229a57SYang Zhong     }
138492229a57SYang Zhong     if (datamatch) {
138592229a57SYang Zhong         kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
138692229a57SYang Zhong     }
138792229a57SYang Zhong     if (!assign) {
138892229a57SYang Zhong         kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
138992229a57SYang Zhong     }
139092229a57SYang Zhong     r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
139192229a57SYang Zhong     if (r < 0) {
139292229a57SYang Zhong         return r;
139392229a57SYang Zhong     }
139492229a57SYang Zhong     return 0;
139592229a57SYang Zhong }
139692229a57SYang Zhong 
139792229a57SYang Zhong 
139892229a57SYang Zhong static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState * s,const KVMCapabilityInfo * list)139992229a57SYang Zhong kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
140092229a57SYang Zhong {
140192229a57SYang Zhong     while (list->name) {
140292229a57SYang Zhong         if (!kvm_check_extension(s, list->value)) {
140392229a57SYang Zhong             return list;
140492229a57SYang Zhong         }
140592229a57SYang Zhong         list++;
140692229a57SYang Zhong     }
140792229a57SYang Zhong     return NULL;
140892229a57SYang Zhong }
140992229a57SYang Zhong 
/*
 * Cap the size of individual KVM memslots; larger regions are split.
 * @max_slot_size must be a multiple of the host page size.
 */
void kvm_set_max_memslot_size(hwaddr max_slot_size)
{
    g_assert(ROUND_UP(max_slot_size, qemu_real_host_page_size()) ==
             max_slot_size);

    kvm_max_slot_size = max_slot_size;
}
1417023ae9a8SIgor Mammedov 
/*
 * Apply memory attributes @attr to guest range [start, start + size)
 * via KVM_SET_MEMORY_ATTRIBUTES.  @attr must be a subset of the
 * attributes the kernel advertised (kvm_supported_memory_attributes).
 *
 * Returns the ioctl result: 0 on success, negative on failure (an
 * error is also reported).
 */
static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
{
    struct kvm_memory_attributes attrs = {
        .attributes = attr,
        .address = start,
        .size = size,
        .flags = 0,
    };
    int r;

    assert((attr & kvm_supported_memory_attributes) == attr);

    r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
    if (r) {
        error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
                     "with attr 0x%" PRIx64 " error '%s'",
                     start, size, attr, strerror(errno));
    }

    return r;
}
14370811baedSXiaoyao Li 
/*
 * Mark [start, start + size) as guest-private memory.
 * Returns 0 on success, negative errno from the ioctl on failure.
 */
int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
{
    return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
}
14420811baedSXiaoyao Li 
/*
 * Clear all memory attributes on [start, start + size), i.e. make the
 * range shared.  Returns 0 on success, negative errno on failure.
 */
int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
{
    return kvm_set_memory_attributes(start, size, 0);
}
14470811baedSXiaoyao Li 
/*
 * Register (@add == true) or unregister (@add == false) the KVM
 * memslot(s) backing @section.  Sections larger than kvm_max_slot_size
 * are split across multiple consecutive slots, hence the do/while loops.
 *
 * Called with KVMMemoryListener.slots_lock held.
 */
static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    int err;
    MemoryRegion *mr = section->mr;
    bool writable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size, slot_size, mr_offset;
    ram_addr_t ram_start_offset;
    void *ram;

    if (!memory_region_is_ram(mr)) {
        /* Non-RAM regions get a slot only when read-only slots are usable. */
        if (writable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
    }

    /* Page-align the section; nothing to do if nothing remains. */
    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return;
    }

    /* The offset of the kvmslot within the memory region */
    mr_offset = section->offset_within_region + start_addr -
        section->offset_within_address_space;

    /* use aligned delta to align the ram address and offset */
    ram = memory_region_get_ram_ptr(mr) + mr_offset;
    ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;

    if (!add) {
        /* Unregister path: tear down every slot covering the section. */
        do {
            slot_size = MIN(kvm_max_slot_size, size);
            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
            if (!mem) {
                return;
            }
            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                /*
                 * NOTE: We should be aware of the fact that here we're only
                 * doing a best effort to sync dirty bits.  No matter whether
                 * we're using dirty log or dirty ring, we ignored two facts:
                 *
                 * (1) dirty bits can reside in hardware buffers (PML)
                 *
                 * (2) after we collected dirty bits here, pages can be dirtied
                 * again before we do the final KVM_SET_USER_MEMORY_REGION to
                 * remove the slot.
                 *
                 * Not easy.  Let's cross the fingers until it's fixed.
                 */
                if (kvm_state->kvm_dirty_ring_size) {
                    kvm_dirty_ring_reap_locked(kvm_state, NULL);
                    if (kvm_state->kvm_dirty_ring_with_bitmap) {
                        kvm_slot_sync_dirty_pages(mem);
                        kvm_slot_get_dirty_log(kvm_state, mem);
                    }
                } else {
                    kvm_slot_get_dirty_log(kvm_state, mem);
                }
                kvm_slot_sync_dirty_pages(mem);
            }

            /* unregister the slot */
            g_free(mem->dirty_bmap);
            mem->dirty_bmap = NULL;
            mem->memory_size = 0;
            mem->flags = 0;
            err = kvm_set_user_memory_region(kml, mem, false);
            if (err) {
                fprintf(stderr, "%s: error unregistering slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
            start_addr += slot_size;
            size -= slot_size;
            kml->nr_slots_used--;
        } while (size);
        return;
    }

    /* register the new slot */
    do {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_alloc_slot(kml);
        mem->as_id = kml->as_id;
        mem->memory_size = slot_size;
        mem->start_addr = start_addr;
        mem->ram_start_offset = ram_start_offset;
        mem->ram = ram;
        mem->flags = kvm_mem_flags(mr);
        mem->guest_memfd = mr->ram_block->guest_memfd;
        mem->guest_memfd_offset = (uint8_t*)ram - mr->ram_block->host;

        kvm_slot_init_dirty_bitmap(mem);
        err = kvm_set_user_memory_region(kml, mem, true);
        if (err) {
            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                    strerror(-err));
            abort();
        }

        /* guest_memfd-backed RAM starts out in the private state. */
        if (memory_region_has_guest_memfd(mr)) {
            err = kvm_set_memory_attributes_private(start_addr, slot_size);
            if (err) {
                error_report("%s: failed to set memory attribute private: %s",
                             __func__, strerror(-err));
                exit(1);
            }
        }

        start_addr += slot_size;
        ram_start_offset += slot_size;
        ram += slot_size;
        size -= slot_size;
        kml->nr_slots_used++;
    } while (size);
}
157192229a57SYang Zhong 
/*
 * Body of the "kvm-reaper" thread (see kvm_dirty_ring_reaper_init()):
 * wakes up once per second and collects dirty pages from the KVM dirty
 * rings under the BQL.  Runs forever; never returns.
 *
 * @data: the KVMState.
 */
static void *kvm_dirty_ring_reaper_thread(void *data)
{
    KVMState *s = data;
    struct KVMDirtyRingReaper *r = &s->reaper;

    rcu_register_thread();

    trace_kvm_dirty_ring_reaper("init");

    while (true) {
        r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
        trace_kvm_dirty_ring_reaper("wait");
        /*
         * TODO: provide a smarter timeout rather than a constant?
         */
        sleep(1);

        /* keep sleeping so that dirtylimit not be interfered by reaper */
        if (dirtylimit_in_service()) {
            continue;
        }

        trace_kvm_dirty_ring_reaper("wakeup");
        r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;

        /* Collect dirty pages with the BQL held. */
        bql_lock();
        kvm_dirty_ring_reap(s, NULL);
        bql_unlock();

        /* Exposed for introspection/tracing of reaper progress. */
        r->reaper_iteration++;
    }

    g_assert_not_reached();
}
1606b4420f19SPeter Xu 
/*
 * Spawn the background "kvm-reaper" thread that periodically collects
 * dirty pages from the KVM dirty rings.
 */
static void kvm_dirty_ring_reaper_init(KVMState *s)
{
    qemu_thread_create(&s->reaper.reaper_thr, "kvm-reaper",
                       kvm_dirty_ring_reaper_thread,
                       s, QEMU_THREAD_JOINABLE);
}
1615b4420f19SPeter Xu 
/*
 * Try to enable the KVM dirty ring with s->kvm_dirty_ring_size entries.
 *
 * Returns 0 when the ring was enabled, when no ring was requested, or
 * when the kernel lacks support (in which case a warning is printed and
 * dirty tracking falls back to the bitmap method).  Returns a negative
 * errno when the requested configuration cannot be enabled.
 *
 * On success s->kvm_dirty_ring_size/_bytes are (re)populated; they are
 * zeroed up front so any early return leaves the ring disabled.
 */
static int kvm_dirty_ring_init(KVMState *s)
{
    uint32_t ring_size = s->kvm_dirty_ring_size;
    uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
    unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
    int ret;

    s->kvm_dirty_ring_size = 0;
    s->kvm_dirty_ring_bytes = 0;

    /* Bail if the dirty ring size isn't specified */
    if (!ring_size) {
        return 0;
    }

    /*
     * Read the max supported pages. Fall back to dirty logging mode
     * if the dirty ring isn't supported.
     */
    ret = kvm_vm_check_extension(s, capability);
    if (ret <= 0) {
        /* Retry with the acquire/release flavour of the capability. */
        capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
        ret = kvm_vm_check_extension(s, capability);
    }

    if (ret <= 0) {
        warn_report("KVM dirty ring not available, using bitmap method");
        return 0;
    }

    /* ret is the maximum ring size in bytes the kernel accepts. */
    if (ring_bytes > ret) {
        error_report("KVM dirty ring size %" PRIu32 " too big "
                     "(maximum is %ld).  Please use a smaller value.",
                     ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
        return -EINVAL;
    }

    ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
    if (ret) {
        error_report("Enabling of KVM dirty ring failed: %s. "
                     "Suggested minimum value is 1024.", strerror(-ret));
        return -EIO;
    }

    /* Enable the backup bitmap if it is supported */
    ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
    if (ret > 0) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
        if (ret) {
            error_report("Enabling of KVM dirty ring's backup bitmap failed: "
                         "%s. ", strerror(-ret));
            return -EIO;
        }

        s->kvm_dirty_ring_with_bitmap = true;
    }

    s->kvm_dirty_ring_size = ring_size;
    s->kvm_dirty_ring_bytes = ring_bytes;

    return 0;
}
16783794cb94SGavin Shan 
/*
 * MemoryListener hook: queue @section for addition.  The actual memslot
 * registration is deferred to kvm_region_commit().
 */
static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    KVMMemoryUpdate *update = g_new0(KVMMemoryUpdate, 1);

    update->section = *section;
    QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
}
169092229a57SYang Zhong 
/*
 * MemoryListener hook: queue @section for removal.  The actual memslot
 * teardown is deferred to kvm_region_commit().
 */
static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    KVMMemoryUpdate *update = g_new0(KVMMemoryUpdate, 1);

    update->section = *section;
    QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
}
1702f39b7d2bSDavid Hildenbrand 
kvm_region_commit(MemoryListener * listener)1703f39b7d2bSDavid Hildenbrand static void kvm_region_commit(MemoryListener *listener)
1704f39b7d2bSDavid Hildenbrand {
1705f39b7d2bSDavid Hildenbrand     KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
1706f39b7d2bSDavid Hildenbrand                                           listener);
1707f39b7d2bSDavid Hildenbrand     KVMMemoryUpdate *u1, *u2;
1708f39b7d2bSDavid Hildenbrand     bool need_inhibit = false;
1709f39b7d2bSDavid Hildenbrand 
1710f39b7d2bSDavid Hildenbrand     if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
1711f39b7d2bSDavid Hildenbrand         QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1712f39b7d2bSDavid Hildenbrand         return;
1713f39b7d2bSDavid Hildenbrand     }
1714f39b7d2bSDavid Hildenbrand 
1715f39b7d2bSDavid Hildenbrand     /*
1716f39b7d2bSDavid Hildenbrand      * We have to be careful when regions to add overlap with ranges to remove.
1717f39b7d2bSDavid Hildenbrand      * We have to simulate atomic KVM memslot updates by making sure no ioctl()
1718f39b7d2bSDavid Hildenbrand      * is currently active.
1719f39b7d2bSDavid Hildenbrand      *
1720f39b7d2bSDavid Hildenbrand      * The lists are order by addresses, so it's easy to find overlaps.
1721f39b7d2bSDavid Hildenbrand      */
1722f39b7d2bSDavid Hildenbrand     u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1723f39b7d2bSDavid Hildenbrand     u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
1724f39b7d2bSDavid Hildenbrand     while (u1 && u2) {
1725f39b7d2bSDavid Hildenbrand         Range r1, r2;
1726f39b7d2bSDavid Hildenbrand 
1727f39b7d2bSDavid Hildenbrand         range_init_nofail(&r1, u1->section.offset_within_address_space,
1728f39b7d2bSDavid Hildenbrand                           int128_get64(u1->section.size));
1729f39b7d2bSDavid Hildenbrand         range_init_nofail(&r2, u2->section.offset_within_address_space,
1730f39b7d2bSDavid Hildenbrand                           int128_get64(u2->section.size));
1731f39b7d2bSDavid Hildenbrand 
1732f39b7d2bSDavid Hildenbrand         if (range_overlaps_range(&r1, &r2)) {
1733f39b7d2bSDavid Hildenbrand             need_inhibit = true;
1734f39b7d2bSDavid Hildenbrand             break;
1735f39b7d2bSDavid Hildenbrand         }
1736f39b7d2bSDavid Hildenbrand         if (range_lob(&r1) < range_lob(&r2)) {
1737f39b7d2bSDavid Hildenbrand             u1 = QSIMPLEQ_NEXT(u1, next);
1738f39b7d2bSDavid Hildenbrand         } else {
1739f39b7d2bSDavid Hildenbrand             u2 = QSIMPLEQ_NEXT(u2, next);
1740f39b7d2bSDavid Hildenbrand         }
1741f39b7d2bSDavid Hildenbrand     }
1742f39b7d2bSDavid Hildenbrand 
1743f39b7d2bSDavid Hildenbrand     kvm_slots_lock();
1744f39b7d2bSDavid Hildenbrand     if (need_inhibit) {
1745f39b7d2bSDavid Hildenbrand         accel_ioctl_inhibit_begin();
1746f39b7d2bSDavid Hildenbrand     }
1747f39b7d2bSDavid Hildenbrand 
1748f39b7d2bSDavid Hildenbrand     /* Remove all memslots before adding the new ones. */
1749f39b7d2bSDavid Hildenbrand     while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1750f39b7d2bSDavid Hildenbrand         u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1751f39b7d2bSDavid Hildenbrand         QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
1752f39b7d2bSDavid Hildenbrand 
1753f39b7d2bSDavid Hildenbrand         kvm_set_phys_mem(kml, &u1->section, false);
1754f39b7d2bSDavid Hildenbrand         memory_region_unref(u1->section.mr);
1755f39b7d2bSDavid Hildenbrand 
1756f39b7d2bSDavid Hildenbrand         g_free(u1);
1757f39b7d2bSDavid Hildenbrand     }
1758f39b7d2bSDavid Hildenbrand     while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
1759f39b7d2bSDavid Hildenbrand         u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
1760f39b7d2bSDavid Hildenbrand         QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
1761f39b7d2bSDavid Hildenbrand 
1762f39b7d2bSDavid Hildenbrand         memory_region_ref(u1->section.mr);
1763f39b7d2bSDavid Hildenbrand         kvm_set_phys_mem(kml, &u1->section, true);
1764f39b7d2bSDavid Hildenbrand 
1765f39b7d2bSDavid Hildenbrand         g_free(u1);
1766f39b7d2bSDavid Hildenbrand     }
1767f39b7d2bSDavid Hildenbrand 
1768f39b7d2bSDavid Hildenbrand     if (need_inhibit) {
1769f39b7d2bSDavid Hildenbrand         accel_ioctl_inhibit_end();
1770f39b7d2bSDavid Hildenbrand     }
1771f39b7d2bSDavid Hildenbrand     kvm_slots_unlock();
177292229a57SYang Zhong }
177392229a57SYang Zhong 
/*
 * Memory listener log_sync hook: pull KVM's dirty-page log for @section
 * into QEMU's dirty bitmaps.  Takes the slots lock around the sync to
 * protect the per-listener slot array.
 */
static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    kvm_slots_lock();
    kvm_physical_sync_dirty_bitmap(kml, section);
    kvm_slots_unlock();
}
178392229a57SYang Zhong 
/*
 * Memory listener log_sync_global hook, used when the KVM dirty ring is
 * enabled: flush the kernel dirty rings into the per-slot bitmaps, then
 * sync and reset every slot that has dirty logging enabled.
 *
 * @last_stage: true on the final migration sync round; with the
 * ring+bitmap combination this additionally fetches the kernel's
 * bitmap-based dirty log per slot.
 */
static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
{
    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i;

    /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
    kvm_dirty_ring_flush();

    kvm_slots_lock();
    for (i = 0; i < kml->nr_slots_allocated; i++) {
        mem = &kml->slots[i];
        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
            kvm_slot_sync_dirty_pages(mem);

            /* Final round with ring+bitmap: pull the bitmap log too. */
            if (s->kvm_dirty_ring_with_bitmap && last_stage &&
                kvm_slot_get_dirty_log(s, mem)) {
                kvm_slot_sync_dirty_pages(mem);
            }

            /*
             * This is not needed by KVM_GET_DIRTY_LOG because the
             * ioctl will unconditionally overwrite the whole region.
             * However kvm dirty ring has no such side effect.
             */
            kvm_slot_reset_dirty_pages(mem);
        }
    }
    kvm_slots_unlock();
}
1815b4420f19SPeter Xu 
/*
 * Memory listener log_clear hook: ask KVM to clear the dirty log for
 * @section.  A failure here is fatal -- the dirty tracking state would
 * be inconsistent -- so report once and abort.
 */
static void kvm_log_clear(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int ret = kvm_physical_log_clear(kml, section);

    if (ret >= 0) {
        return;
    }

    error_report_once("%s: kvm log clear failed: mr=%s "
                      "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
                      section->mr->name, section->offset_within_region,
                      int128_get64(section->size));
    abort();
}
1831ff4aa114SPeter Xu 
/*
 * Memory listener eventfd_add hook for MMIO: register an ioeventfd so
 * that guest writes matching @section/@data kick @e directly in the
 * kernel.  Registration failure is fatal.
 */
static void kvm_mem_ioeventfd_add(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int ret = kvm_set_ioeventfd_mmio(event_notifier_get_fd(e),
                                     section->offset_within_address_space,
                                     data, true, int128_get64(section->size),
                                     match_data);

    if (ret < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
                __func__, strerror(-ret), -ret);
        abort();
    }
}
184992229a57SYang Zhong 
/*
 * Memory listener eventfd_del hook for MMIO: deregister the ioeventfd
 * previously added for @section/@data.  Failure is fatal.
 */
static void kvm_mem_ioeventfd_del(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int ret = kvm_set_ioeventfd_mmio(event_notifier_get_fd(e),
                                     section->offset_within_address_space,
                                     data, false, int128_get64(section->size),
                                     match_data);

    if (ret < 0) {
        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
                __func__, strerror(-ret), -ret);
        abort();
    }
}
186792229a57SYang Zhong 
/*
 * Memory listener eventfd_add hook for port I/O: register an ioeventfd
 * so guest PIO writes matching @section/@data kick @e in the kernel.
 * Registration failure is fatal.
 */
static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int ret = kvm_set_ioeventfd_pio(event_notifier_get_fd(e),
                                    section->offset_within_address_space,
                                    data, true, int128_get64(section->size),
                                    match_data);

    if (ret < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
                __func__, strerror(-ret), -ret);
        abort();
    }
}
188592229a57SYang Zhong 
/*
 * Memory listener eventfd_del hook for port I/O: deregister the
 * ioeventfd previously added for @section/@data.  Failure is fatal.
 */
static void kvm_io_ioeventfd_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int ret = kvm_set_ioeventfd_pio(event_notifier_get_fd(e),
                                    section->offset_within_address_space,
                                    data, false, int128_get64(section->size),
                                    match_data);

    if (ret < 0) {
        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
                __func__, strerror(-ret), -ret);
        abort();
    }
}
190492229a57SYang Zhong 
/*
 * Initialise @kml and register it as a memory listener on @as.
 *
 * @s: global KVM state
 * @kml: the KVM memory listener to set up
 * @as: address space the listener attaches to
 * @as_id: id recorded in the listener, used to identify the address
 *         space in memslot updates
 * @name: listener name (used for tracing/debug)
 *
 * When a dirty ring is configured the listener uses the global sync
 * hook; otherwise the per-section log_sync/log_clear pair.
 */
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id, const char *name)
{
    int i;

    kml->as_id = as_id;

    /* Pre-size the slot array with the default allocation. */
    kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);

    QSIMPLEQ_INIT(&kml->transaction_add);
    QSIMPLEQ_INIT(&kml->transaction_del);

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.commit = kvm_region_commit;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
    kml->listener.name = name;

    if (s->kvm_dirty_ring_size) {
        kml->listener.log_sync_global = kvm_log_sync_global;
    } else {
        kml->listener.log_sync = kvm_log_sync;
        kml->listener.log_clear = kvm_log_clear;
    }

    memory_listener_register(&kml->listener, as);

    /* Record the as <-> listener pairing in the first free s->as[] entry. */
    for (i = 0; i < s->nr_as; ++i) {
        if (!s->as[i].as) {
            s->as[i].as = as;
            s->as[i].ml = kml;
            break;
        }
    }
}
194292229a57SYang Zhong 
/*
 * Listener for the port-I/O address space: wires up coalesced PIO and
 * the PIO ioeventfd add/del hooks.
 */
static MemoryListener kvm_io_listener = {
    .name = "kvm-io",
    .coalesced_io_add = kvm_coalesce_pio_add,
    .coalesced_io_del = kvm_coalesce_pio_del,
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
};
195192229a57SYang Zhong 
/*
 * Drive the level of @irq on the in-kernel irqchip.
 *
 * Returns 1 when only KVM_IRQ_LINE is available (no status reporting),
 * otherwise the status value filled in by the KVM_IRQ_LINE_STATUS ioctl.
 * An ioctl failure is fatal.
 */
int kvm_set_irq(KVMState *s, int irq, int level)
{
    struct kvm_irq_level event = {
        .irq = irq,
        .level = level,
    };
    int ret;

    assert(kvm_async_interrupts_enabled());

    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
    if (ret < 0) {
        perror("kvm_set_irq");
        abort();
    }

    if (s->irq_set_ioctl == KVM_IRQ_LINE) {
        return 1;
    }
    return event.status;
}
196992229a57SYang Zhong 
197092229a57SYang Zhong #ifdef KVM_CAP_IRQ_ROUTING
/*
 * One MSI routing-table entry.
 * NOTE(review): the QTAILQ_ENTRY link appears unused in the code
 * visible here -- confirm against the rest of the file before removal.
 */
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;
    QTAILQ_ENTRY(KVMMSIRoute) entry;
} KVMMSIRoute;
/* Mark @gsi as allocated in the used-GSI bitmap. */
static void set_gsi(KVMState *s, unsigned int gsi)
{
    set_bit(gsi, s->used_gsi_bitmap);
}
198092229a57SYang Zhong 
/* Mark @gsi as free again in the used-GSI bitmap. */
static void clear_gsi(KVMState *s, unsigned int gsi)
{
    clear_bit(gsi, s->used_gsi_bitmap);
}
198592229a57SYang Zhong 
/*
 * Set up GSI routing state: the used-GSI bitmap, an empty routing
 * table, and arch-specific routing initialisation.
 */
void kvm_init_irq_routing(KVMState *s)
{
    int gsi_count;

    /*
     * NOTE(review): the -1 suggests the capability value reports one
     * more than the number of usable GSIs -- confirm against the KVM
     * API documentation for KVM_CAP_IRQ_ROUTING.
     */
    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
    if (gsi_count > 0) {
        /* Track allocated GSIs in a bitmap. */
        s->used_gsi_bitmap = bitmap_new(gsi_count);
        s->gsi_count = gsi_count;
    }

    /* Start with an empty, zero-length routing table. */
    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
    s->nr_allocated_irq_routes = 0;

    kvm_arch_init_irq_routing(s);
}
200292229a57SYang Zhong 
/*
 * Push the accumulated GSI routing table to the kernel with
 * KVM_SET_GSI_ROUTING.  A no-op when GSIs map directly or GSI routing
 * is not in use; a failing ioctl trips the assertion.
 */
void kvm_irqchip_commit_routes(KVMState *s)
{
    int ret;

    if (kvm_gsi_direct_mapping() || !kvm_gsi_routing_enabled()) {
        return;
    }

    s->irq_routes->flags = 0;
    trace_kvm_irqchip_commit_routes();
    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
    assert(ret == 0);
}
202092229a57SYang Zhong 
/*
 * Append a copy of @entry to the routing table, growing the table when
 * full, and mark the entry's GSI as used.  The change only reaches the
 * kernel on the next kvm_irqchip_commit_routes().
 */
void kvm_add_routing_entry(KVMState *s,
                           struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *dest;
    int idx;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        /* Double the allocation, starting from a floor of 64 entries. */
        int count = s->nr_allocated_irq_routes * 2;
        int bytes;

        if (count < 64) {
            count = 64;
        }
        bytes = sizeof(struct kvm_irq_routing) + count * sizeof(*dest);
        s->irq_routes = g_realloc(s->irq_routes, bytes);
        s->nr_allocated_irq_routes = count;
    }

    idx = s->irq_routes->nr++;
    dest = &s->irq_routes->entries[idx];
    *dest = *entry;

    set_gsi(s, entry->gsi);
}
204492229a57SYang Zhong 
/*
 * Overwrite the existing routing entry whose GSI matches @new_entry's.
 * Returns 0 on success (including the no-change case) or -ESRCH when
 * no entry with that GSI exists.
 */
static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    int i;

    for (i = 0; i < s->irq_routes->nr; i++) {
        struct kvm_irq_routing_entry *cur = &s->irq_routes->entries[i];

        if (cur->gsi != new_entry->gsi) {
            continue;
        }

        /* Skip the copy when the entry is already up to date. */
        if (memcmp(cur, new_entry, sizeof(*cur)) != 0) {
            *cur = *new_entry;
        }

        return 0;
    }

    return -ESRCH;
}
206892229a57SYang Zhong 
/* Route GSI @irq to @pin of the given in-kernel @irqchip. */
void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
{
    struct kvm_irq_routing_entry e = {
        .gsi = irq,
        .type = KVM_IRQ_ROUTING_IRQCHIP,
        .flags = 0,
        .u.irqchip = {
            .irqchip = irqchip,
            .pin = pin,
        },
    };

    assert(pin < s->gsi_count);

    kvm_add_routing_entry(s, &e);
}
208292229a57SYang Zhong 
/*
 * Drop every routing entry bound to @virq, mark the GSI free again and
 * notify the arch hook.  A no-op for the table when GSIs map directly.
 *
 * Removal is O(1) per entry: the hole is filled by moving the table's
 * last entry into it.
 */
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
    struct kvm_irq_routing_entry *e;
    int i;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    for (i = 0; i < s->irq_routes->nr; i++) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
            /*
             * Re-examine slot @i: the entry just moved into it has not
             * been checked yet and could itself match @virq.  Without
             * this, a stale route could be left behind when several
             * entries share a GSI.
             */
            i--;
        }
    }
    clear_gsi(s, virq);
    kvm_arch_release_virq_post(virq);
    trace_kvm_irqchip_release_virq(virq);
}
210392229a57SYang Zhong 
/* Subscribe @n to irqchip routing-change notifications. */
void kvm_irqchip_add_change_notifier(Notifier *n)
{
    notifier_list_add(&kvm_irqchip_change_notifiers, n);
}
21083607715aSDavid Gibson 
/* Unsubscribe @n from irqchip routing-change notifications. */
void kvm_irqchip_remove_change_notifier(Notifier *n)
{
    notifier_remove(n);
}
21133607715aSDavid Gibson 
/* Fire all registered irqchip routing-change notifiers. */
void kvm_irqchip_change_notify(void)
{
    notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
}
21183607715aSDavid Gibson 
/*
 * Pick the lowest free GSI from the bitmap.
 * Returns the GSI number, or -ENOSPC when all GSIs are in use.
 */
int kvm_irqchip_get_virq(KVMState *s)
{
    int virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);

    return virq < s->gsi_count ? virq : -ENOSPC;
}
213192229a57SYang Zhong 
/*
 * Deliver a single MSI directly via KVM_SIGNAL_MSI, bypassing the
 * routing table.  Returns the ioctl result.
 */
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    /* Designated initializer zero-fills .flags and the .pad bytes. */
    struct kvm_msi msi = {
        .address_lo = (uint32_t)msg.address,
        .address_hi = msg.address >> 32,
        .data = le32_to_cpu(msg.data),
    };

    return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
}
214492229a57SYang Zhong 
/*
 * Allocate a virq and install an MSI route for @vector of @dev (an
 * all-zero message when @dev is NULL or PCI is unavailable).
 *
 * Returns the virq on success, a negative errno on failure.  The new
 * route only reaches the kernel once the changes recorded in @c are
 * committed; @c->changes is incremented here to request that.
 */
int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;
    KVMState *s = c->s;
    MSIMessage msg = {0, 0};

    if (pci_available && dev) {
        msg = pci_get_msi_message(dev, vector);
    }

    if (kvm_gsi_direct_mapping()) {
        /* No routing table: derive the GSI straight from the MSI data. */
        return kvm_arch_msi_data_to_gsi(msg.data);
    }

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        /* Attach the PCI requester ID when the irqchip needs a devid. */
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        /* Roll back the virq allocation on arch rejection. */
        kvm_irqchip_release_virq(s, virq);
        return -EINVAL;
    }

    if (s->irq_routes->nr < s->gsi_count) {
        trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
                                        vector, virq);

        kvm_add_routing_entry(s, &kroute);
        kvm_arch_add_msi_route_post(&kroute, vector, dev);
        c->changes++;
    } else {
        kvm_irqchip_release_virq(s, virq);
        return -ENOSPC;
    }

    return virq;
}
219892229a57SYang Zhong 
/*
 * Rewrite the MSI route already installed for @virq with the message
 * @msg.  Returns 0 on success, -ENOSYS without an in-kernel irqchip,
 * -EINVAL on arch rejection, or -ESRCH when @virq has no route.  A
 * no-op (returning 0) when GSIs map directly.
 */
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {
        .gsi = virq,
        .type = KVM_IRQ_ROUTING_MSI,
        .flags = 0,
        .u.msi = {
            .address_lo = (uint32_t)msg.address,
            .address_hi = msg.address >> 32,
            .data = le32_to_cpu(msg.data),
        },
    };

    if (kvm_gsi_direct_mapping()) {
        return 0;
    }

    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        return -EINVAL;
    }

    trace_kvm_irqchip_update_msi_route(virq);

    return kvm_update_routing_entry(s, &kroute);
}
223092229a57SYang Zhong 
/*
 * Assign or deassign an irqfd binding @event to @virq via KVM_IRQFD.
 *
 * @resample: optional level-triggered resample notifier; only valid
 *            when assigning.  With a split irqchip the resample fd is
 *            tracked in userspace instead of being handed to the
 *            kernel (see the comment below).
 *
 * Returns the KVM_IRQFD ioctl result.
 */
static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
                                    EventNotifier *resample, int virq,
                                    bool assign)
{
    int fd = event_notifier_get_fd(event);
    int rfd = resample ? event_notifier_get_fd(resample) : -1;

    struct kvm_irqfd irqfd = {
        .fd = fd,
        .gsi = virq,
        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (rfd != -1) {
        assert(assign);
        if (kvm_irqchip_is_split()) {
            /*
             * When the slow irqchip (e.g. IOAPIC) is in the
             * userspace, KVM kernel resamplefd will not work because
             * the EOI of the interrupt will be delivered to userspace
             * instead, so the KVM kernel resamplefd kick will be
             * skipped.  The userspace here mimics what the kernel
             * provides with resamplefd, remember the resamplefd and
             * kick it when we receive EOI of this IRQ.
             *
             * This is hackery because IOAPIC is mostly bypassed
             * (except EOI broadcasts) when irqfd is used.  However
             * this can bring much performance back for split irqchip
             * with INTx IRQs (for VFIO, this gives 93% perf of the
             * full fast path, which is 46% perf boost comparing to
             * the INTx slow path).
             */
            kvm_resample_fd_insert(virq, resample);
        } else {
            irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
            irqfd.resamplefd = rfd;
        }
    } else if (!assign) {
        /* Deassign: drop any userspace resample tracking for @virq. */
        if (kvm_irqchip_is_split()) {
            kvm_resample_fd_remove(virq);
        }
    }

    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
}
227692229a57SYang Zhong 
227792229a57SYang Zhong #else /* !KVM_CAP_IRQ_ROUTING */
227892229a57SYang Zhong 
/* No KVM_CAP_IRQ_ROUTING: routing setup is a no-op. */
void kvm_init_irq_routing(KVMState *s)
{
}
228292229a57SYang Zhong 
/* No KVM_CAP_IRQ_ROUTING: nothing to release. */
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}
228692229a57SYang Zhong 
/* No KVM_CAP_IRQ_ROUTING: callers must never reach this. */
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    abort();
}
229192229a57SYang Zhong 
/* No KVM_CAP_IRQ_ROUTING: MSI routes are unsupported. */
int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
{
    return -ENOSYS;
}
229692229a57SYang Zhong 
/* No KVM_CAP_IRQ_ROUTING: adapter routes are unsupported. */
int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    return -ENOSYS;
}
230192229a57SYang Zhong 
/* No KVM_CAP_IRQ_ROUTING: Hyper-V SynIC routes are unsupported. */
int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    return -ENOSYS;
}
230692229a57SYang Zhong 
/* No KVM_CAP_IRQ_ROUTING: callers must never reach this. */
static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
                                    EventNotifier *resample, int virq,
                                    bool assign)
{
    abort();
}
231392229a57SYang Zhong 
/* Stub: updating MSI routes requires KVM_CAP_IRQ_ROUTING. */
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
    return -ENOSYS;
}
231892229a57SYang Zhong #endif /* !KVM_CAP_IRQ_ROUTING */
231992229a57SYang Zhong 
/*
 * Bind event notifier @n (and optional resample notifier @rn) to GSI
 * @virq so the eventfd injects that interrupt directly in the kernel.
 */
int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                       EventNotifier *rn, int virq)
{
    return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
}
232592229a57SYang Zhong 
/* Detach the irqfd previously bound to GSI @virq for notifier @n. */
int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                          int virq)
{
    return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
}
233192229a57SYang Zhong 
/*
 * Like kvm_irqchip_add_irqfd_notifier_gsi(), but identified by qemu_irq:
 * translate @irq to its GSI via the s->gsimap table first.  Returns
 * -ENXIO when no mapping was registered for @irq.
 */
int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
                                   EventNotifier *rn, qemu_irq irq)
{
    gpointer unused_key;
    gpointer gsi_ptr;

    if (!g_hash_table_lookup_extended(s->gsimap, irq,
                                      &unused_key, &gsi_ptr)) {
        return -ENXIO;
    }

    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn,
                                              GPOINTER_TO_INT(gsi_ptr));
}
234392229a57SYang Zhong 
/*
 * Like kvm_irqchip_remove_irqfd_notifier_gsi(), but identified by
 * qemu_irq: translate @irq to its GSI via s->gsimap first.  Returns
 * -ENXIO when no mapping was registered for @irq.
 */
int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
                                      qemu_irq irq)
{
    gpointer unused_key;
    gpointer gsi_ptr;

    if (!g_hash_table_lookup_extended(s->gsimap, irq,
                                      &unused_key, &gsi_ptr)) {
        return -ENXIO;
    }

    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n,
                                                 GPOINTER_TO_INT(gsi_ptr));
}
235592229a57SYang Zhong 
/*
 * Record that @irq is backed by GSI @gsi, so the irqfd notifier helpers
 * can later translate the qemu_irq back to its GSI.
 */
void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
{
    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}
236092229a57SYang Zhong 
/*
 * Create the in-kernel irqchip (or enable the s390 variant) when the
 * host kernel supports one, and set up QEMU's irq-routing state.
 * Exits the process on unrecoverable failures; returns silently when no
 * in-kernel irqchip is available at all.
 */
static void kvm_irqchip_create(KVMState *s)
{
    int ret;

    /* The split-irqchip choice must have been resolved by kvm_init(). */
    assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        ;
    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
        /* On s390 the irqchip capability must be enabled explicitly. */
        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
        if (ret < 0) {
            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
            exit(1);
        }
    } else {
        return;
    }

    /* An in-kernel irqchip without irqfd support is not usable. */
    if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) {
        fprintf(stderr, "kvm: irqfd not implemented\n");
        exit(1);
    }

    /* First probe and see if there's a arch-specific hook to create the
     * in-kernel irqchip for us */
    ret = kvm_arch_irqchip_create(s);
    if (ret == 0) {
        if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
            error_report("Split IRQ chip mode not supported.");
            exit(1);
        } else {
            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
        }
    }
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
        exit(1);
    }

    kvm_kernel_irqchip = true;
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;
    kvm_halt_in_kernel_allowed = true;

    kvm_init_irq_routing(s);

    /* qemu_irq -> GSI map, filled by kvm_irqchip_set_qemuirq_gsi(). */
    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
}
241092229a57SYang Zhong 
241192229a57SYang Zhong /* Find number of supported CPUs using the recommended
241292229a57SYang Zhong  * procedure from the kernel API documentation to cope with
241392229a57SYang Zhong  * older kernels that may be missing capabilities.
241492229a57SYang Zhong  */
/*
 * Recommended vcpu count per the KVM_CAP_NR_VCPUS extension; falls back
 * to 4 when the kernel reports nothing (pre-capability kernels).
 */
static int kvm_recommended_vcpus(KVMState *s)
{
    int nr = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);

    if (nr != 0) {
        return nr;
    }
    return 4;
}
242092229a57SYang Zhong 
/*
 * Hard vcpu limit from KVM_CAP_MAX_VCPUS; falls back to the recommended
 * count when the capability is not reported.
 */
static int kvm_max_vcpus(KVMState *s)
{
    int nr = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);

    if (nr != 0) {
        return nr;
    }
    return kvm_recommended_vcpus(s);
}
242692229a57SYang Zhong 
/*
 * Largest valid vcpu id from KVM_CAP_MAX_VCPU_ID; falls back to the
 * vcpu-count limit when the capability is not reported.
 */
static int kvm_max_vcpu_id(KVMState *s)
{
    int id = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);

    if (id != 0) {
        return id;
    }
    return kvm_max_vcpus(s);
}
243292229a57SYang Zhong 
kvm_vcpu_id_is_valid(int vcpu_id)243392229a57SYang Zhong bool kvm_vcpu_id_is_valid(int vcpu_id)
243492229a57SYang Zhong {
24354f7f5893SPhilippe Mathieu-Daudé     KVMState *s = KVM_STATE(current_accel());
243692229a57SYang Zhong     return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
243792229a57SYang Zhong }
243892229a57SYang Zhong 
kvm_dirty_ring_enabled(void)24397786ae40SHyman Huang(黄勇) bool kvm_dirty_ring_enabled(void)
24407786ae40SHyman Huang(黄勇) {
2441e65152d5SMasato Imai     return kvm_state && kvm_state->kvm_dirty_ring_size;
24427786ae40SHyman Huang(黄勇) }
24437786ae40SHyman Huang(黄勇) 
2444467ef823SPaolo Bonzini static void query_stats_cb(StatsResultList **result, StatsTarget target,
2445cf7405bcSPaolo Bonzini                            strList *names, strList *targets, Error **errp);
2446cc01a3f4SMark Kanda static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2447cc01a3f4SMark Kanda 
/* Size of the KVM dirty ring; 0 when the dirty ring is disabled. */
uint32_t kvm_dirty_ring_size(void)
{
    return kvm_state->kvm_dirty_ring_size;
}
24524a06a7ccSHyman Huang(黄勇) 
/*
 * Issue KVM_CREATE_VM (retrying on EINTR) and return the new VM fd, or a
 * negative errno.  On failure, print arch-specific hints about the usual
 * misconfiguration causes on s390x and PPC hosts.
 */
static int do_kvm_create_vm(MachineState *ms, int type)
{
    KVMState *s;
    int ret;

    s = KVM_STATE(ms->accelerator);

    /* The ioctl can be interrupted by signals; retry until it isn't. */
    do {
        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
    } while (ret == -EINTR);

    if (ret < 0) {
        error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));

#ifdef TARGET_S390X
        if (ret == -EINVAL) {
            error_printf("Host kernel setup problem detected."
                         " Please verify:\n");
            error_printf("- for kernels supporting the"
                        " switch_amode or user_mode parameters, whether");
            error_printf(" user space is running in primary address space\n");
            error_printf("- for kernels supporting the vm.allocate_pgste"
                         " sysctl, whether it is enabled\n");
        }
#elif defined(TARGET_PPC)
        if (ret == -EINVAL) {
            error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
                         (type == 2) ? "pr" : "hv");
        }
#endif
    }

    return ret;
}
248767388078SAni Sinha 
/*
 * Determine the VM type to pass to KVM_CREATE_VM: the machine's
 * "kvm-type" property when set, else the machine class hook, else the
 * architecture default.
 */
static int find_kvm_machine_type(MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);

    if (object_property_find(OBJECT(current_machine), "kvm-type")) {
        g_autofree char *type_str =
            object_property_get_str(OBJECT(current_machine), "kvm-type",
                                    &error_abort);
        return mc->kvm_type(ms, type_str);
    }
    if (mc->kvm_type) {
        return mc->kvm_type(ms, NULL);
    }
    return kvm_arch_get_default_type(ms);
}
250667388078SAni Sinha 
/*
 * Configure dirty-page tracking: try to enable the KVM dirty ring; if it
 * is unavailable, enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 for legacy
 * bitmap dirty logging when the kernel offers it.  Returns 0 on success
 * or a negative errno from kvm_dirty_ring_init().
 */
static int kvm_setup_dirty_ring(KVMState *s)
{
    uint64_t dirty_log_manual_caps;
    int ret;

    /*
     * Enable KVM dirty ring if supported, otherwise fall back to
     * dirty logging mode
     */
    ret = kvm_dirty_ring_init(s);
    if (ret < 0) {
        return ret;
    }

    /*
     * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
     * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
     * page is wr-protected initially, which is against how kvm dirty ring is
     * usage - kvm dirty ring requires all pages are wr-protected at the very
     * beginning.  Enabling this feature for dirty ring causes data corruption.
     *
     * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
     * we may expect a higher stall time when starting the migration.  In the
     * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
     * instead of clearing dirty bit, it can be a way to explicitly wr-protect
     * guest pages.
     */
    if (!s->kvm_dirty_ring_size) {
        dirty_log_manual_caps =
            kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
        dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
                                  KVM_DIRTY_LOG_INITIALLY_SET);
        s->manual_dirty_log_protect = dirty_log_manual_caps;
        if (dirty_log_manual_caps) {
            ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
                                    dirty_log_manual_caps);
            if (ret) {
                /* Non-fatal: fall back to plain bitmap dirty logging. */
                warn_report("Trying to enable capability %"PRIu64" of "
                            "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
                            "Falling back to the legacy mode. ",
                            dirty_log_manual_caps);
                s->manual_dirty_log_protect = 0;
            }
        }
    }

    return 0;
}
255528ed7f97SAni Sinha 
/*
 * Accelerator init entry point: open /dev/kvm (or the user-specified
 * device), validate the API version, create the VM, probe capabilities,
 * and register the memory listeners.  Returns 0 on success or a negative
 * errno; some unrecoverable conditions (vcpu limits exceeded) exit(1)
 * directly.
 */
static int kvm_init(MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    /* Table of vcpu counts to validate against the host's limits. */
    const struct {
        const char *name;
        int num;
    } num_cpus[] = {
        { "SMP",          ms->smp.cpus },
        { "hotpluggable", ms->smp.max_cpus },
        { /* end of list */ }
    }, *nc = num_cpus;
    int soft_vcpus_limit, hard_vcpus_limit;
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int type;

    qemu_mutex_init(&kml_slots_lock);

    s = KVM_STATE(ms->accelerator);

    /*
     * On systems where the kernel can support different base page
     * sizes, host page size may be different from TARGET_PAGE_SIZE,
     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
     * page size for the system though.
     */
    assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());

    s->sigmask_len = 8;
    accel_blocker_init();

#ifdef TARGET_KVM_HAVE_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    QLIST_INIT(&s->kvm_parked_vcpus);
    s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        error_report("Could not access KVM kernel module: %m");
        ret = -errno;
        goto err;
    }

    /* The API version must match exactly; newer or older is rejected. */
    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret >= 0) {
            ret = -EINVAL;
        }
        error_report("kvm version too old");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        error_report("kvm version not supported");
        goto err;
    }

    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
    s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);

    /* If unspecified, use the default value */
    if (!s->nr_slots_max) {
        s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
    }

    type = find_kvm_machine_type(ms);
    if (type < 0) {
        ret = -EINVAL;
        goto err;
    }

    ret = do_kvm_create_vm(ms, type);
    if (ret < 0) {
        goto err;
    }

    s->vmfd = ret;

    s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
    if (s->nr_as <= 1) {
        s->nr_as = 1;
    }
    s->as = g_new0(struct KVMAs, s->nr_as);

    /* check the vcpu limits */
    soft_vcpus_limit = kvm_recommended_vcpus(s);
    hard_vcpus_limit = kvm_max_vcpus(s);

    while (nc->name) {
        if (nc->num > soft_vcpus_limit) {
            warn_report("Number of %s cpus requested (%d) exceeds "
                        "the recommended cpus supported by KVM (%d)",
                        nc->name, nc->num, soft_vcpus_limit);

            if (nc->num > hard_vcpus_limit) {
                error_report("Number of %s cpus requested (%d) exceeds "
                             "the maximum cpus supported by KVM (%d)",
                             nc->name, nc->num, hard_vcpus_limit);
                exit(1);
            }
        }
        nc++;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        error_report("kvm does not support %s", missing_cap->name);
        error_printf("%s", upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_pio = s->coalesced_mmio &&
                       kvm_check_extension(s, KVM_CAP_COALESCED_PIO);

    ret = kvm_setup_dirty_ring(s);
    if (ret < 0) {
        goto err;
    }

#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif
    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);

    s->irq_set_ioctl = KVM_IRQ_LINE;
    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
    }

    kvm_readonly_mem_allowed =
        (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);

    kvm_resamplefds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);

    kvm_vm_attributes_allowed =
        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);

#ifdef TARGET_KVM_HAVE_GUEST_DEBUG
    kvm_has_guest_debug =
        (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
#endif

    kvm_sstep_flags = 0;
    if (kvm_has_guest_debug) {
        kvm_sstep_flags = SSTEP_ENABLE;

#if defined TARGET_KVM_HAVE_GUEST_DEBUG
        int guest_debug_flags =
            kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);

        if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
            kvm_sstep_flags |= SSTEP_NOIRQ;
        }
#endif
    }

    kvm_state = s;

    ret = kvm_arch_init(ms, s);
    if (ret < 0) {
        goto err;
    }

    kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
    kvm_guest_memfd_supported =
        kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
        kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
        (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);

    /* Resolve "auto" split-irqchip to the machine's default. */
    if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
        s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
    }

    qemu_register_reset(kvm_unpoison_all, NULL);
    qemu_register_reset(kvm_reset_parked_vcpus, s);

    if (s->kernel_irqchip_allowed) {
        kvm_irqchip_create(s);
    }

    s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
    s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
    s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
    s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;

    kvm_memory_listener_register(s, &s->memory_listener,
                                 &address_space_memory, 0, "kvm-memory");
    memory_listener_register(&kvm_io_listener,
                             &address_space_io);

    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
    if (!s->sync_mmu) {
        ret = ram_block_discard_disable(true);
        assert(!ret);
    }

    if (s->kvm_dirty_ring_size) {
        kvm_dirty_ring_reaper_init(s);
    }

    if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
        add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
                            query_stats_schemas_cb);
    }

    return 0;

err:
    /* Unwind whatever was set up before the failure. */
    assert(ret < 0);
    if (s->vmfd >= 0) {
        close(s->vmfd);
    }
    if (s->fd != -1) {
        close(s->fd);
    }
    g_free(s->as);
    g_free(s->memory_listener.slots);

    return ret;
}
278792229a57SYang Zhong 
/* Record the signal-mask length to pass with KVM_SET_SIGNAL_MASK-style
 * ioctls; defaults to 8 in kvm_init(). */
void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
{
    s->sigmask_len = sigmask_len;
}
279292229a57SYang Zhong 
/*
 * Replay a KVM_EXIT_IO request: perform @count accesses of @size bytes
 * at I/O port @port, reading from or writing to the packed buffer @data
 * depending on @direction (KVM_EXIT_IO_OUT writes to the device).
 */
static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    uint8_t *cursor = data;
    uint32_t n;

    for (n = 0; n < count; n++, cursor += size) {
        address_space_rw(&address_space_io, port, attrs, cursor, size,
                         direction == KVM_EXIT_IO_OUT);
    }
}
280692229a57SYang Zhong 
/*
 * Report a KVM_EXIT_INTERNAL_ERROR, dumping the diagnostic words the
 * kernel supplied.  Returns EXCP_INTERRUPT when the architecture allows
 * continuing after an emulation failure, -1 otherwise (stops the vcpu).
 */
static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
{
    int i;

    fprintf(stderr, "KVM internal error. Suberror: %d\n",
            run->internal.suberror);

    for (i = 0; i < run->internal.ndata; ++i) {
        fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
                i, (uint64_t)run->internal.data[i]);
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(cpu)) {
            cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
283092229a57SYang Zhong 
/*
 * Drain the kernel's coalesced MMIO/PIO ring, replaying each deferred
 * access through the QEMU memory API.  Guarded against reentrancy: a
 * replayed access may itself end up calling this function.
 */
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;

    if (!s || s->coalesced_flush_in_progress) {
        return;
    }

    s->coalesced_flush_in_progress = true;

    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            if (ent->pio == 1) {
                address_space_write(&address_space_io, ent->phys_addr,
                                    MEMTXATTRS_UNSPECIFIED, ent->data,
                                    ent->len);
            } else {
                cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            }
            /* Complete the replay before advancing the ring head. */
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }

    s->coalesced_flush_in_progress = false;
}
286292229a57SYang Zhong 
/*
 * run_on_cpu() callback: fetch the vcpu's register state from KVM into
 * QEMU's CPUState, unless QEMU's copy is already dirty (newer) or the
 * guest's state is protected and cannot be read.  A fetch failure stops
 * the VM with an internal error.
 */
static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
        Error *err = NULL;
        int ret = kvm_arch_get_registers(cpu, &err);
        if (ret) {
            if (err) {
                error_reportf_err(err, "Failed to synchronize CPU state: ");
            } else {
                error_report("Failed to get registers: %s", strerror(-ret));
            }

            cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
            vm_stop(RUN_STATE_INTERNAL_ERROR);
        }

        /* QEMU's copy is now authoritative until written back. */
        cpu->vcpu_dirty = true;
    }
}
288292229a57SYang Zhong 
/*
 * Ensure QEMU's CPUState mirrors the in-kernel vCPU state, fetching it
 * on the vCPU's own thread if it is not already up to date.
 */
void kvm_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->vcpu_dirty || kvm_state->guest_state_protected) {
        return;
    }
    run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
288992229a57SYang Zhong 
/*
 * run_on_cpu() callback: push the post-reset register state into KVM
 * (KVM_PUT_RESET_STATE).  On failure the error is reported and the VM
 * is stopped in RUN_STATE_INTERNAL_ERROR.
 */
static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
    Error *err = NULL;
    int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
    if (ret) {
        if (err) {
            /* Fix: message previously read "Restoring resisters". */
            error_reportf_err(err, "Restoring registers after reset: ");
        } else {
            error_report("Failed to put registers after reset: %s",
                         strerror(-ret));
        }
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    /* KVM now holds the authoritative copy. */
    cpu->vcpu_dirty = false;
}
290792229a57SYang Zhong 
/* Schedule the post-reset register write-back on the vCPU's own thread. */
void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
291292229a57SYang Zhong 
/*
 * run_on_cpu() callback: push the full initial register state into KVM
 * (KVM_PUT_FULL_STATE).  A failure here is fatal to the process, since
 * the guest cannot start with unsynchronized state.
 */
static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
    Error *err = NULL;
    int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);

    if (ret == 0) {
        /* Success: KVM now holds the authoritative copy. */
        cpu->vcpu_dirty = false;
        return;
    }

    if (err) {
        error_reportf_err(err, "Putting registers after init: ");
    } else {
        error_report("Failed to put registers after init: %s",
                     strerror(-ret));
    }
    exit(1);
}
292992229a57SYang Zhong 
/* Push the fully initialized vCPU state to KVM, unless the guest's
 * state is protected and cannot be written from the host. */
void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
    if (kvm_state->guest_state_protected) {
        return;
    }

    /*
     * This runs before the machine_init_done notifiers, and is the last
     * opportunity to synchronize the state of confidential guests.
     */
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
294092229a57SYang Zhong 
/*
 * run_on_cpu() callback: mark QEMU's copy of the register state dirty so
 * that state loaded from a snapshot/migration stream is written back to
 * KVM before the next KVM_RUN (see the vcpu_dirty check in kvm_cpu_exec).
 */
static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
294592229a57SYang Zhong 
/* Flag the vCPU dirty ahead of loadvm, on the vCPU's own thread. */
void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
295092229a57SYang Zhong 
#ifdef KVM_HAVE_MCE_INJECTION
/*
 * Per-thread record of a SIGBUS (machine check) taken while in KVM_RUN;
 * consumed in kvm_cpu_exec(), which forwards it to
 * kvm_arch_on_sigbus_vcpu() under the BQL and then clears the flag.
 */
static __thread void *pending_sigbus_addr;
static __thread int pending_sigbus_code;
static __thread bool have_sigbus_pending;
#endif
295692229a57SYang Zhong 
/* Request that KVM_RUN return immediately on (re)entry for @cpu. */
static void kvm_cpu_kick(CPUState *cpu)
{
    qatomic_set(&cpu->kvm_run->immediate_exit, 1);
}
296192229a57SYang Zhong 
/*
 * Kick the calling vCPU thread out of KVM_RUN: via the kvm_run
 * immediate_exit flag when available, otherwise by self-signalling.
 */
static void kvm_cpu_kick_self(void)
{
    if (kvm_immediate_exit) {
        kvm_cpu_kick(current_cpu);
    } else {
        qemu_cpu_kick_self();
    }
}
297092229a57SYang Zhong 
/*
 * Consume any pending kick so it does not linger after KVM_RUN has
 * already exited.  With immediate-exit support the kick is just a flag
 * in kvm_run, so clearing it (with the documented barrier) suffices;
 * otherwise drain pending SIG_IPI signals for this thread.
 */
static void kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    if (kvm_immediate_exit) {
        qatomic_set(&cpu->kvm_run->immediate_exit, 0);
        /* Write kvm_run->immediate_exit before the cpu->exit_request
         * write in kvm_cpu_exec.
         */
        smp_wmb();
        return;
    }

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* Zero timeout: poll rather than block.  Loop until SIG_IPI is no
     * longer pending for this thread. */
    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI));
}
300592229a57SYang Zhong 
/*
 * Convert guest memory [start, start+size) between private and shared
 * for a confidential guest, updating the KVM memory attributes and
 * discarding the backing pages on the side that is no longer in use.
 *
 * Returns 0 on success, negative on failure.  Shared-direction requests
 * against unassigned or non-guest_memfd regions are tolerated (see the
 * inline comments on the vMMIO cases below).
 */
int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
{
    MemoryRegionSection section;
    ram_addr_t offset;
    MemoryRegion *mr;
    RAMBlock *rb;
    void *addr;
    int ret = -1;

    trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");

    /* Conversion must cover whole host pages. */
    if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
        !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
        return -1;
    }

    if (!size) {
        return -1;
    }

    section = memory_region_find(get_system_memory(), start, size);
    mr = section.mr;
    if (!mr) {
        /*
         * Ignore converting non-assigned region to shared.
         *
         * TDX requires vMMIO region to be shared to inject #VE to guest.
         * OVMF issues conservatively MapGPA(shared) on 32bit PCI MMIO region,
         * and vIO-APIC 0xFEC00000 4K page.
         * OVMF assigns 32bit PCI MMIO region to
         * [top of low memory: typically 2GB=0xC000000,  0xFC00000)
         */
        if (!to_private) {
            return 0;
        }
        return -1;
    }

    if (!memory_region_has_guest_memfd(mr)) {
        /*
         * Because vMMIO region must be shared, guest TD may convert vMMIO
         * region to shared explicitly.  Don't complain such case.  See
         * memory_region_type() for checking if the region is MMIO region.
         */
        if (!to_private &&
            !memory_region_is_ram(mr) &&
            !memory_region_is_ram_device(mr) &&
            !memory_region_is_rom(mr) &&
            !memory_region_is_romd(mr)) {
            ret = 0;
        } else {
            error_report("Convert non guest_memfd backed memory region "
                        "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
                        start, size, to_private ? "private" : "shared");
        }
        goto out_unref;
    }

    /* Flip the KVM memory attribute first; only discard on success. */
    if (to_private) {
        ret = kvm_set_memory_attributes_private(start, size);
    } else {
        ret = kvm_set_memory_attributes_shared(start, size);
    }
    if (ret) {
        goto out_unref;
    }

    addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
    rb = qemu_ram_block_from_host(addr, false, &offset);

    if (to_private) {
        if (rb->page_size != qemu_real_host_page_size()) {
            /*
             * shared memory is backed by hugetlb, which is supposed to be
             * pre-allocated and doesn't need to be discarded
             */
            goto out_unref;
        }
        ret = ram_block_discard_range(rb, offset, size);
    } else {
        ret = ram_block_discard_guest_memfd_range(rb, offset, size);
    }

out_unref:
    /* Drop the reference taken by memory_region_find(). */
    memory_region_unref(mr);
    return ret;
}
3093c15e5684SChao Peng 
/*
 * Outer vCPU execution loop: write back dirty register state, enter
 * KVM_RUN, and dispatch each exit reason until an event requires
 * returning to the main loop.
 *
 * Entered with the BQL held; the lock is dropped around KVM_RUN and the
 * exit handlers marked "outside BQL", and re-taken before returning.
 *
 * Returns an EXCP_* code (EXCP_HLT, EXCP_INTERRUPT) or a negative value
 * on fatal error, in which case the VM is stopped in
 * RUN_STATE_INTERNAL_ERROR.
 */
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    trace_kvm_cpu_exec();

    if (kvm_arch_process_async_events(cpu)) {
        qatomic_set(&cpu->exit_request, 0);
        return EXCP_HLT;
    }

    bql_unlock();
    cpu_exec_start(cpu);

    do {
        MemTxAttrs attrs;

        if (cpu->vcpu_dirty) {
            Error *err = NULL;
            ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
            if (ret) {
                if (err) {
                    /* NOTE(review): the message says "after init" but this
                     * is the runtime path (KVM_PUT_RUNTIME_STATE); looks
                     * copy-pasted from the post-init helper — confirm. */
                    error_reportf_err(err, "Putting registers after init: ");
                } else {
                    error_report("Failed to put registers after init: %s",
                                 strerror(-ret));
                }
                ret = -1;
                break;
            }

            cpu->vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (qatomic_read(&cpu->exit_request)) {
            trace_kvm_interrupt_exit_request();
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            kvm_cpu_kick_self();
        }

        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
         * Matching barrier in kvm_eat_signals.
         */
        smp_rmb();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);

#ifdef KVM_HAVE_MCE_INJECTION
        /* Deliver a SIGBUS recorded by the signal handler while in
         * KVM_RUN; must be done under the BQL. */
        if (unlikely(have_sigbus_pending)) {
            bql_lock();
            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
                                    pending_sigbus_addr);
            have_sigbus_pending = false;
            bql_unlock();
        }
#endif

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                trace_kvm_io_window_exit();
                kvm_eat_signals(cpu);
                ret = EXCP_INTERRUPT;
                break;
            }
            /* -EFAULT paired with KVM_EXIT_MEMORY_FAULT is handled by the
             * exit-reason switch below; anything else is fatal. */
            if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
                fprintf(stderr, "error: kvm run failed %s\n",
                        strerror(-run_ret));
#ifdef TARGET_PPC
                if (run_ret == -EBUSY) {
                    fprintf(stderr,
                            "This is probably because your SMT is enabled.\n"
                            "VCPU can only run on primary threads with all "
                            "secondary threads offline.\n");
                }
#endif
                ret = -1;
                break;
            }
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        case KVM_EXIT_DIRTY_RING_FULL:
            /*
             * We shouldn't continue if the dirty ring of this vcpu is
             * still full.  Got kicked by KVM_RESET_DIRTY_RINGS.
             */
            trace_kvm_dirty_ring_full(cpu->cpu_index);
            bql_lock();
            /*
             * We throttle vCPU by making it sleep once it exit from kernel
             * due to dirty ring full. In the dirtylimit scenario, reaping
             * all vCPUs after a single vCPU dirty ring get full result in
             * the miss of sleep, so just reap the ring-fulled vCPU.
             */
            if (dirtylimit_in_service()) {
                kvm_dirty_ring_reap(kvm_state, cpu);
            } else {
                kvm_dirty_ring_reap(kvm_state, NULL);
            }
            bql_unlock();
            dirtylimit_vcpu_execute(cpu);
            ret = 0;
            break;
        case KVM_EXIT_SYSTEM_EVENT:
            trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                bql_lock();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                bql_unlock();
                ret = 0;
                break;
            default:
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        case KVM_EXIT_MEMORY_FAULT:
            trace_kvm_memory_fault(run->memory_fault.gpa,
                                   run->memory_fault.size,
                                   run->memory_fault.flags);
            if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
                error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
                             (uint64_t)run->memory_fault.flags);
                ret = -1;
                break;
            }
            ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
                                     run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
            break;
        default:
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    bql_lock();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    qatomic_set(&cpu->exit_request, 0);
    return ret;
}
329292229a57SYang Zhong 
/*
 * Issue an ioctl on the global KVM fd, taking a single optional pointer
 * argument.  Returns the ioctl result, or -errno on failure.
 */
int kvm_ioctl(KVMState *s, unsigned long type, ...)
{
    va_list ap;
    void *arg;
    int ret;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_ioctl(type, arg);
    ret = ioctl(s->fd, type, arg);
    return ret == -1 ? -errno : ret;
}
331092229a57SYang Zhong 
/*
 * Issue an ioctl on the VM fd, bracketed by the accel ioctl
 * begin/end hooks.  Returns the ioctl result, or -errno on failure.
 */
int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
{
    va_list ap;
    void *arg;
    int ret;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vm_ioctl(type, arg);
    accel_ioctl_begin();
    ret = ioctl(s->vmfd, type, arg);
    accel_ioctl_end();
    return ret == -1 ? -errno : ret;
}
333092229a57SYang Zhong 
/*
 * Issue an ioctl on a vCPU fd, bracketed by the per-CPU accel ioctl
 * begin/end hooks.  Returns the ioctl result, or -errno on failure.
 */
int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
{
    va_list ap;
    void *arg;
    int ret;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
    accel_cpu_ioctl_begin(cpu);
    ret = ioctl(cpu->kvm_fd, type, arg);
    accel_cpu_ioctl_end(cpu);
    return ret == -1 ? -errno : ret;
}
335092229a57SYang Zhong 
/*
 * Issue an ioctl on a KVM device fd, bracketed by the accel ioctl
 * begin/end hooks.  Returns the ioctl result, or -errno on failure.
 */
int kvm_device_ioctl(int fd, unsigned long type, ...)
{
    va_list ap;
    void *arg;
    int ret;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_device_ioctl(fd, type, arg);
    accel_ioctl_begin();
    ret = ioctl(fd, type, arg);
    accel_ioctl_end();
    return ret == -1 ? -errno : ret;
}
337092229a57SYang Zhong 
/*
 * Probe whether the VM supports a kvm_device_attr (group, attr).
 * Returns 1 if supported, 0 otherwise (including when VM attributes
 * are not allowed at all).
 */
int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute) == 0;
}
338792229a57SYang Zhong 
/*
 * Probe whether a KVM device fd supports a kvm_device_attr
 * (group, attr).  Returns 1 if supported, 0 otherwise.
 */
int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
        .flags = 0,
    };

    /* KVM_HAS_DEVICE_ATTR returns 0 when the attribute exists. */
    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) == 0;
}
339892229a57SYang Zhong 
kvm_device_access(int fd,int group,uint64_t attr,void * val,bool write,Error ** errp)339992229a57SYang Zhong int kvm_device_access(int fd, int group, uint64_t attr,
340092229a57SYang Zhong                       void *val, bool write, Error **errp)
340192229a57SYang Zhong {
340292229a57SYang Zhong     struct kvm_device_attr kvmattr;
340392229a57SYang Zhong     int err;
340492229a57SYang Zhong 
340592229a57SYang Zhong     kvmattr.flags = 0;
340692229a57SYang Zhong     kvmattr.group = group;
340792229a57SYang Zhong     kvmattr.attr = attr;
340892229a57SYang Zhong     kvmattr.addr = (uintptr_t)val;
340992229a57SYang Zhong 
341092229a57SYang Zhong     err = kvm_device_ioctl(fd,
341192229a57SYang Zhong                            write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
341292229a57SYang Zhong                            &kvmattr);
341392229a57SYang Zhong     if (err < 0) {
341492229a57SYang Zhong         error_setg_errno(errp, -err,
341592229a57SYang Zhong                          "KVM_%s_DEVICE_ATTR failed: Group %d "
341692229a57SYang Zhong                          "attr 0x%016" PRIx64,
341792229a57SYang Zhong                          write ? "SET" : "GET", group, attr);
341892229a57SYang Zhong     }
341992229a57SYang Zhong     return err;
342092229a57SYang Zhong }
342192229a57SYang Zhong 
/* Accessor for the cached sync_mmu flag in the global KVM state. */
bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}
342692229a57SYang Zhong 
/* Accessor for the cached vcpu_events capability flag. */
int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}
343192229a57SYang Zhong 
/* Accessor for the cached maximum nested-state buffer length. */
int kvm_max_nested_state_length(void)
{
    return kvm_state->max_nested_state_len;
}
3436ebbfef2fSLiran Alon 
/* Query KVM_CAP_IRQ_ROUTING; always false when the host headers lack
 * GSI routing support entirely. */
int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}
344592229a57SYang Zhong 
/* Query KVM_CAP_ARM_USER_IRQ on the current KVM instance. */
bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}
34505d721b78SAlexander Graf 
34511e1e4879SPaolo Bonzini #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
/* Return the software breakpoint registered at guest address @pc, or
 * NULL if no breakpoint matches. */
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}
346392229a57SYang Zhong 
/* Nonzero when at least one software breakpoint is currently installed. */
int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}
346892229a57SYang Zhong 
/* Payload handed to kvm_invoke_set_guest_debug() through run_on_cpu(). */
struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg; /* control flags for KVM_SET_GUEST_DEBUG */
    int err;                    /* ioctl result, read back by the requester */
};
347392229a57SYang Zhong 
/*
 * run_on_cpu() callback: issue KVM_SET_GUEST_DEBUG on the target vCPU's
 * own thread and stash the result in the shared payload.
 */
static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *payload = data.host_ptr;

    payload->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, &payload->dbg);
}
348292229a57SYang Zhong 
/*
 * Push the current guest-debug configuration (single-step state plus any
 * @reinject_trap flags) into KVM for @cpu.  Returns the ioctl result.
 */
int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data payload;

    payload.dbg.control = reinject_trap;
    if (cpu->singlestep_enabled) {
        payload.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
        if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
            /* Suppress interrupt delivery while stepping. */
            payload.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
        }
    }
    kvm_arch_update_guest_debug(cpu, &payload.dbg);

    /* The ioctl must run on the vCPU's own thread. */
    run_on_cpu(cpu, kvm_invoke_set_guest_debug, RUN_ON_CPU_HOST_PTR(&payload));
    return payload.err;
}
350292229a57SYang Zhong 
kvm_supports_guest_debug(void)3503a48e7d9eSAlex Bennée bool kvm_supports_guest_debug(void)
3504a48e7d9eSAlex Bennée {
3505a48e7d9eSAlex Bennée     /* probed during kvm_init() */
3506a48e7d9eSAlex Bennée     return kvm_has_guest_debug;
3507a48e7d9eSAlex Bennée }
3508a48e7d9eSAlex Bennée 
/*
 * Install a gdbstub breakpoint.  Software breakpoints are refcounted and
 * tracked in the per-VM list; hardware breakpoints are delegated to the
 * architecture hook.  On success the guest-debug state of every vCPU is
 * refreshed.  Returns 0 or a negative errno.
 */
int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
{
    int ret;

    if (type == GDB_BREAKPOINT_SW) {
        struct kvm_sw_breakpoint *bp = kvm_find_sw_breakpoint(cpu, addr);

        if (bp) {
            /* Already present: just bump the reference count. */
            bp->use_count++;
            return 0;
        }

        bp = g_new(struct kvm_sw_breakpoint, 1);
        bp->pc = addr;
        bp->use_count = 1;
        ret = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (ret) {
            g_free(bp);
            return ret;
        }
        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        ret = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (ret) {
            return ret;
        }
    }

    CPU_FOREACH(cpu) {
        ret = kvm_update_guest_debug(cpu, 0);
        if (ret) {
            return ret;
        }
    }
    return 0;
}
354692229a57SYang Zhong 
/*
 * Remove a gdbstub breakpoint installed by kvm_insert_breakpoint().
 * A software breakpoint is only torn down once its reference count drops
 * to zero.  Returns 0, -ENOENT if unknown, or a negative errno.
 */
int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
{
    int ret;

    if (type == GDB_BREAKPOINT_SW) {
        struct kvm_sw_breakpoint *bp = kvm_find_sw_breakpoint(cpu, addr);

        if (!bp) {
            return -ENOENT;
        }
        if (bp->use_count > 1) {
            /* Other users remain; keep the breakpoint armed. */
            bp->use_count--;
            return 0;
        }

        ret = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (ret) {
            return ret;
        }
        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        ret = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (ret) {
            return ret;
        }
    }

    CPU_FOREACH(cpu) {
        ret = kvm_update_guest_debug(cpu, 0);
        if (ret) {
            return ret;
        }
    }
    return 0;
}
358592229a57SYang Zhong 
/*
 * Drop every breakpoint (software and hardware) for the VM, then refresh
 * the guest-debug state on all vCPUs.  Best effort: if @cpu cannot remove
 * a software breakpoint, every other vCPU is tried in turn.
 */
void kvm_remove_all_breakpoints(CPUState *cpu)
{
    KVMState *s = cpu->kvm_state;
    struct kvm_sw_breakpoint *bp, *next_bp;
    CPUState *other;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next_bp) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(other) {
                if (kvm_arch_remove_sw_breakpoint(other, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}
361092229a57SYang Zhong 
36111e1e4879SPaolo Bonzini #endif /* !TARGET_KVM_HAVE_GUEST_DEBUG */
361292229a57SYang Zhong 
/*
 * Install @sigset as the signal mask KVM applies while the vCPU runs,
 * via KVM_SET_SIGNAL_MASK.  Returns the ioctl result.
 */
static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    struct kvm_signal_mask *wrapped;
    int ret;

    /* The kernel structure carries the mask inline after a length field. */
    wrapped = g_malloc(sizeof(*wrapped) + sizeof(*sigset));
    wrapped->len = kvm_state->sigmask_len;
    memcpy(wrapped->sigset, sigset, sizeof(*sigset));

    ret = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, wrapped);
    g_free(wrapped);

    return ret;
}
362892229a57SYang Zhong 
kvm_ipi_signal(int sig)362992229a57SYang Zhong static void kvm_ipi_signal(int sig)
363092229a57SYang Zhong {
363192229a57SYang Zhong     if (current_cpu) {
363292229a57SYang Zhong         assert(kvm_immediate_exit);
363392229a57SYang Zhong         kvm_cpu_kick(current_cpu);
363492229a57SYang Zhong     }
363592229a57SYang Zhong }
363692229a57SYang Zhong 
/*
 * Set up per-vCPU-thread signal handling: install the SIG_IPI handler and
 * arrange for SIG_IPI (and SIGBUS, when MCE injection is supported) to be
 * deliverable while the vCPU executes.  Exits QEMU on failure.
 */
void kvm_init_cpu_signals(CPUState *cpu)
{
    struct sigaction action;
    sigset_t mask;
    int ret;

    memset(&action, 0, sizeof(action));
    action.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &action, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &mask);
#if defined KVM_HAVE_MCE_INJECTION
    /* SIGBUS must reach this thread synchronously for MCE forwarding. */
    sigdelset(&mask, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &mask, NULL);
#endif
    sigdelset(&mask, SIG_IPI);
    if (kvm_immediate_exit) {
        ret = pthread_sigmask(SIG_SETMASK, &mask, NULL);
    } else {
        /* Legacy path: let KVM swap the mask around KVM_RUN. */
        ret = kvm_set_signal_mask(cpu, &mask);
    }
    if (ret) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-ret));
        exit(1);
    }
}
366392229a57SYang Zhong 
366492229a57SYang Zhong /* Called asynchronously in VCPU thread.  */
kvm_on_sigbus_vcpu(CPUState * cpu,int code,void * addr)366592229a57SYang Zhong int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
366692229a57SYang Zhong {
366792229a57SYang Zhong #ifdef KVM_HAVE_MCE_INJECTION
366892229a57SYang Zhong     if (have_sigbus_pending) {
366992229a57SYang Zhong         return 1;
367092229a57SYang Zhong     }
367192229a57SYang Zhong     have_sigbus_pending = true;
367292229a57SYang Zhong     pending_sigbus_addr = addr;
367392229a57SYang Zhong     pending_sigbus_code = code;
3674d73415a3SStefan Hajnoczi     qatomic_set(&cpu->exit_request, 1);
367592229a57SYang Zhong     return 0;
367692229a57SYang Zhong #else
367792229a57SYang Zhong     return 1;
367892229a57SYang Zhong #endif
367992229a57SYang Zhong }
368092229a57SYang Zhong 
/* Called synchronously (via signalfd) in main thread.  */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}
369692229a57SYang Zhong 
/*
 * Create (or, with @test set, merely probe for) an in-kernel device of
 * @type.  Returns the new device fd, 0 for a successful probe, or a
 * negative errno (-ENOTSUP when KVM_CAP_DEVICE_CTRL is missing).
 */
int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = test ? KVM_CREATE_DEVICE_TEST : 0,
    };
    int ret;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}
371792229a57SYang Zhong 
kvm_device_supported(int vmfd,uint64_t type)371892229a57SYang Zhong bool kvm_device_supported(int vmfd, uint64_t type)
371992229a57SYang Zhong {
372092229a57SYang Zhong     struct kvm_create_device create_dev = {
372192229a57SYang Zhong         .type = type,
372292229a57SYang Zhong         .fd = -1,
372392229a57SYang Zhong         .flags = KVM_CREATE_DEVICE_TEST,
372492229a57SYang Zhong     };
372592229a57SYang Zhong 
372692229a57SYang Zhong     if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
372792229a57SYang Zhong         return false;
372892229a57SYang Zhong     }
372992229a57SYang Zhong 
373092229a57SYang Zhong     return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
373192229a57SYang Zhong }
373292229a57SYang Zhong 
/*
 * Write one vCPU register identified by @id from the buffer at @source
 * via KVM_SET_ONE_REG.  Failures are traced; returns the ioctl result.
 */
int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg = {
        .id = id,
        .addr = (uintptr_t)source,
    };
    int ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);

    if (ret) {
        trace_kvm_failed_reg_set(id, strerror(-ret));
    }
    return ret;
}
374692229a57SYang Zhong 
/*
 * Read one vCPU register identified by @id into the buffer at @target
 * via KVM_GET_ONE_REG.  Failures are traced; returns the ioctl result.
 */
int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg = {
        .id = id,
        .addr = (uintptr_t)target,
    };
    int ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);

    if (ret) {
        trace_kvm_failed_reg_get(id, strerror(-ret));
    }
    return ret;
}
376092229a57SYang Zhong 
kvm_accel_has_memory(MachineState * ms,AddressSpace * as,hwaddr start_addr,hwaddr size)37618072aae3SAlexey Kardashevskiy static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
37628072aae3SAlexey Kardashevskiy                                  hwaddr start_addr, hwaddr size)
37638072aae3SAlexey Kardashevskiy {
37648072aae3SAlexey Kardashevskiy     KVMState *kvm = KVM_STATE(ms->accelerator);
37658072aae3SAlexey Kardashevskiy     int i;
37668072aae3SAlexey Kardashevskiy 
37678072aae3SAlexey Kardashevskiy     for (i = 0; i < kvm->nr_as; ++i) {
37688072aae3SAlexey Kardashevskiy         if (kvm->as[i].as == as && kvm->as[i].ml) {
3769023ae9a8SIgor Mammedov             size = MIN(kvm_max_slot_size, size);
37708072aae3SAlexey Kardashevskiy             return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
37718072aae3SAlexey Kardashevskiy                                                     start_addr, size);
37728072aae3SAlexey Kardashevskiy         }
37738072aae3SAlexey Kardashevskiy     }
37748072aae3SAlexey Kardashevskiy 
37758072aae3SAlexey Kardashevskiy     return false;
37768072aae3SAlexey Kardashevskiy }
37778072aae3SAlexey Kardashevskiy 
kvm_get_kvm_shadow_mem(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)377823b0898eSPaolo Bonzini static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
377923b0898eSPaolo Bonzini                                    const char *name, void *opaque,
378023b0898eSPaolo Bonzini                                    Error **errp)
378123b0898eSPaolo Bonzini {
378223b0898eSPaolo Bonzini     KVMState *s = KVM_STATE(obj);
378323b0898eSPaolo Bonzini     int64_t value = s->kvm_shadow_mem;
378423b0898eSPaolo Bonzini 
378523b0898eSPaolo Bonzini     visit_type_int(v, name, &value, errp);
378623b0898eSPaolo Bonzini }
378723b0898eSPaolo Bonzini 
kvm_set_kvm_shadow_mem(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)378823b0898eSPaolo Bonzini static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
378923b0898eSPaolo Bonzini                                    const char *name, void *opaque,
379023b0898eSPaolo Bonzini                                    Error **errp)
379123b0898eSPaolo Bonzini {
379223b0898eSPaolo Bonzini     KVMState *s = KVM_STATE(obj);
379323b0898eSPaolo Bonzini     int64_t value;
379423b0898eSPaolo Bonzini 
379570cbae42SPaolo Bonzini     if (s->fd != -1) {
379670cbae42SPaolo Bonzini         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
379770cbae42SPaolo Bonzini         return;
379870cbae42SPaolo Bonzini     }
379970cbae42SPaolo Bonzini 
3800668f62ecSMarkus Armbruster     if (!visit_type_int(v, name, &value, errp)) {
380123b0898eSPaolo Bonzini         return;
380223b0898eSPaolo Bonzini     }
380323b0898eSPaolo Bonzini 
380423b0898eSPaolo Bonzini     s->kvm_shadow_mem = value;
380523b0898eSPaolo Bonzini }
380623b0898eSPaolo Bonzini 
kvm_set_kernel_irqchip(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)380711bc4a13SPaolo Bonzini static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
380811bc4a13SPaolo Bonzini                                    const char *name, void *opaque,
380911bc4a13SPaolo Bonzini                                    Error **errp)
381011bc4a13SPaolo Bonzini {
381111bc4a13SPaolo Bonzini     KVMState *s = KVM_STATE(obj);
381211bc4a13SPaolo Bonzini     OnOffSplit mode;
381311bc4a13SPaolo Bonzini 
381470cbae42SPaolo Bonzini     if (s->fd != -1) {
381570cbae42SPaolo Bonzini         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
381670cbae42SPaolo Bonzini         return;
381770cbae42SPaolo Bonzini     }
381870cbae42SPaolo Bonzini 
381914217038SMarkus Armbruster     if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
382011bc4a13SPaolo Bonzini         return;
382114217038SMarkus Armbruster     }
382211bc4a13SPaolo Bonzini     switch (mode) {
382311bc4a13SPaolo Bonzini     case ON_OFF_SPLIT_ON:
382411bc4a13SPaolo Bonzini         s->kernel_irqchip_allowed = true;
382511bc4a13SPaolo Bonzini         s->kernel_irqchip_required = true;
3826d1972be1SXiaoyao Li         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
382711bc4a13SPaolo Bonzini         break;
382811bc4a13SPaolo Bonzini     case ON_OFF_SPLIT_OFF:
382911bc4a13SPaolo Bonzini         s->kernel_irqchip_allowed = false;
383011bc4a13SPaolo Bonzini         s->kernel_irqchip_required = false;
3831d1972be1SXiaoyao Li         s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
383211bc4a13SPaolo Bonzini         break;
383311bc4a13SPaolo Bonzini     case ON_OFF_SPLIT_SPLIT:
383411bc4a13SPaolo Bonzini         s->kernel_irqchip_allowed = true;
383511bc4a13SPaolo Bonzini         s->kernel_irqchip_required = true;
3836d1972be1SXiaoyao Li         s->kernel_irqchip_split = ON_OFF_AUTO_ON;
383711bc4a13SPaolo Bonzini         break;
383811bc4a13SPaolo Bonzini     default:
383911bc4a13SPaolo Bonzini         /* The value was checked in visit_type_OnOffSplit() above. If
384011bc4a13SPaolo Bonzini          * we get here, then something is wrong in QEMU.
384111bc4a13SPaolo Bonzini          */
384211bc4a13SPaolo Bonzini         abort();
384311bc4a13SPaolo Bonzini     }
384411bc4a13SPaolo Bonzini }
384511bc4a13SPaolo Bonzini 
kvm_kernel_irqchip_allowed(void)38464376c40dSPaolo Bonzini bool kvm_kernel_irqchip_allowed(void)
38474376c40dSPaolo Bonzini {
384811bc4a13SPaolo Bonzini     return kvm_state->kernel_irqchip_allowed;
38494376c40dSPaolo Bonzini }
38504376c40dSPaolo Bonzini 
kvm_kernel_irqchip_required(void)38514376c40dSPaolo Bonzini bool kvm_kernel_irqchip_required(void)
38524376c40dSPaolo Bonzini {
385311bc4a13SPaolo Bonzini     return kvm_state->kernel_irqchip_required;
38544376c40dSPaolo Bonzini }
38554376c40dSPaolo Bonzini 
kvm_kernel_irqchip_split(void)38564376c40dSPaolo Bonzini bool kvm_kernel_irqchip_split(void)
38574376c40dSPaolo Bonzini {
3858d1972be1SXiaoyao Li     return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
38594376c40dSPaolo Bonzini }
38604376c40dSPaolo Bonzini 
kvm_get_dirty_ring_size(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)38612ea5cb0aSPeter Xu static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
38622ea5cb0aSPeter Xu                                     const char *name, void *opaque,
38632ea5cb0aSPeter Xu                                     Error **errp)
38642ea5cb0aSPeter Xu {
38652ea5cb0aSPeter Xu     KVMState *s = KVM_STATE(obj);
38662ea5cb0aSPeter Xu     uint32_t value = s->kvm_dirty_ring_size;
38672ea5cb0aSPeter Xu 
38682ea5cb0aSPeter Xu     visit_type_uint32(v, name, &value, errp);
38692ea5cb0aSPeter Xu }
38702ea5cb0aSPeter Xu 
kvm_set_dirty_ring_size(Object * obj,Visitor * v,const char * name,void * opaque,Error ** errp)38712ea5cb0aSPeter Xu static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
38722ea5cb0aSPeter Xu                                     const char *name, void *opaque,
38732ea5cb0aSPeter Xu                                     Error **errp)
38742ea5cb0aSPeter Xu {
38752ea5cb0aSPeter Xu     KVMState *s = KVM_STATE(obj);
38762ea5cb0aSPeter Xu     uint32_t value;
38772ea5cb0aSPeter Xu 
38782ea5cb0aSPeter Xu     if (s->fd != -1) {
38792ea5cb0aSPeter Xu         error_setg(errp, "Cannot set properties after the accelerator has been initialized");
38802ea5cb0aSPeter Xu         return;
38812ea5cb0aSPeter Xu     }
38822ea5cb0aSPeter Xu 
3883d1c81c34SMarkus Armbruster     if (!visit_type_uint32(v, name, &value, errp)) {
38842ea5cb0aSPeter Xu         return;
38852ea5cb0aSPeter Xu     }
38862ea5cb0aSPeter Xu     if (value & (value - 1)) {
38872ea5cb0aSPeter Xu         error_setg(errp, "dirty-ring-size must be a power of two.");
38882ea5cb0aSPeter Xu         return;
38892ea5cb0aSPeter Xu     }
38902ea5cb0aSPeter Xu 
38912ea5cb0aSPeter Xu     s->kvm_dirty_ring_size = value;
38922ea5cb0aSPeter Xu }
38932ea5cb0aSPeter Xu 
kvm_get_device(Object * obj,Error ** errp G_GNUC_UNUSED)3894aef158b0SDaan De Meyer static char *kvm_get_device(Object *obj,
3895aef158b0SDaan De Meyer                             Error **errp G_GNUC_UNUSED)
3896aef158b0SDaan De Meyer {
3897aef158b0SDaan De Meyer     KVMState *s = KVM_STATE(obj);
3898aef158b0SDaan De Meyer 
3899aef158b0SDaan De Meyer     return g_strdup(s->device);
3900aef158b0SDaan De Meyer }
3901aef158b0SDaan De Meyer 
kvm_set_device(Object * obj,const char * value,Error ** errp G_GNUC_UNUSED)3902aef158b0SDaan De Meyer static void kvm_set_device(Object *obj,
3903aef158b0SDaan De Meyer                            const char *value,
3904aef158b0SDaan De Meyer                            Error **errp G_GNUC_UNUSED)
3905aef158b0SDaan De Meyer {
3906aef158b0SDaan De Meyer     KVMState *s = KVM_STATE(obj);
3907aef158b0SDaan De Meyer 
3908aef158b0SDaan De Meyer     g_free(s->device);
3909aef158b0SDaan De Meyer     s->device = g_strdup(value);
3910aef158b0SDaan De Meyer }
3911aef158b0SDaan De Meyer 
kvm_set_kvm_rapl(Object * obj,bool value,Error ** errp)39120418f908SAnthony Harivel static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
39130418f908SAnthony Harivel {
39140418f908SAnthony Harivel     KVMState *s = KVM_STATE(obj);
39150418f908SAnthony Harivel     s->msr_energy.enable = value;
39160418f908SAnthony Harivel }
39170418f908SAnthony Harivel 
kvm_set_kvm_rapl_socket_path(Object * obj,const char * str,Error ** errp)39180418f908SAnthony Harivel static void kvm_set_kvm_rapl_socket_path(Object *obj,
39190418f908SAnthony Harivel                                          const char *str,
39200418f908SAnthony Harivel                                          Error **errp)
39210418f908SAnthony Harivel {
39220418f908SAnthony Harivel     KVMState *s = KVM_STATE(obj);
39230418f908SAnthony Harivel     g_free(s->msr_energy.socket_path);
39240418f908SAnthony Harivel     s->msr_energy.socket_path = g_strdup(str);
39250418f908SAnthony Harivel }
39260418f908SAnthony Harivel 
/* QOM instance_init: establish the defaults for a fresh KVMState. */
static void kvm_accel_instance_init(Object *obj)
{
    KVMState *s = KVM_STATE(obj);

    /* File descriptors start invalid until kvm_init() opens them. */
    s->fd = -1;
    s->vmfd = -1;
    s->kvm_shadow_mem = -1;
    s->kernel_irqchip_allowed = true;
    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
    /* KVM dirty ring is by default off */
    s->kvm_dirty_ring_size = 0;
    s->kvm_dirty_ring_with_bitmap = false;
    s->kvm_eager_split_size = 0;
    s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
    s->notify_window = 0;
    s->xen_version = 0;
    s->xen_gnttab_max_frames = 64;
    s->xen_evtchn_max_pirq = 256;
    s->device = NULL;
    s->msr_energy.enable = false;
}
394823b0898eSPaolo Bonzini 
39493b7a9388SAlex Bennée /**
39503b7a9388SAlex Bennée  * kvm_gdbstub_sstep_flags():
39513b7a9388SAlex Bennée  *
39523b7a9388SAlex Bennée  * Returns: SSTEP_* flags that KVM supports for guest debug. The
39533b7a9388SAlex Bennée  * support is probed during kvm_init()
39543b7a9388SAlex Bennée  */
kvm_gdbstub_sstep_flags(void)39553b7a9388SAlex Bennée static int kvm_gdbstub_sstep_flags(void)
39563b7a9388SAlex Bennée {
39573b7a9388SAlex Bennée     return kvm_sstep_flags;
39583b7a9388SAlex Bennée }
39593b7a9388SAlex Bennée 
/*
 * QOM class_init for the KVM accelerator: wire up the AccelClass hooks
 * and register the user-configurable properties, then hand off to the
 * per-architecture class init.
 */
static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;
    ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, kvm_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure KVM in-kernel irqchip");

    object_class_property_add(oc, "kvm-shadow-mem", "int",
        kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
        NULL, NULL);
    object_class_property_set_description(oc, "kvm-shadow-mem",
        "KVM shadow MMU size");

    object_class_property_add(oc, "dirty-ring-size", "uint32",
        kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
        NULL, NULL);
    object_class_property_set_description(oc, "dirty-ring-size",
        "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");

    object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device);
    object_class_property_set_description(oc, "device",
        "Path to the device node to use (default: /dev/kvm)");

    object_class_property_add_bool(oc, "rapl",
                                   NULL,
                                   kvm_set_kvm_rapl);
    object_class_property_set_description(oc, "rapl",
        "Allow energy related MSRs for RAPL interface in Guest");

    object_class_property_add_str(oc, "rapl-helper-socket", NULL,
                                  kvm_set_kvm_rapl_socket_path);
    /* Fixed typo in the user-visible description ("comminucating"). */
    object_class_property_set_description(oc, "rapl-helper-socket",
        "Socket Path for communicating with the Virtual MSR helper daemon");

    kvm_arch_accel_class_init(oc);
}
400492229a57SYang Zhong 
/* QOM type registration record for the KVM accelerator. */
static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .instance_init = kvm_accel_instance_init,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};
401292229a57SYang Zhong 
/* Register the KVM accelerator type with QOM at module-init time. */
static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
4019cc01a3f4SMark Kanda 
/* Arguments threaded through the KVM statistics collection helpers. */
typedef struct StatsArgs {
    union StatsResultsType {
        StatsResultList **stats;   /* output list when collecting values */
        StatsSchemaList **schema;  /* output list when collecting schemas */
    } result;
    strList *names;                /* optional filter of stat names */
    Error **errp;
} StatsArgs;
4028cc01a3f4SMark Kanda 
add_kvmstat_entry(struct kvm_stats_desc * pdesc,uint64_t * stats_data,StatsList * stats_list,Error ** errp)4029cc01a3f4SMark Kanda static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
4030cc01a3f4SMark Kanda                                     uint64_t *stats_data,
4031cc01a3f4SMark Kanda                                     StatsList *stats_list,
4032cc01a3f4SMark Kanda                                     Error **errp)
4033cc01a3f4SMark Kanda {
4034cc01a3f4SMark Kanda 
4035cc01a3f4SMark Kanda     Stats *stats;
4036cc01a3f4SMark Kanda     uint64List *val_list = NULL;
4037cc01a3f4SMark Kanda 
4038cc01a3f4SMark Kanda     /* Only add stats that we understand.  */
4039cc01a3f4SMark Kanda     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
4040cc01a3f4SMark Kanda     case KVM_STATS_TYPE_CUMULATIVE:
4041cc01a3f4SMark Kanda     case KVM_STATS_TYPE_INSTANT:
4042cc01a3f4SMark Kanda     case KVM_STATS_TYPE_PEAK:
4043cc01a3f4SMark Kanda     case KVM_STATS_TYPE_LINEAR_HIST:
4044cc01a3f4SMark Kanda     case KVM_STATS_TYPE_LOG_HIST:
4045cc01a3f4SMark Kanda         break;
4046cc01a3f4SMark Kanda     default:
4047cc01a3f4SMark Kanda         return stats_list;
4048cc01a3f4SMark Kanda     }
4049cc01a3f4SMark Kanda 
4050cc01a3f4SMark Kanda     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
4051cc01a3f4SMark Kanda     case KVM_STATS_UNIT_NONE:
4052cc01a3f4SMark Kanda     case KVM_STATS_UNIT_BYTES:
4053cc01a3f4SMark Kanda     case KVM_STATS_UNIT_CYCLES:
4054cc01a3f4SMark Kanda     case KVM_STATS_UNIT_SECONDS:
4055105bb7cdSPaolo Bonzini     case KVM_STATS_UNIT_BOOLEAN:
4056cc01a3f4SMark Kanda         break;
4057cc01a3f4SMark Kanda     default:
4058cc01a3f4SMark Kanda         return stats_list;
4059cc01a3f4SMark Kanda     }
4060cc01a3f4SMark Kanda 
4061cc01a3f4SMark Kanda     switch (pdesc->flags & KVM_STATS_BASE_MASK) {
4062cc01a3f4SMark Kanda     case KVM_STATS_BASE_POW10:
4063cc01a3f4SMark Kanda     case KVM_STATS_BASE_POW2:
4064cc01a3f4SMark Kanda         break;
4065cc01a3f4SMark Kanda     default:
4066cc01a3f4SMark Kanda         return stats_list;
4067cc01a3f4SMark Kanda     }
4068cc01a3f4SMark Kanda 
4069cc01a3f4SMark Kanda     /* Alloc and populate data list */
4070cc01a3f4SMark Kanda     stats = g_new0(Stats, 1);
4071cc01a3f4SMark Kanda     stats->name = g_strdup(pdesc->name);
407244fd9cf6SZhao Liu     stats->value = g_new0(StatsValue, 1);
4073cc01a3f4SMark Kanda 
4074105bb7cdSPaolo Bonzini     if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
4075105bb7cdSPaolo Bonzini         stats->value->u.boolean = *stats_data;
4076105bb7cdSPaolo Bonzini         stats->value->type = QTYPE_QBOOL;
4077105bb7cdSPaolo Bonzini     } else if (pdesc->size == 1) {
4078cc01a3f4SMark Kanda         stats->value->u.scalar = *stats_data;
4079cc01a3f4SMark Kanda         stats->value->type = QTYPE_QNUM;
4080cc01a3f4SMark Kanda     } else {
4081cc01a3f4SMark Kanda         int i;
4082cc01a3f4SMark Kanda         for (i = 0; i < pdesc->size; i++) {
4083cc01a3f4SMark Kanda             QAPI_LIST_PREPEND(val_list, stats_data[i]);
4084cc01a3f4SMark Kanda         }
4085cc01a3f4SMark Kanda         stats->value->u.list = val_list;
4086cc01a3f4SMark Kanda         stats->value->type = QTYPE_QLIST;
4087cc01a3f4SMark Kanda     }
4088cc01a3f4SMark Kanda 
4089cc01a3f4SMark Kanda     QAPI_LIST_PREPEND(stats_list, stats);
4090cc01a3f4SMark Kanda     return stats_list;
4091cc01a3f4SMark Kanda }
4092cc01a3f4SMark Kanda 
add_kvmschema_entry(struct kvm_stats_desc * pdesc,StatsSchemaValueList * list,Error ** errp)4093cc01a3f4SMark Kanda static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
4094cc01a3f4SMark Kanda                                                  StatsSchemaValueList *list,
4095cc01a3f4SMark Kanda                                                  Error **errp)
4096cc01a3f4SMark Kanda {
4097cc01a3f4SMark Kanda     StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
4098cc01a3f4SMark Kanda     schema_entry->value = g_new0(StatsSchemaValue, 1);
4099cc01a3f4SMark Kanda 
4100cc01a3f4SMark Kanda     switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
4101cc01a3f4SMark Kanda     case KVM_STATS_TYPE_CUMULATIVE:
4102cc01a3f4SMark Kanda         schema_entry->value->type = STATS_TYPE_CUMULATIVE;
4103cc01a3f4SMark Kanda         break;
4104cc01a3f4SMark Kanda     case KVM_STATS_TYPE_INSTANT:
4105cc01a3f4SMark Kanda         schema_entry->value->type = STATS_TYPE_INSTANT;
4106cc01a3f4SMark Kanda         break;
4107cc01a3f4SMark Kanda     case KVM_STATS_TYPE_PEAK:
4108cc01a3f4SMark Kanda         schema_entry->value->type = STATS_TYPE_PEAK;
4109cc01a3f4SMark Kanda         break;
4110cc01a3f4SMark Kanda     case KVM_STATS_TYPE_LINEAR_HIST:
4111cc01a3f4SMark Kanda         schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
4112cc01a3f4SMark Kanda         schema_entry->value->bucket_size = pdesc->bucket_size;
4113cc01a3f4SMark Kanda         schema_entry->value->has_bucket_size = true;
4114cc01a3f4SMark Kanda         break;
4115cc01a3f4SMark Kanda     case KVM_STATS_TYPE_LOG_HIST:
4116cc01a3f4SMark Kanda         schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
4117cc01a3f4SMark Kanda         break;
4118cc01a3f4SMark Kanda     default:
4119cc01a3f4SMark Kanda         goto exit;
4120cc01a3f4SMark Kanda     }
4121cc01a3f4SMark Kanda 
4122cc01a3f4SMark Kanda     switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
4123cc01a3f4SMark Kanda     case KVM_STATS_UNIT_NONE:
4124cc01a3f4SMark Kanda         break;
4125105bb7cdSPaolo Bonzini     case KVM_STATS_UNIT_BOOLEAN:
4126105bb7cdSPaolo Bonzini         schema_entry->value->has_unit = true;
4127105bb7cdSPaolo Bonzini         schema_entry->value->unit = STATS_UNIT_BOOLEAN;
4128105bb7cdSPaolo Bonzini         break;
4129cc01a3f4SMark Kanda     case KVM_STATS_UNIT_BYTES:
4130cc01a3f4SMark Kanda         schema_entry->value->has_unit = true;
4131cc01a3f4SMark Kanda         schema_entry->value->unit = STATS_UNIT_BYTES;
4132cc01a3f4SMark Kanda         break;
4133cc01a3f4SMark Kanda     case KVM_STATS_UNIT_CYCLES:
4134cc01a3f4SMark Kanda         schema_entry->value->has_unit = true;
4135cc01a3f4SMark Kanda         schema_entry->value->unit = STATS_UNIT_CYCLES;
4136cc01a3f4SMark Kanda         break;
4137cc01a3f4SMark Kanda     case KVM_STATS_UNIT_SECONDS:
4138cc01a3f4SMark Kanda         schema_entry->value->has_unit = true;
4139cc01a3f4SMark Kanda         schema_entry->value->unit = STATS_UNIT_SECONDS;
4140cc01a3f4SMark Kanda         break;
4141cc01a3f4SMark Kanda     default:
4142cc01a3f4SMark Kanda         goto exit;
4143cc01a3f4SMark Kanda     }
4144cc01a3f4SMark Kanda 
4145cc01a3f4SMark Kanda     schema_entry->value->exponent = pdesc->exponent;
4146cc01a3f4SMark Kanda     if (pdesc->exponent) {
4147cc01a3f4SMark Kanda         switch (pdesc->flags & KVM_STATS_BASE_MASK) {
4148cc01a3f4SMark Kanda         case KVM_STATS_BASE_POW10:
4149cc01a3f4SMark Kanda             schema_entry->value->has_base = true;
4150cc01a3f4SMark Kanda             schema_entry->value->base = 10;
4151cc01a3f4SMark Kanda             break;
4152cc01a3f4SMark Kanda         case KVM_STATS_BASE_POW2:
4153cc01a3f4SMark Kanda             schema_entry->value->has_base = true;
4154cc01a3f4SMark Kanda             schema_entry->value->base = 2;
4155cc01a3f4SMark Kanda             break;
4156cc01a3f4SMark Kanda         default:
4157cc01a3f4SMark Kanda             goto exit;
4158cc01a3f4SMark Kanda         }
4159cc01a3f4SMark Kanda     }
4160cc01a3f4SMark Kanda 
4161cc01a3f4SMark Kanda     schema_entry->value->name = g_strdup(pdesc->name);
4162cc01a3f4SMark Kanda     schema_entry->next = list;
4163cc01a3f4SMark Kanda     return schema_entry;
4164cc01a3f4SMark Kanda exit:
4165cc01a3f4SMark Kanda     g_free(schema_entry->value);
4166cc01a3f4SMark Kanda     g_free(schema_entry);
4167cc01a3f4SMark Kanda     return list;
4168cc01a3f4SMark Kanda }
4169cc01a3f4SMark Kanda 
/* Cached stats descriptors */
typedef struct StatsDescriptors {
    const char *ident; /* cache key, currently the StatsTarget */
    struct kvm_stats_desc *kvm_stats_desc; /* array read from the stats fd */
    struct kvm_stats_header kvm_stats_header; /* header read from the stats fd */
    QTAILQ_ENTRY(StatsDescriptors) next;
} StatsDescriptors;

/* Process-wide cache, keyed by 'ident'; entries are inserted and never removed. */
static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
    QTAILQ_HEAD_INITIALIZER(stats_descriptors);
4180cc01a3f4SMark Kanda 
4181cc01a3f4SMark Kanda /*
4182cc01a3f4SMark Kanda  * Return the descriptors for 'target', that either have already been read
4183cc01a3f4SMark Kanda  * or are retrieved from 'stats_fd'.
4184cc01a3f4SMark Kanda  */
find_stats_descriptors(StatsTarget target,int stats_fd,Error ** errp)4185cc01a3f4SMark Kanda static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
4186cc01a3f4SMark Kanda                                                 Error **errp)
4187cc01a3f4SMark Kanda {
4188cc01a3f4SMark Kanda     StatsDescriptors *descriptors;
4189cc01a3f4SMark Kanda     const char *ident;
4190cc01a3f4SMark Kanda     struct kvm_stats_desc *kvm_stats_desc;
4191cc01a3f4SMark Kanda     struct kvm_stats_header *kvm_stats_header;
4192cc01a3f4SMark Kanda     size_t size_desc;
4193cc01a3f4SMark Kanda     ssize_t ret;
4194cc01a3f4SMark Kanda 
4195cc01a3f4SMark Kanda     ident = StatsTarget_str(target);
4196cc01a3f4SMark Kanda     QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
4197cc01a3f4SMark Kanda         if (g_str_equal(descriptors->ident, ident)) {
4198cc01a3f4SMark Kanda             return descriptors;
4199cc01a3f4SMark Kanda         }
4200cc01a3f4SMark Kanda     }
4201cc01a3f4SMark Kanda 
4202cc01a3f4SMark Kanda     descriptors = g_new0(StatsDescriptors, 1);
4203cc01a3f4SMark Kanda 
4204cc01a3f4SMark Kanda     /* Read stats header */
420521adec30SPaolo Bonzini     kvm_stats_header = &descriptors->kvm_stats_header;
42063b6f4852SMarcelo Tosatti     ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
4207cc01a3f4SMark Kanda     if (ret != sizeof(*kvm_stats_header)) {
4208cc01a3f4SMark Kanda         error_setg(errp, "KVM stats: failed to read stats header: "
4209cc01a3f4SMark Kanda                    "expected %zu actual %zu",
4210cc01a3f4SMark Kanda                    sizeof(*kvm_stats_header), ret);
4211f696b74bSMiaoqian Lin         g_free(descriptors);
4212cc01a3f4SMark Kanda         return NULL;
4213cc01a3f4SMark Kanda     }
4214cc01a3f4SMark Kanda     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4215cc01a3f4SMark Kanda 
4216cc01a3f4SMark Kanda     /* Read stats descriptors */
4217cc01a3f4SMark Kanda     kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
4218cc01a3f4SMark Kanda     ret = pread(stats_fd, kvm_stats_desc,
4219cc01a3f4SMark Kanda                 size_desc * kvm_stats_header->num_desc,
4220cc01a3f4SMark Kanda                 kvm_stats_header->desc_offset);
4221cc01a3f4SMark Kanda 
4222cc01a3f4SMark Kanda     if (ret != size_desc * kvm_stats_header->num_desc) {
4223cc01a3f4SMark Kanda         error_setg(errp, "KVM stats: failed to read stats descriptors: "
4224cc01a3f4SMark Kanda                    "expected %zu actual %zu",
4225cc01a3f4SMark Kanda                    size_desc * kvm_stats_header->num_desc, ret);
4226cc01a3f4SMark Kanda         g_free(descriptors);
4227cc01a3f4SMark Kanda         g_free(kvm_stats_desc);
4228cc01a3f4SMark Kanda         return NULL;
4229cc01a3f4SMark Kanda     }
4230cc01a3f4SMark Kanda     descriptors->kvm_stats_desc = kvm_stats_desc;
4231cc01a3f4SMark Kanda     descriptors->ident = ident;
4232cc01a3f4SMark Kanda     QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
4233cc01a3f4SMark Kanda     return descriptors;
4234cc01a3f4SMark Kanda }
4235cc01a3f4SMark Kanda 
query_stats(StatsResultList ** result,StatsTarget target,strList * names,int stats_fd,CPUState * cpu,Error ** errp)4236cc01a3f4SMark Kanda static void query_stats(StatsResultList **result, StatsTarget target,
42373b6f4852SMarcelo Tosatti                         strList *names, int stats_fd, CPUState *cpu,
42383b6f4852SMarcelo Tosatti                         Error **errp)
4239cc01a3f4SMark Kanda {
4240cc01a3f4SMark Kanda     struct kvm_stats_desc *kvm_stats_desc;
4241cc01a3f4SMark Kanda     struct kvm_stats_header *kvm_stats_header;
4242cc01a3f4SMark Kanda     StatsDescriptors *descriptors;
4243cc01a3f4SMark Kanda     g_autofree uint64_t *stats_data = NULL;
4244cc01a3f4SMark Kanda     struct kvm_stats_desc *pdesc;
4245cc01a3f4SMark Kanda     StatsList *stats_list = NULL;
4246cc01a3f4SMark Kanda     size_t size_desc, size_data = 0;
4247cc01a3f4SMark Kanda     ssize_t ret;
4248cc01a3f4SMark Kanda     int i;
4249cc01a3f4SMark Kanda 
4250cc01a3f4SMark Kanda     descriptors = find_stats_descriptors(target, stats_fd, errp);
4251cc01a3f4SMark Kanda     if (!descriptors) {
4252cc01a3f4SMark Kanda         return;
4253cc01a3f4SMark Kanda     }
4254cc01a3f4SMark Kanda 
425521adec30SPaolo Bonzini     kvm_stats_header = &descriptors->kvm_stats_header;
4256cc01a3f4SMark Kanda     kvm_stats_desc = descriptors->kvm_stats_desc;
4257cc01a3f4SMark Kanda     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4258cc01a3f4SMark Kanda 
4259cc01a3f4SMark Kanda     /* Tally the total data size; read schema data */
4260cc01a3f4SMark Kanda     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4261cc01a3f4SMark Kanda         pdesc = (void *)kvm_stats_desc + i * size_desc;
4262cc01a3f4SMark Kanda         size_data += pdesc->size * sizeof(*stats_data);
4263cc01a3f4SMark Kanda     }
4264cc01a3f4SMark Kanda 
4265cc01a3f4SMark Kanda     stats_data = g_malloc0(size_data);
4266cc01a3f4SMark Kanda     ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);
4267cc01a3f4SMark Kanda 
4268cc01a3f4SMark Kanda     if (ret != size_data) {
4269cc01a3f4SMark Kanda         error_setg(errp, "KVM stats: failed to read data: "
4270cc01a3f4SMark Kanda                    "expected %zu actual %zu", size_data, ret);
4271cc01a3f4SMark Kanda         return;
4272cc01a3f4SMark Kanda     }
4273cc01a3f4SMark Kanda 
4274cc01a3f4SMark Kanda     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4275cc01a3f4SMark Kanda         uint64_t *stats;
4276cc01a3f4SMark Kanda         pdesc = (void *)kvm_stats_desc + i * size_desc;
4277cc01a3f4SMark Kanda 
4278cc01a3f4SMark Kanda         /* Add entry to the list */
4279cc01a3f4SMark Kanda         stats = (void *)stats_data + pdesc->offset;
4280cf7405bcSPaolo Bonzini         if (!apply_str_list_filter(pdesc->name, names)) {
4281cf7405bcSPaolo Bonzini             continue;
4282cf7405bcSPaolo Bonzini         }
4283cc01a3f4SMark Kanda         stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
4284cc01a3f4SMark Kanda     }
4285cc01a3f4SMark Kanda 
4286cc01a3f4SMark Kanda     if (!stats_list) {
4287cc01a3f4SMark Kanda         return;
4288cc01a3f4SMark Kanda     }
4289cc01a3f4SMark Kanda 
4290cc01a3f4SMark Kanda     switch (target) {
4291cc01a3f4SMark Kanda     case STATS_TARGET_VM:
4292cc01a3f4SMark Kanda         add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
4293cc01a3f4SMark Kanda         break;
4294cc01a3f4SMark Kanda     case STATS_TARGET_VCPU:
4295cc01a3f4SMark Kanda         add_stats_entry(result, STATS_PROVIDER_KVM,
42963b6f4852SMarcelo Tosatti                         cpu->parent_obj.canonical_path,
4297cc01a3f4SMark Kanda                         stats_list);
4298cc01a3f4SMark Kanda         break;
4299cc01a3f4SMark Kanda     default:
4300d12dd9c7SPeter Maydell         g_assert_not_reached();
4301cc01a3f4SMark Kanda     }
4302cc01a3f4SMark Kanda }
4303cc01a3f4SMark Kanda 
query_stats_schema(StatsSchemaList ** result,StatsTarget target,int stats_fd,Error ** errp)4304cc01a3f4SMark Kanda static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
4305cc01a3f4SMark Kanda                                int stats_fd, Error **errp)
4306cc01a3f4SMark Kanda {
4307cc01a3f4SMark Kanda     struct kvm_stats_desc *kvm_stats_desc;
4308cc01a3f4SMark Kanda     struct kvm_stats_header *kvm_stats_header;
4309cc01a3f4SMark Kanda     StatsDescriptors *descriptors;
4310cc01a3f4SMark Kanda     struct kvm_stats_desc *pdesc;
4311cc01a3f4SMark Kanda     StatsSchemaValueList *stats_list = NULL;
4312cc01a3f4SMark Kanda     size_t size_desc;
4313cc01a3f4SMark Kanda     int i;
4314cc01a3f4SMark Kanda 
4315cc01a3f4SMark Kanda     descriptors = find_stats_descriptors(target, stats_fd, errp);
4316cc01a3f4SMark Kanda     if (!descriptors) {
4317cc01a3f4SMark Kanda         return;
4318cc01a3f4SMark Kanda     }
4319cc01a3f4SMark Kanda 
432021adec30SPaolo Bonzini     kvm_stats_header = &descriptors->kvm_stats_header;
4321cc01a3f4SMark Kanda     kvm_stats_desc = descriptors->kvm_stats_desc;
4322cc01a3f4SMark Kanda     size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4323cc01a3f4SMark Kanda 
4324cc01a3f4SMark Kanda     /* Tally the total data size; read schema data */
4325cc01a3f4SMark Kanda     for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4326cc01a3f4SMark Kanda         pdesc = (void *)kvm_stats_desc + i * size_desc;
4327cc01a3f4SMark Kanda         stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
4328cc01a3f4SMark Kanda     }
4329cc01a3f4SMark Kanda 
4330cc01a3f4SMark Kanda     add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
4331cc01a3f4SMark Kanda }
4332cc01a3f4SMark Kanda 
/* Query one vCPU's KVM stats through the StatsArgs output/filter/errp. */
static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    Error *err = NULL;
    int stats_fd = cpu->kvm_vcpu_stats_fd;

    if (stats_fd == -1) {
        /*
         * Propagate (rather than set errp directly) so an error from an
         * earlier vCPU in the caller's loop is preserved.
         */
        error_setg_errno(&err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, err);
        return;
    }
    query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
                kvm_stats_args->names, stats_fd, cpu,
                kvm_stats_args->errp);
}
4347cc01a3f4SMark Kanda 
/* Query one vCPU's KVM stats schema through the StatsArgs output/errp. */
static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    Error *err = NULL;
    int stats_fd = cpu->kvm_vcpu_stats_fd;

    if (stats_fd == -1) {
        /* Propagate so an already-set errp is not overwritten. */
        error_setg_errno(&err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, err);
        return;
    }
    query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU,
                       stats_fd, kvm_stats_args->errp);
}
4361cc01a3f4SMark Kanda 
query_stats_cb(StatsResultList ** result,StatsTarget target,strList * names,strList * targets,Error ** errp)4362467ef823SPaolo Bonzini static void query_stats_cb(StatsResultList **result, StatsTarget target,
4363cf7405bcSPaolo Bonzini                            strList *names, strList *targets, Error **errp)
4364cc01a3f4SMark Kanda {
4365cc01a3f4SMark Kanda     KVMState *s = kvm_state;
4366cc01a3f4SMark Kanda     CPUState *cpu;
4367cc01a3f4SMark Kanda     int stats_fd;
4368cc01a3f4SMark Kanda 
4369cc01a3f4SMark Kanda     switch (target) {
4370cc01a3f4SMark Kanda     case STATS_TARGET_VM:
4371cc01a3f4SMark Kanda     {
4372cc01a3f4SMark Kanda         stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4373cc01a3f4SMark Kanda         if (stats_fd == -1) {
4374cc01a3f4SMark Kanda             error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4375cc01a3f4SMark Kanda             return;
4376cc01a3f4SMark Kanda         }
43773b6f4852SMarcelo Tosatti         query_stats(result, target, names, stats_fd, NULL, errp);
4378cc01a3f4SMark Kanda         close(stats_fd);
4379cc01a3f4SMark Kanda         break;
4380cc01a3f4SMark Kanda     }
4381cc01a3f4SMark Kanda     case STATS_TARGET_VCPU:
4382cc01a3f4SMark Kanda     {
4383cc01a3f4SMark Kanda         StatsArgs stats_args;
4384cc01a3f4SMark Kanda         stats_args.result.stats = result;
4385cf7405bcSPaolo Bonzini         stats_args.names = names;
4386cc01a3f4SMark Kanda         stats_args.errp = errp;
4387cc01a3f4SMark Kanda         CPU_FOREACH(cpu) {
4388467ef823SPaolo Bonzini             if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
4389467ef823SPaolo Bonzini                 continue;
4390467ef823SPaolo Bonzini             }
43913b6f4852SMarcelo Tosatti             query_stats_vcpu(cpu, &stats_args);
4392cc01a3f4SMark Kanda         }
4393cc01a3f4SMark Kanda         break;
4394cc01a3f4SMark Kanda     }
4395cc01a3f4SMark Kanda     default:
4396cc01a3f4SMark Kanda         break;
4397cc01a3f4SMark Kanda     }
4398cc01a3f4SMark Kanda }
4399cc01a3f4SMark Kanda 
query_stats_schemas_cb(StatsSchemaList ** result,Error ** errp)4400cc01a3f4SMark Kanda void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
4401cc01a3f4SMark Kanda {
4402cc01a3f4SMark Kanda     StatsArgs stats_args;
4403cc01a3f4SMark Kanda     KVMState *s = kvm_state;
4404cc01a3f4SMark Kanda     int stats_fd;
4405cc01a3f4SMark Kanda 
4406cc01a3f4SMark Kanda     stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4407cc01a3f4SMark Kanda     if (stats_fd == -1) {
4408cc01a3f4SMark Kanda         error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4409cc01a3f4SMark Kanda         return;
4410cc01a3f4SMark Kanda     }
4411cc01a3f4SMark Kanda     query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
4412cc01a3f4SMark Kanda     close(stats_fd);
4413cc01a3f4SMark Kanda 
4414a9197ad2SPaolo Bonzini     if (first_cpu) {
4415cc01a3f4SMark Kanda         stats_args.result.schema = result;
4416cc01a3f4SMark Kanda         stats_args.errp = errp;
44173b6f4852SMarcelo Tosatti         query_stats_schema_vcpu(first_cpu, &stats_args);
4418cc01a3f4SMark Kanda     }
4419a9197ad2SPaolo Bonzini }
44205c3131c3SPaolo Bonzini 
/* Record in the global KVM state that the guest's state is protected. */
void kvm_mark_guest_state_protected(void)
{
    kvm_state->guest_state_protected = true;
}
442515f7a80cSXiaoyao Li 
kvm_create_guest_memfd(uint64_t size,uint64_t flags,Error ** errp)442615f7a80cSXiaoyao Li int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
442715f7a80cSXiaoyao Li {
442815f7a80cSXiaoyao Li     int fd;
442915f7a80cSXiaoyao Li     struct kvm_create_guest_memfd guest_memfd = {
443015f7a80cSXiaoyao Li         .size = size,
443115f7a80cSXiaoyao Li         .flags = flags,
443215f7a80cSXiaoyao Li     };
443315f7a80cSXiaoyao Li 
443415f7a80cSXiaoyao Li     if (!kvm_guest_memfd_supported) {
443515f7a80cSXiaoyao Li         error_setg(errp, "KVM does not support guest_memfd");
443615f7a80cSXiaoyao Li         return -1;
443715f7a80cSXiaoyao Li     }
443815f7a80cSXiaoyao Li 
443915f7a80cSXiaoyao Li     fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
444015f7a80cSXiaoyao Li     if (fd < 0) {
444115f7a80cSXiaoyao Li         error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
444215f7a80cSXiaoyao Li         return -1;
444315f7a80cSXiaoyao Li     }
444415f7a80cSXiaoyao Li 
444515f7a80cSXiaoyao Li     return fd;
444615f7a80cSXiaoyao Li }
4447