/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <poll.h>

#include <linux/kvm.h>

#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "gdbstub/enums.h"
#include "sysemu/kvm_int.h"
#include "sysemu/runstate.h"
#include "sysemu/cpus.h"
#include "sysemu/accel-blocker.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/event_notifier.h"
#include "qemu/main-loop.h"
#include "trace.h"
#include "hw/irq.h"
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "sysemu/reset.h"
#include "qemu/guest-random.h"
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"
#include "sysemu/dirtylimit.h"
#include "qemu/range.h"

#include "hw/boards.h"
#include "sysemu/stats.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#ifdef PAGE_SIZE
#undef PAGE_SIZE
#endif
#define PAGE_SIZE qemu_real_host_page_size()

#ifndef KVM_GUESTDBG_BLOCKIRQ
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif

/* Default number of memslots to allocate when the VM starts */
#define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16
/* Default maximum number of memslots allowed if the kernel reported nothing */
#define KVM_MEMSLOTS_NR_MAX_DEFAULT 32

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_msi_use_devid;
static bool kvm_has_guest_debug;
static int kvm_sstep_flags;
static bool kvm_immediate_exit;
static uint64_t kvm_supported_memory_attributes;
static bool kvm_guest_memfd_supported;
static hwaddr kvm_max_slot_size = ~0;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_INFO(INTERNAL_ERROR_DATA),
    KVM_CAP_INFO(IOEVENTFD),
    KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
    KVM_CAP_LAST_INFO
};

static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);

struct KVMResampleFd {
    int gsi;
    EventNotifier *resample_event;
    QLIST_ENTRY(KVMResampleFd) node;
};
typedef struct KVMResampleFd KVMResampleFd;

/*
 * Only used with split irqchip where we need to do the resample fd
 * kick for the kernel from userspace.
 */
static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
    QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);

static QemuMutex kml_slots_lock;

#define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
#define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)

static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);

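/*
 * Drop the bookkeeping entry for @gsi from the resample fd list and
 * free it.  This is a no-op if the gsi was never registered.
 */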
static inline void kvm_resample_fd_remove(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            QLIST_REMOVE(rfd, node);
            g_free(rfd);
            break;
        }
    }
}

static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
{
    KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);

    rfd->gsi = gsi;
    rfd->resample_event = event;

    QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
}

void kvm_resample_fd_notify(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            event_notifier_set(rfd->resample_event);
            trace_kvm_resample_fd_notify(gsi);
            return;
        }
    }
}

/**
 * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
 *
 * @kml: The KVMMemoryListener* to grow the slots[] array
 * @nr_slots_new: The new size of slots[] array
 *
 * Returns: True if the array grows larger, false otherwise.
 */
static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
{
    unsigned int i, cur = kml->nr_slots_allocated;
    KVMSlot *slots;

    if (nr_slots_new > kvm_state->nr_slots_max) {
        nr_slots_new = kvm_state->nr_slots_max;
    }

    if (cur >= nr_slots_new) {
        /* Big enough, no need to grow, or we reached max */
        return false;
    }

    if (cur == 0) {
        slots = g_new0(KVMSlot, nr_slots_new);
    } else {
        assert(kml->slots);
        slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
        /*
         * g_renew() doesn't initialize extended buffers, however kvm
         * memslots require fields to be zero-initialized. E.g. pointers,
         * memory_size field, etc.
         */
        memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
    }

    for (i = cur; i < nr_slots_new; i++) {
        slots[i].slot = i;
    }

    kml->slots = slots;
    kml->nr_slots_allocated = nr_slots_new;
    trace_kvm_slots_grow(cur, nr_slots_new);

    return true;
}

static bool kvm_slots_double(KVMMemoryListener *kml)
{
    return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
}

unsigned int kvm_get_max_memslots(void)
{
    KVMState *s = KVM_STATE(current_accel());

    return s->nr_slots_max;
}

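/*
 * Return how many memslots are still free.  All address spaces share
 * the same per-VM limit, so this is the maximum allowed minus the
 * highest usage among the registered address spaces.
 */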
unsigned int kvm_get_free_memslots(void)
{
    unsigned int used_slots = 0;
    KVMState *s = kvm_state;
    int i;

    kvm_slots_lock();
    for (i = 0; i < s->nr_as; i++) {
        if (!s->as[i].ml) {
            continue;
        }
        used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
    }
    kvm_slots_unlock();

    return s->nr_slots_max - used_slots;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    unsigned int n;
    int i;

    for (i = 0; i < kml->nr_slots_allocated; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    /*
     * If no free slots, try to grow first by doubling.  Cache the old size
     * here to avoid another round of search: if the grow succeeded, it
     * means slots[] now must have the existing "n" slots occupied,
     * followed by one or more free slots starting from slots[n].
     */
    n = kml->nr_slots_allocated;
    if (kvm_slots_double(kml)) {
        return &kml->slots[n];
    }

    return NULL;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                         hwaddr start_addr,
                                         hwaddr size)
{
    int i;

    for (i = 0; i < kml->nr_slots_allocated; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (start_addr == mem->start_addr && size == mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary. */
    aligned = ROUND_UP(section->offset_within_address_space,
                       qemu_real_host_page_size());
    delta = aligned - section->offset_within_address_space;
    *start = aligned;
    if (delta > size) {
        return 0;
    }

    return (size - delta) & qemu_real_host_page_mask();
}

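/*
 * Translate a host virtual address back to a guest physical address by
 * scanning the registered memslots.  Returns 1 and fills in *phys_addr
 * if a slot containing @ram is found, 0 otherwise.
 */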
int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int i, ret = 0;

    kvm_slots_lock();
    for (i = 0; i < kml->nr_slots_allocated; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            ret = 1;
            break;
        }
    }
    kvm_slots_unlock();

    return ret;
}

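/*
 * Program one memslot into KVM.  The memslot id is encoded together
 * with the address space id in the high bits:
 *
 *     mem.slot = slot->slot | (kml->as_id << 16);
 *
 * When the kernel supports guest_memfd we always go through
 * KVM_SET_USER_MEMORY_REGION2 so the guest_memfd fields are passed
 * along; otherwise the legacy KVM_SET_USER_MEMORY_REGION ioctl is used.
 */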
static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region2 mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;
    mem.guest_memfd = slot->guest_memfd;
    mem.guest_memfd_offset = slot->guest_memfd_offset;

    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;

        if (kvm_guest_memfd_supported) {
            ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
        } else {
            ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
        }
        if (ret < 0) {
            goto err;
        }
    }
    mem.memory_size = slot->memory_size;
    if (kvm_guest_memfd_supported) {
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
    } else {
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    }
    slot->old_flags = mem.flags;
err:
    trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
                              mem.guest_phys_addr, mem.memory_size,
                              mem.userspace_addr, mem.guest_memfd,
                              mem.guest_memfd_offset, ret);
    if (ret < 0) {
        if (kvm_guest_memfd_supported) {
            error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d,"
                         " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
                         " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
                         " guest_memfd_offset=0x%" PRIx64 ": %s",
                         __func__, mem.slot, slot->start_addr,
                         (uint64_t)mem.memory_size, mem.flags,
                         mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
                         strerror(errno));
        } else {
            error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
                         " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
                         __func__, mem.slot, slot->start_addr,
                         (uint64_t)mem.memory_size, strerror(errno));
        }
    }
    return ret;
}

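/*
 * Park a vCPU: remember its KVM fd (keyed by the arch vcpu id) on the
 * kvm_parked_vcpus list so that a later vCPU hotplug with the same id
 * can reuse the fd instead of creating a new one.
 */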
void kvm_park_vcpu(CPUState *cpu)
{
    struct KVMParkedVcpu *vcpu;

    trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    vcpu = g_malloc0(sizeof(*vcpu));
    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
    vcpu->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
}

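/*
 * Look up a parked vCPU by id.  On a match the entry is unlinked and
 * freed and its KVM fd returned; -ENOENT means no such parked vCPU.
 */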
int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *cpu;
    int kvm_fd = -ENOENT;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        if (cpu->vcpu_id == vcpu_id) {
            QLIST_REMOVE(cpu, node);
            kvm_fd = cpu->kvm_fd;
            g_free(cpu);
            break;
        }
    }

    trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");

    return kvm_fd;
}

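/*
 * Parked vCPUs have no CPUState and are invisible to the normal reset
 * path, so let the architecture code reset their in-kernel state
 * directly through the saved fd.
 */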
static void kvm_reset_parked_vcpus(void *param)
{
    KVMState *s = param;
    struct KVMParkedVcpu *cpu;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        kvm_arch_reset_parked_vcpu(cpu->vcpu_id, cpu->kvm_fd);
    }
}

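/*
 * Obtain a KVM vCPU fd for @cpu, preferring a previously parked fd
 * with a matching id over a fresh KVM_CREATE_VCPU call, then set up
 * the per-CPU KVM bookkeeping fields.
 */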
int kvm_create_vcpu(CPUState *cpu)
{
    unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
    KVMState *s = kvm_state;
    int kvm_fd;

    /* check if the KVM vCPU already exists but is parked */
    kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
    if (kvm_fd < 0) {
        /* vCPU not parked: create a new KVM vCPU */
        kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
        if (kvm_fd < 0) {
            error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
            return kvm_fd;
        }
    }

    cpu->kvm_fd = kvm_fd;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;
    cpu->dirty_pages = 0;
    cpu->throttle_us_per_full = 0;

    trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);

    return 0;
}

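/* Create the KVM vCPU and immediately park it for later reuse. */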
int kvm_create_and_park_vcpu(CPUState *cpu)
{
    int ret = 0;

    ret = kvm_create_vcpu(cpu);
    if (!ret) {
        kvm_park_vcpu(cpu);
    }

    return ret;
}

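/*
 * Tear down the userspace side of a vCPU: arch state, the kvm_run
 * mapping and, if present, the dirty ring mapping.  The KVM fd itself
 * is parked rather than closed so it can be reused on re-plug.
 */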
static int do_kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    int mmap_size;
    int ret = 0;

    trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_arch_destroy_vcpu(cpu);
    if (ret < 0) {
        goto err;
    }

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        trace_kvm_failed_get_vcpu_mmap_size();
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    if (cpu->kvm_dirty_gfns) {
        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
        if (ret < 0) {
            goto err;
        }
    }

    kvm_park_vcpu(cpu);
err:
    return ret;
}

void kvm_destroy_vcpu(CPUState *cpu)
{
    if (do_kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
    KVMState *s = kvm_state;
    int mmap_size;
    int ret;

    trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_create_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        error_setg_errno(errp, -mmap_size,
                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
        goto err;
    }

    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        ret = -errno;
        error_setg_errno(errp, ret,
                         "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    if (s->kvm_dirty_ring_size) {
        /* Use MAP_SHARED to share pages with the kernel */
        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                   cpu->kvm_fd,
                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
            ret = -errno;
            goto err;
        }
    }

    ret = kvm_arch_init_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
    }
    cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);

err:
    return ret;
}

/*
 * dirty pages logging control
 */

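/* Derive the KVM_MEM_* flags to be used for the memslot backing @mr. */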
static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    if (memory_region_has_guest_memfd(mr)) {
        assert(kvm_guest_memfd_supported);
        flags |= KVM_MEM_GUEST_MEMFD;
    }
    return flags;
}

/* Called with KVMMemoryListener.slots_lock held */
static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    kvm_slot_init_dirty_bitmap(mem);
    return kvm_set_user_memory_region(kml, mem, false);
}

static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size, slot_size;
    KVMSlot *mem;
    int ret = 0;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    kvm_slots_lock();

    while (size && !ret) {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            goto out;
        }

        ret = kvm_slot_update_flags(kml, mem, section->mr);
        start_addr += slot_size;
        size -= slot_size;
    }

out:
    kvm_slots_unlock();
    return ret;
}

static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

/* get kvm's dirty pages bitmap and update qemu's */
static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
{
    ram_addr_t start = slot->ram_start_offset;
    ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();

    cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
}

static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
{
    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/* Allocate the dirty bitmap for a slot */
static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
{
    if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
        return;
    }

    /*
     * XXX bad kernel interface alert
     * For dirty bitmap, kernel allocates array of size aligned to
     * bits-per-long.  But for case when the kernel is 64bits and
     * the userspace is 32bits, userspace can't align to the same
     * bits-per-long, since sizeof(long) is different between kernel
     * and user space.  This way, userspace will provide buffer which
     * may be 4 bytes less than the kernel will use, resulting in
     * userspace memory corruption (which is not detectable by valgrind
     * too, in most cases).
     * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
     * a hope that sizeof(long) won't become >8 any time soon.
     *
     * Note: the granule of kvm dirty log is qemu_real_host_page_size.
     * And mem->memory_size is aligned to it (otherwise this mem can't
     * be registered to KVM).
     */
    hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
                               /*HOST_LONG_BITS*/ 64) / 8;
    mem->dirty_bmap = g_malloc0(bitmap_size);
    mem->dirty_bmap_size = bitmap_size;
}

/*
 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
 * succeeded, false otherwise
 */
static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
{
    struct kvm_dirty_log d = {};
    int ret;

    d.dirty_bitmap = slot->dirty_bmap;
    d.slot = slot->slot | (slot->as_id << 16);
    ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);

    if (ret == -ENOENT) {
        /* kernel does not have dirty bitmap in this slot */
        ret = 0;
    }
    if (ret) {
        error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
                          __func__, ret);
    }
    return ret == 0;
}

/* Should be with all slots_lock held for the address spaces. */
static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
                                     uint32_t slot_id, uint64_t offset)
{
    KVMMemoryListener *kml;
    KVMSlot *mem;

    if (as_id >= s->nr_as) {
        return;
    }

    kml = s->as[as_id].ml;
    mem = &kml->slots[slot_id];

    if (!mem->memory_size || offset >=
        (mem->memory_size / qemu_real_host_page_size())) {
        return;
    }

    set_bit(offset, mem->dirty_bmap);
}

static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
    /*
     * Read the flags before the value.  Pairs with barrier in
     * KVM's kvm_dirty_ring_push() function.
     */
    return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
}

static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
    /*
     * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
     * sees the full content of the ring:
     *
     * CPU0                     CPU1                         CPU2
     * ------------------------------------------------------------------------------
     *                                                       fill gfn0
     *                                                       store-rel flags for gfn0
     * load-acq flags for gfn0
     * store-rel RESET for gfn0
     *                          ioctl(RESET_RINGS)
     *                          load-acq flags for gfn0
     *                          check if flags have RESET
     *
     * The synchronization goes from CPU2 to CPU0 to CPU1.
     */
    qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
}

/*
 * Should be with all slots_lock held for the address spaces.  It returns
 * the number of dirty pages we've collected on this dirty ring.
 */
static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
{
    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
    uint32_t ring_size = s->kvm_dirty_ring_size;
    uint32_t count = 0, fetch = cpu->kvm_fetch_index;

    /*
     * It's possible that we race with vcpu creation code where the vcpu is
     * put onto the vcpus list but has not yet initialized the dirty ring
     * structures.  If so, skip it.
     */
    if (!cpu->created) {
        return 0;
    }

    assert(dirty_gfns && ring_size);
    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);

    while (true) {
        cur = &dirty_gfns[fetch % ring_size];
        if (!dirty_gfn_is_dirtied(cur)) {
            break;
        }
        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
                                 cur->offset);
        dirty_gfn_set_collected(cur);
        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
        fetch++;
        count++;
    }
    cpu->kvm_fetch_index = fetch;
    cpu->dirty_pages += count;

    return count;
}

/* Must be with slots_lock held */
static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState *cpu)
{
    int ret;
    uint64_t total = 0;
    int64_t stamp;

    stamp = get_clock();

    if (cpu) {
        total = kvm_dirty_ring_reap_one(s, cpu);
    } else {
        CPU_FOREACH(cpu) {
            total += kvm_dirty_ring_reap_one(s, cpu);
        }
    }

    if (total) {
        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
        assert(ret == total);
    }

    stamp = get_clock() - stamp;

    if (total) {
        trace_kvm_dirty_ring_reap(total, stamp / 1000);
    }

    return total;
}

/*
 * For simplicity, we must currently hold the BQL before calling this.
 * We can consider dropping the BQL once we're clear about all the race
 * conditions.
 */
static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
{
    uint64_t total;

    /*
     * We need to lock all kvm slots for all address spaces here,
     * because:
     *
     * (1) We need to mark dirty for dirty bitmaps in multiple slots
     *     and for tons of pages, so it's better to take the lock here
     *     once rather than once per page.  And more importantly,
     *
     * (2) We must _NOT_ publish dirty bits to the other threads
     *     (e.g., the migration thread) via the kvm memory slot dirty
     *     bitmaps before correctly re-protecting those dirtied pages.
     *     Otherwise we risk data corruption if the page data is read
     *     in the other thread before we do the reset below.
     */
    kvm_slots_lock();
    total = kvm_dirty_ring_reap_locked(s, cpu);
    kvm_slots_unlock();

    return total;
}

static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
{
    /* No need to do anything */
}

/*
 * Kick all vcpus out in a synchronized way.  When this returns, we
 * guarantee that every vcpu has been kicked and has at least returned
 * to userspace once.
 */
static void kvm_cpu_synchronize_kick_all(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
    }
}

/*
 * Flush all the existing dirty pages to the KVM slot buffers.  When
 * this call returns, we guarantee that all the touched dirty pages
 * before calling this function have been put into the per-kvmslot
 * dirty bitmap.
 *
 * This function must be called with BQL held.
 */
static void kvm_dirty_ring_flush(void)
{
    trace_kvm_dirty_ring_flush(0);
    /*
     * The function needs to be serialized.  Since it should always be
     * called with BQL held, serialization is guaranteed.  However,
     * let's be sure of it.
     */
    assert(bql_locked());
    /*
     * First make sure to flush the hardware buffers by kicking all
     * vcpus out in a synchronous way.
     */
    kvm_cpu_synchronize_kick_all();
    kvm_dirty_ring_reap(kvm_state, NULL);
    trace_kvm_dirty_ring_flush(1);
}
968b4420f19SPeter Xu
96992229a57SYang Zhong /**
9704a12a11aSPeter Xu * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
97192229a57SYang Zhong *
9724a12a11aSPeter Xu * This function will first try to fetch dirty bitmap from the kernel,
9734a12a11aSPeter Xu * and then updates qemu's dirty bitmap.
9744a12a11aSPeter Xu *
97536adac49SPeter Xu * NOTE: caller must be with kml->slots_lock held.
97636adac49SPeter Xu *
9774a12a11aSPeter Xu * @kml: the KVM memory listener object
9784a12a11aSPeter Xu * @section: the memory section to sync the dirty bitmap with
97992229a57SYang Zhong */
kvm_physical_sync_dirty_bitmap(KVMMemoryListener * kml,MemoryRegionSection * section)980e65e5f50SPeter Xu static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
98192229a57SYang Zhong MemoryRegionSection *section)
98292229a57SYang Zhong {
98392229a57SYang Zhong KVMState *s = kvm_state;
98492229a57SYang Zhong KVMSlot *mem;
98567548f09SDavid Hildenbrand hwaddr start_addr, size;
9862c20b27eSPeter Xu hwaddr slot_size;
98792229a57SYang Zhong
98867548f09SDavid Hildenbrand size = kvm_align_section(section, &start_addr);
989023ae9a8SIgor Mammedov while (size) {
990023ae9a8SIgor Mammedov slot_size = MIN(kvm_max_slot_size, size);
991023ae9a8SIgor Mammedov mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
99267548f09SDavid Hildenbrand if (!mem) {
993e377e87cSDavid Hildenbrand /* We don't have a slot if we want to trap every access. */
994e65e5f50SPeter Xu return;
99592229a57SYang Zhong }
996e65e5f50SPeter Xu if (kvm_slot_get_dirty_log(s, mem)) {
9972c20b27eSPeter Xu kvm_slot_sync_dirty_pages(mem);
99838e0b790SThomas Huth }
999023ae9a8SIgor Mammedov start_addr += slot_size;
1000023ae9a8SIgor Mammedov size -= slot_size;
100167548f09SDavid Hildenbrand }
100292229a57SYang Zhong }

/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
#define KVM_CLEAR_LOG_SHIFT  6
#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)

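/*
 * Clear the dirty bits for the range [start, start + size) of @mem,
 * where @start and @size are byte offsets relative to the start of
 * the slot.  The range is widened as needed to meet the 64-page
 * alignment KVM_CLEAR_DIRTY_LOG requires; e.g. a request covering a
 * single page is expanded to its surrounding 64-page window, using a
 * temporary bitmap so that dirty bits outside the caller's range are
 * not cleared in the kernel.
 */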
static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
                                  uint64_t size)
{
    KVMState *s = kvm_state;
    uint64_t end, bmap_start, start_delta, bmap_npages;
    struct kvm_clear_dirty_log d;
    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
    int ret;

    /*
     * We need to extend either the start or the size or both to
     * satisfy the KVM interface requirement.  Firstly, do the start
     * page alignment on 64 host pages.
     */
    bmap_start = start & KVM_CLEAR_LOG_MASK;
    start_delta = start - bmap_start;
    bmap_start /= psize;

    /*
     * The kernel interface has a restriction on the size too, that either:
     *
     * (1) the size is 64 host pages aligned (just like the start), or
     * (2) the size fills up until the end of the KVM memslot.
     */
    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
        << KVM_CLEAR_LOG_SHIFT;
    end = mem->memory_size / psize;
    if (bmap_npages > end - bmap_start) {
        bmap_npages = end - bmap_start;
    }
    start_delta /= psize;

    /*
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits otherwise we might
     * accidentally clear some set bits which are not yet synced from
     * the kernel into QEMU's bitmap, then we'll lose track of the
     * guest modifications upon those pages (which can directly lead
     * to guest data loss or panic after migration).
     *
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                             of memslot
     *
     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
     */

    assert(bmap_start % BITS_PER_LONG == 0);
    /* We should never do log_clear before log_sync */
    assert(mem->dirty_bmap);
    if (start_delta || bmap_npages - size / psize) {
        /* Slow path - we need to manipulate a temp bitmap */
        bmap_clear = bitmap_new(bmap_npages);
        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
                                    bmap_start, start_delta + size / psize);
        /*
         * We need to fill the holes at start because that was not
         * specified by the caller and we extended the bitmap only for
         * 64 pages alignment.
         */
        bitmap_clear(bmap_clear, 0, start_delta);
        d.dirty_bitmap = bmap_clear;
    } else {
        /*
         * Fast path - both start and size align well with BITS_PER_LONG
         * (or the end of memory slot)
         */
        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
    }

    d.first_page = bmap_start;
    /* It should never overflow.  If it happens, say something */
    assert(bmap_npages <= UINT32_MAX);
    d.num_pages = bmap_npages;
    d.slot = mem->slot | (as_id << 16);

    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
    if (ret < 0 && ret != -ENOENT) {
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
    } else {
        ret = 0;
        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
    }

    /*
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap as well for the memslot, then if another user
     * clears the same region we know we shouldn't clear it again on
     * the remote otherwise it's data loss as well.
     */
    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
                 size / psize);
    /* This handles the NULL case well */
    g_free(bmap_clear);
    return ret;
}
11154222147dSPaolo Bonzini
11164222147dSPaolo Bonzini
11174222147dSPaolo Bonzini /**
11184222147dSPaolo Bonzini * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
11194222147dSPaolo Bonzini *
11204222147dSPaolo Bonzini * NOTE: this will be a no-op if we haven't enabled manual dirty log
11214222147dSPaolo Bonzini * protection in the host kernel because in that case this operation
11224222147dSPaolo Bonzini * will be done within log_sync().
11234222147dSPaolo Bonzini *
11244222147dSPaolo Bonzini * @kml: the kvm memory listener
11254222147dSPaolo Bonzini * @section: the memory range to clear dirty bitmap
11264222147dSPaolo Bonzini */
kvm_physical_log_clear(KVMMemoryListener * kml,MemoryRegionSection * section)11274222147dSPaolo Bonzini static int kvm_physical_log_clear(KVMMemoryListener *kml,
11284222147dSPaolo Bonzini MemoryRegionSection *section)
11294222147dSPaolo Bonzini {
11304222147dSPaolo Bonzini KVMState *s = kvm_state;
113184516e5bSPaolo Bonzini uint64_t start, size, offset, count;
113284516e5bSPaolo Bonzini KVMSlot *mem;
113387287ac0SAlex Bennée int ret = 0, i;
11344222147dSPaolo Bonzini
11354222147dSPaolo Bonzini if (!s->manual_dirty_log_protect) {
11364222147dSPaolo Bonzini /* No need to do explicit clear */
113787287ac0SAlex Bennée return ret;
11384222147dSPaolo Bonzini }
11394222147dSPaolo Bonzini
11404222147dSPaolo Bonzini start = section->offset_within_address_space;
11414222147dSPaolo Bonzini size = int128_get64(section->size);
11424222147dSPaolo Bonzini
11434222147dSPaolo Bonzini if (!size) {
11444222147dSPaolo Bonzini /* Nothing more we can do... */
114587287ac0SAlex Bennée return ret;
11464222147dSPaolo Bonzini }
11474222147dSPaolo Bonzini
1148a2f77862SPeter Xu kvm_slots_lock();
11494222147dSPaolo Bonzini
11505504a812SPeter Xu for (i = 0; i < kml->nr_slots_allocated; i++) {
11514222147dSPaolo Bonzini mem = &kml->slots[i];
115284516e5bSPaolo Bonzini /* Discard slots that are empty or do not overlap the section */
115384516e5bSPaolo Bonzini if (!mem->memory_size ||
115484516e5bSPaolo Bonzini mem->start_addr > start + size - 1 ||
115584516e5bSPaolo Bonzini start > mem->start_addr + mem->memory_size - 1) {
115684516e5bSPaolo Bonzini continue;
115784516e5bSPaolo Bonzini }
115884516e5bSPaolo Bonzini
115984516e5bSPaolo Bonzini if (start >= mem->start_addr) {
116084516e5bSPaolo Bonzini /* The slot starts before the section or at the same address. */
116184516e5bSPaolo Bonzini offset = start - mem->start_addr;
116284516e5bSPaolo Bonzini count = MIN(mem->memory_size - offset, size);
116384516e5bSPaolo Bonzini } else {
116484516e5bSPaolo Bonzini /* The slot starts after the section. */
116584516e5bSPaolo Bonzini offset = 0;
116684516e5bSPaolo Bonzini count = MIN(mem->memory_size, size - (mem->start_addr - start));
116784516e5bSPaolo Bonzini }
116884516e5bSPaolo Bonzini ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
116984516e5bSPaolo Bonzini if (ret < 0) {
11704222147dSPaolo Bonzini break;
11714222147dSPaolo Bonzini }
11724222147dSPaolo Bonzini }
11734222147dSPaolo Bonzini
1174a2f77862SPeter Xu kvm_slots_unlock();
1175ff4aa114SPeter Xu
1176ff4aa114SPeter Xu return ret;
1177ff4aa114SPeter Xu }
1178ff4aa114SPeter Xu
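/*
 * Worked example for the splitting logic above (all numbers are
 * hypothetical): assume two memslots
 *
 *     slot A: start_addr = 0x00000000, memory_size = 0x40000000
 *     slot B: start_addr = 0x40000000, memory_size = 0x40000000
 *
 * and a clear request for [0x3ff00000, 0x3ff00000 + 0x200000).
 * The loop above calls kvm_log_clear_one_slot() twice:
 *
 *     slot A: offset = 0x3ff00000, count = 0x100000
 *     slot B: offset = 0x0,        count = 0x100000
 */
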
117992229a57SYang Zhong static void kvm_coalesce_mmio_region(MemoryListener *listener,
118092229a57SYang Zhong MemoryRegionSection *section,
118192229a57SYang Zhong hwaddr start, hwaddr size)
118292229a57SYang Zhong {
118392229a57SYang Zhong KVMState *s = kvm_state;
118492229a57SYang Zhong
118592229a57SYang Zhong if (s->coalesced_mmio) {
118692229a57SYang Zhong struct kvm_coalesced_mmio_zone zone;
118792229a57SYang Zhong
118892229a57SYang Zhong zone.addr = start;
118992229a57SYang Zhong zone.size = size;
119092229a57SYang Zhong zone.pad = 0;
119192229a57SYang Zhong
119292229a57SYang Zhong (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
119392229a57SYang Zhong }
119492229a57SYang Zhong }
119592229a57SYang Zhong
119692229a57SYang Zhong static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
119792229a57SYang Zhong MemoryRegionSection *section,
119892229a57SYang Zhong hwaddr start, hwaddr size)
119992229a57SYang Zhong {
120092229a57SYang Zhong KVMState *s = kvm_state;
120192229a57SYang Zhong
120292229a57SYang Zhong if (s->coalesced_mmio) {
120392229a57SYang Zhong struct kvm_coalesced_mmio_zone zone;
120492229a57SYang Zhong
120592229a57SYang Zhong zone.addr = start;
120692229a57SYang Zhong zone.size = size;
120792229a57SYang Zhong zone.pad = 0;
120892229a57SYang Zhong
120992229a57SYang Zhong (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
121092229a57SYang Zhong }
121192229a57SYang Zhong }
121292229a57SYang Zhong
1213e6d34aeeSPeng Hao static void kvm_coalesce_pio_add(MemoryListener *listener,
1214e6d34aeeSPeng Hao MemoryRegionSection *section,
1215e6d34aeeSPeng Hao hwaddr start, hwaddr size)
1216e6d34aeeSPeng Hao {
1217e6d34aeeSPeng Hao KVMState *s = kvm_state;
1218e6d34aeeSPeng Hao
1219e6d34aeeSPeng Hao if (s->coalesced_pio) {
1220e6d34aeeSPeng Hao struct kvm_coalesced_mmio_zone zone;
1221e6d34aeeSPeng Hao
1222e6d34aeeSPeng Hao zone.addr = start;
1223e6d34aeeSPeng Hao zone.size = size;
1224e6d34aeeSPeng Hao zone.pio = 1;
1225e6d34aeeSPeng Hao
1226e6d34aeeSPeng Hao (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1227e6d34aeeSPeng Hao }
1228e6d34aeeSPeng Hao }
1229e6d34aeeSPeng Hao
1230e6d34aeeSPeng Hao static void kvm_coalesce_pio_del(MemoryListener *listener,
1231e6d34aeeSPeng Hao MemoryRegionSection *section,
1232e6d34aeeSPeng Hao hwaddr start, hwaddr size)
1233e6d34aeeSPeng Hao {
1234e6d34aeeSPeng Hao KVMState *s = kvm_state;
1235e6d34aeeSPeng Hao
1236e6d34aeeSPeng Hao if (s->coalesced_pio) {
1237e6d34aeeSPeng Hao struct kvm_coalesced_mmio_zone zone;
1238e6d34aeeSPeng Hao
1239e6d34aeeSPeng Hao zone.addr = start;
1240e6d34aeeSPeng Hao zone.size = size;
1241e6d34aeeSPeng Hao zone.pio = 1;
1242e6d34aeeSPeng Hao
1243e6d34aeeSPeng Hao (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1244e6d34aeeSPeng Hao }
1245e6d34aeeSPeng Hao }
1246e6d34aeeSPeng Hao
124792229a57SYang Zhong int kvm_check_extension(KVMState *s, unsigned int extension)
124892229a57SYang Zhong {
124992229a57SYang Zhong int ret;
125092229a57SYang Zhong
125192229a57SYang Zhong ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
125292229a57SYang Zhong if (ret < 0) {
125392229a57SYang Zhong ret = 0;
125492229a57SYang Zhong }
125592229a57SYang Zhong
125692229a57SYang Zhong return ret;
125792229a57SYang Zhong }
125892229a57SYang Zhong
125992229a57SYang Zhong int kvm_vm_check_extension(KVMState *s, unsigned int extension)
126092229a57SYang Zhong {
126192229a57SYang Zhong int ret;
126292229a57SYang Zhong
126392229a57SYang Zhong ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
126492229a57SYang Zhong if (ret < 0) {
126592229a57SYang Zhong /* VM wide version not implemented, use global one instead */
126692229a57SYang Zhong ret = kvm_check_extension(s, extension);
126792229a57SYang Zhong }
126892229a57SYang Zhong
126992229a57SYang Zhong return ret;
127092229a57SYang Zhong }
127192229a57SYang Zhong
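/*
 * Typical caller pattern (sketch): both helpers return 0 when a
 * capability is absent and a positive value when it is present, so
 * the result can be probed once at setup time and used as a boolean:
 *
 *     if (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM)) {
 *         ... enable the read-only memslot path ...
 *     }
 */
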
127206152b89SWilliam Roche /*
127306152b89SWilliam Roche * We track the poisoned pages to be able to:
127406152b89SWilliam Roche * - replace them on VM reset
127506152b89SWilliam Roche * - block a migration for a VM with a poisoned page
127606152b89SWilliam Roche */
12776b552b9bSDongjiu Geng typedef struct HWPoisonPage {
12786b552b9bSDongjiu Geng ram_addr_t ram_addr;
12796b552b9bSDongjiu Geng QLIST_ENTRY(HWPoisonPage) list;
12806b552b9bSDongjiu Geng } HWPoisonPage;
12816b552b9bSDongjiu Geng
12826b552b9bSDongjiu Geng static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
12836b552b9bSDongjiu Geng QLIST_HEAD_INITIALIZER(hwpoison_page_list);
12846b552b9bSDongjiu Geng
12856b552b9bSDongjiu Geng static void kvm_unpoison_all(void *param)
12866b552b9bSDongjiu Geng {
12876b552b9bSDongjiu Geng HWPoisonPage *page, *next_page;
12886b552b9bSDongjiu Geng
12896b552b9bSDongjiu Geng QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
12906b552b9bSDongjiu Geng QLIST_REMOVE(page, list);
12916b552b9bSDongjiu Geng qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
12926b552b9bSDongjiu Geng g_free(page);
12936b552b9bSDongjiu Geng }
12946b552b9bSDongjiu Geng }
12956b552b9bSDongjiu Geng
12966b552b9bSDongjiu Geng void kvm_hwpoison_page_add(ram_addr_t ram_addr)
12976b552b9bSDongjiu Geng {
12986b552b9bSDongjiu Geng HWPoisonPage *page;
12996b552b9bSDongjiu Geng
13006b552b9bSDongjiu Geng QLIST_FOREACH(page, &hwpoison_page_list, list) {
13016b552b9bSDongjiu Geng if (page->ram_addr == ram_addr) {
13026b552b9bSDongjiu Geng return;
13036b552b9bSDongjiu Geng }
13046b552b9bSDongjiu Geng }
13056b552b9bSDongjiu Geng page = g_new(HWPoisonPage, 1);
13066b552b9bSDongjiu Geng page->ram_addr = ram_addr;
13076b552b9bSDongjiu Geng QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
13086b552b9bSDongjiu Geng }
13096b552b9bSDongjiu Geng
131006152b89SWilliam Roche bool kvm_hwpoisoned_mem(void)
131106152b89SWilliam Roche {
131206152b89SWilliam Roche return !QLIST_EMPTY(&hwpoison_page_list);
131306152b89SWilliam Roche }
131406152b89SWilliam Roche
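/*
 * Sketch of the expected call flow (the real callers live in the
 * per-architecture SIGBUS/MCE handling; variable names here are
 * illustrative only):
 *
 *     ram_addr_t ram_addr = qemu_ram_addr_from_host(host_addr);
 *     kvm_hwpoison_page_add(ram_addr);
 *
 * On VM reset, kvm_unpoison_all() walks the list and remaps each
 * recorded page so the guest restarts with fresh memory.
 */
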
131592229a57SYang Zhong static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
131692229a57SYang Zhong {
1317ee3eb3a7SMarc-André Lureau #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
1318e03b5686SMarc-André Lureau /* The kernel expects ioeventfd values in host endianness,
131992229a57SYang Zhong * but the memory core hands them in target endianness.
132092229a57SYang Zhong * For example, PPC is always treated as big-endian even if running
132192229a57SYang Zhong * on KVM and on PPC64LE. Correct here.
132292229a57SYang Zhong */
132392229a57SYang Zhong switch (size) {
132492229a57SYang Zhong case 2:
132592229a57SYang Zhong val = bswap16(val);
132692229a57SYang Zhong break;
132792229a57SYang Zhong case 4:
132892229a57SYang Zhong val = bswap32(val);
132992229a57SYang Zhong break;
133092229a57SYang Zhong }
133192229a57SYang Zhong #endif
133292229a57SYang Zhong return val;
133392229a57SYang Zhong }
133492229a57SYang Zhong
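/*
 * Example (hypothetical values): for a big-endian target running on
 * a little-endian host, a 2-byte datamatch value of 0x1234 is
 * swapped to 0x3412 here, so the kernel compares against the bytes
 * the guest actually stores.
 */
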
133592229a57SYang Zhong static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
133692229a57SYang Zhong bool assign, uint32_t size, bool datamatch)
133792229a57SYang Zhong {
133892229a57SYang Zhong int ret;
133992229a57SYang Zhong struct kvm_ioeventfd iofd = {
134092229a57SYang Zhong .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
134192229a57SYang Zhong .addr = addr,
134292229a57SYang Zhong .len = size,
134392229a57SYang Zhong .flags = 0,
134492229a57SYang Zhong .fd = fd,
134592229a57SYang Zhong };
134692229a57SYang Zhong
1347876d16cdSDr. David Alan Gilbert trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1348876d16cdSDr. David Alan Gilbert datamatch);
134992229a57SYang Zhong if (!kvm_enabled()) {
135092229a57SYang Zhong return -ENOSYS;
135192229a57SYang Zhong }
135292229a57SYang Zhong
135392229a57SYang Zhong if (datamatch) {
135492229a57SYang Zhong iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
135592229a57SYang Zhong }
135692229a57SYang Zhong if (!assign) {
135792229a57SYang Zhong iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
135892229a57SYang Zhong }
135992229a57SYang Zhong
136092229a57SYang Zhong ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
136192229a57SYang Zhong
136292229a57SYang Zhong if (ret < 0) {
136392229a57SYang Zhong return -errno;
136492229a57SYang Zhong }
136592229a57SYang Zhong
136692229a57SYang Zhong return 0;
136792229a57SYang Zhong }
136892229a57SYang Zhong
136992229a57SYang Zhong static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
137092229a57SYang Zhong bool assign, uint32_t size, bool datamatch)
137192229a57SYang Zhong {
137292229a57SYang Zhong struct kvm_ioeventfd kick = {
137392229a57SYang Zhong .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
137492229a57SYang Zhong .addr = addr,
137592229a57SYang Zhong .flags = KVM_IOEVENTFD_FLAG_PIO,
137692229a57SYang Zhong .len = size,
137792229a57SYang Zhong .fd = fd,
137892229a57SYang Zhong };
137992229a57SYang Zhong int r;
1380876d16cdSDr. David Alan Gilbert trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
138192229a57SYang Zhong if (!kvm_enabled()) {
138292229a57SYang Zhong return -ENOSYS;
138392229a57SYang Zhong }
138492229a57SYang Zhong if (datamatch) {
138592229a57SYang Zhong kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
138692229a57SYang Zhong }
138792229a57SYang Zhong if (!assign) {
138892229a57SYang Zhong kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
138992229a57SYang Zhong }
139092229a57SYang Zhong r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
139192229a57SYang Zhong if (r < 0) {
139292229a57SYang Zhong return r;
139392229a57SYang Zhong }
139492229a57SYang Zhong return 0;
139592229a57SYang Zhong }
139692229a57SYang Zhong
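/*
 * Usage sketch for the two helpers above (address and value are made
 * up): register an eventfd that fires when the guest writes the
 * 4-byte value 0x1 to an MMIO doorbell, and deassign it again later:
 *
 *     int efd = eventfd(0, EFD_CLOEXEC);
 *     kvm_set_ioeventfd_mmio(efd, 0xfe000000, 0x1, true, 4, true);
 *     ...
 *     kvm_set_ioeventfd_mmio(efd, 0xfe000000, 0x1, false, 4, true);
 */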
139792229a57SYang Zhong
139892229a57SYang Zhong static const KVMCapabilityInfo *
139992229a57SYang Zhong kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
140092229a57SYang Zhong {
140192229a57SYang Zhong while (list->name) {
140292229a57SYang Zhong if (!kvm_check_extension(s, list->value)) {
140392229a57SYang Zhong return list;
140492229a57SYang Zhong }
140592229a57SYang Zhong list++;
140692229a57SYang Zhong }
140792229a57SYang Zhong return NULL;
140892229a57SYang Zhong }
140992229a57SYang Zhong
1410023ae9a8SIgor Mammedov void kvm_set_max_memslot_size(hwaddr max_slot_size)
1411023ae9a8SIgor Mammedov {
1412023ae9a8SIgor Mammedov g_assert(
14138e3b0cbbSMarc-André Lureau ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1414023ae9a8SIgor Mammedov );
1415023ae9a8SIgor Mammedov kvm_max_slot_size = max_slot_size;
1416023ae9a8SIgor Mammedov }
1417023ae9a8SIgor Mammedov
14180811baedSXiaoyao Li static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
14190811baedSXiaoyao Li {
14200811baedSXiaoyao Li struct kvm_memory_attributes attrs;
14210811baedSXiaoyao Li int r;
14220811baedSXiaoyao Li
14230811baedSXiaoyao Li assert((attr & kvm_supported_memory_attributes) == attr);
14240811baedSXiaoyao Li attrs.attributes = attr;
14250811baedSXiaoyao Li attrs.address = start;
14260811baedSXiaoyao Li attrs.size = size;
14270811baedSXiaoyao Li attrs.flags = 0;
14280811baedSXiaoyao Li
14290811baedSXiaoyao Li r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
14300811baedSXiaoyao Li if (r) {
14310811baedSXiaoyao Li error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
14320811baedSXiaoyao Li "with attr 0x%" PRIx64 " error '%s'",
14330811baedSXiaoyao Li start, size, attr, strerror(errno));
14340811baedSXiaoyao Li }
14350811baedSXiaoyao Li return r;
14360811baedSXiaoyao Li }
14370811baedSXiaoyao Li
14380811baedSXiaoyao Li int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
14390811baedSXiaoyao Li {
14400811baedSXiaoyao Li return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
14410811baedSXiaoyao Li }
14420811baedSXiaoyao Li
14430811baedSXiaoyao Li int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
14440811baedSXiaoyao Li {
14450811baedSXiaoyao Li return kvm_set_memory_attributes(start, size, 0);
14460811baedSXiaoyao Li }
14470811baedSXiaoyao Li
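/*
 * Usage sketch (illustrative addresses): a confidential guest that
 * wants to expose a page to the host, e.g. for a shared virtio
 * buffer, has the range converted and possibly converted back later:
 *
 *     kvm_set_memory_attributes_shared(0x80000000, 0x1000);
 *     ...
 *     kvm_set_memory_attributes_private(0x80000000, 0x1000);
 */
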
1448f39b7d2bSDavid Hildenbrand /* Called with KVMMemoryListener.slots_lock held */
144992229a57SYang Zhong static void kvm_set_phys_mem(KVMMemoryListener *kml,
145092229a57SYang Zhong MemoryRegionSection *section, bool add)
145192229a57SYang Zhong {
1452f357f564SDavid Hildenbrand KVMSlot *mem;
145392229a57SYang Zhong int err;
145492229a57SYang Zhong MemoryRegion *mr = section->mr;
14559323e79fSPeter Maydell bool writable = !mr->readonly && !mr->rom_device;
14562c20b27eSPeter Xu hwaddr start_addr, size, slot_size, mr_offset;
14572c20b27eSPeter Xu ram_addr_t ram_start_offset;
14585ea69c2eSDavid Hildenbrand void *ram;
145992229a57SYang Zhong
146092229a57SYang Zhong if (!memory_region_is_ram(mr)) {
14619323e79fSPeter Maydell if (writable || !kvm_readonly_mem_allowed) {
146292229a57SYang Zhong return;
146392229a57SYang Zhong } else if (!mr->romd_mode) {
146492229a57SYang Zhong /* If the memory device is not in romd_mode, then we actually want
146592229a57SYang Zhong * to remove the kvm memory slot so all accesses will trap. */
146692229a57SYang Zhong add = false;
146792229a57SYang Zhong }
146892229a57SYang Zhong }
146992229a57SYang Zhong
14705ea69c2eSDavid Hildenbrand size = kvm_align_section(section, &start_addr);
14715ea69c2eSDavid Hildenbrand if (!size) {
14725ea69c2eSDavid Hildenbrand return;
14735ea69c2eSDavid Hildenbrand }
14745ea69c2eSDavid Hildenbrand
14752c20b27eSPeter Xu /* The offset of the kvmslot within the memory region */
14762c20b27eSPeter Xu mr_offset = section->offset_within_region + start_addr -
14772c20b27eSPeter Xu section->offset_within_address_space;
14782c20b27eSPeter Xu
14792c20b27eSPeter Xu /* use aligned delta to align the ram address and offset */
14802c20b27eSPeter Xu ram = memory_region_get_ram_ptr(mr) + mr_offset;
14812c20b27eSPeter Xu ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
148292229a57SYang Zhong
1483f357f564SDavid Hildenbrand if (!add) {
1484023ae9a8SIgor Mammedov do {
1485023ae9a8SIgor Mammedov slot_size = MIN(kvm_max_slot_size, size);
1486023ae9a8SIgor Mammedov mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
148792229a57SYang Zhong if (!mem) {
1488f39b7d2bSDavid Hildenbrand return;
148992229a57SYang Zhong }
149092229a57SYang Zhong if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1491b4420f19SPeter Xu /*
1492b4420f19SPeter Xu * NOTE: this is only a best-effort attempt to sync the dirty
1493b4420f19SPeter Xu * bits.  Whether we use the dirty log or the dirty ring, two
1494b4420f19SPeter Xu * facts are ignored:
1495b4420f19SPeter Xu *
1496b4420f19SPeter Xu * (1) dirty bits can still reside in hardware buffers (PML)
1497b4420f19SPeter Xu *
1498b4420f19SPeter Xu * (2) after we collect the dirty bits here, pages can be dirtied
1499b4420f19SPeter Xu * again before the final KVM_SET_USER_MEMORY_REGION that removes
1500b4420f19SPeter Xu * the slot.
1501b4420f19SPeter Xu *
1502b4420f19SPeter Xu * Not easy to fix.  Let's cross our fingers until it is.
1503b4420f19SPeter Xu */
1504b4420f19SPeter Xu if (kvm_state->kvm_dirty_ring_size) {
15051667e2b9SHyman Huang(黄勇) kvm_dirty_ring_reap_locked(kvm_state, NULL);
1506b20cc776SGavin Shan if (kvm_state->kvm_dirty_ring_with_bitmap) {
1507b20cc776SGavin Shan kvm_slot_sync_dirty_pages(mem);
1508b20cc776SGavin Shan kvm_slot_get_dirty_log(kvm_state, mem);
1509b20cc776SGavin Shan }
1510b4420f19SPeter Xu } else {
151129b7e8beSPeter Xu kvm_slot_get_dirty_log(kvm_state, mem);
1512b4420f19SPeter Xu }
151329b7e8beSPeter Xu kvm_slot_sync_dirty_pages(mem);
151492229a57SYang Zhong }
151592229a57SYang Zhong
1516f357f564SDavid Hildenbrand /* unregister the slot */
15179f4bf4baSPeter Xu g_free(mem->dirty_bmap);
15189f4bf4baSPeter Xu mem->dirty_bmap = NULL;
151992229a57SYang Zhong mem->memory_size = 0;
15206c090d4aSShannon Zhao mem->flags = 0;
15216c090d4aSShannon Zhao err = kvm_set_user_memory_region(kml, mem, false);
152292229a57SYang Zhong if (err) {
15231c4fdabaSDavid Hildenbrand fprintf(stderr, "%s: error unregistering slot: %s\n",
152492229a57SYang Zhong __func__, strerror(-err));
152592229a57SYang Zhong abort();
152692229a57SYang Zhong }
1527023ae9a8SIgor Mammedov start_addr += slot_size;
1528023ae9a8SIgor Mammedov size -= slot_size;
1529dbdc00baSPeter Xu kml->nr_slots_used--;
1530023ae9a8SIgor Mammedov } while (size);
1531f39b7d2bSDavid Hildenbrand return;
153292229a57SYang Zhong }
1533f357f564SDavid Hildenbrand
1534f357f564SDavid Hildenbrand /* register the new slot */
1535023ae9a8SIgor Mammedov do {
1536023ae9a8SIgor Mammedov slot_size = MIN(kvm_max_slot_size, size);
153792229a57SYang Zhong mem = kvm_alloc_slot(kml);
1538e65e5f50SPeter Xu mem->as_id = kml->as_id;
1539023ae9a8SIgor Mammedov mem->memory_size = slot_size;
154092229a57SYang Zhong mem->start_addr = start_addr;
15412c20b27eSPeter Xu mem->ram_start_offset = ram_start_offset;
154292229a57SYang Zhong mem->ram = ram;
154392229a57SYang Zhong mem->flags = kvm_mem_flags(mr);
1544ce5a9832SChao Peng mem->guest_memfd = mr->ram_block->guest_memfd;
1545ce5a9832SChao Peng mem->guest_memfd_offset = (uint8_t*)ram - mr->ram_block->host;
1546ce5a9832SChao Peng
1547ea776d15SPeter Xu kvm_slot_init_dirty_bitmap(mem);
15486c090d4aSShannon Zhao err = kvm_set_user_memory_region(kml, mem, true);
154992229a57SYang Zhong if (err) {
155092229a57SYang Zhong fprintf(stderr, "%s: error registering slot: %s\n", __func__,
155192229a57SYang Zhong strerror(-err));
155292229a57SYang Zhong abort();
155392229a57SYang Zhong }
1554bd3bcf69SXiaoyao Li
1555bd3bcf69SXiaoyao Li if (memory_region_has_guest_memfd(mr)) {
1556bd3bcf69SXiaoyao Li err = kvm_set_memory_attributes_private(start_addr, slot_size);
1557bd3bcf69SXiaoyao Li if (err) {
1558bd3bcf69SXiaoyao Li error_report("%s: failed to set memory attribute private: %s",
1559bd3bcf69SXiaoyao Li __func__, strerror(-err));
1560bd3bcf69SXiaoyao Li exit(1);
1561bd3bcf69SXiaoyao Li }
1562bd3bcf69SXiaoyao Li }
1563bd3bcf69SXiaoyao Li
1564023ae9a8SIgor Mammedov start_addr += slot_size;
15652c20b27eSPeter Xu ram_start_offset += slot_size;
1566023ae9a8SIgor Mammedov ram += slot_size;
1567023ae9a8SIgor Mammedov size -= slot_size;
1568dbdc00baSPeter Xu kml->nr_slots_used++;
1569023ae9a8SIgor Mammedov } while (size);
157092229a57SYang Zhong }
157192229a57SYang Zhong
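/*
 * Worked example for the do/while splitting above (hypothetical
 * limit): with kvm_max_slot_size set to 8 GiB, a 12 GiB section is
 * registered as one 8 GiB and one 4 GiB memslot; start_addr,
 * ram_start_offset and ram all advance by slot_size per iteration,
 * so the second slot describes the tail of the same RAM block.
 */
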
1572b4420f19SPeter Xu static void *kvm_dirty_ring_reaper_thread(void *data)
1573b4420f19SPeter Xu {
1574b4420f19SPeter Xu KVMState *s = data;
1575b4420f19SPeter Xu struct KVMDirtyRingReaper *r = &s->reaper;
1576b4420f19SPeter Xu
1577b4420f19SPeter Xu rcu_register_thread();
1578b4420f19SPeter Xu
1579b4420f19SPeter Xu trace_kvm_dirty_ring_reaper("init");
1580b4420f19SPeter Xu
1581b4420f19SPeter Xu while (true) {
1582b4420f19SPeter Xu r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1583b4420f19SPeter Xu trace_kvm_dirty_ring_reaper("wait");
1584b4420f19SPeter Xu /*
1585b4420f19SPeter Xu * TODO: provide a smarter timeout rather than a constant?
1586b4420f19SPeter Xu */
1587b4420f19SPeter Xu sleep(1);
1588b4420f19SPeter Xu
1589baa60983SHyman Huang(黄勇) /* keep sleeping so that the dirty limit logic is not disturbed by the reaper */
1590baa60983SHyman Huang(黄勇) if (dirtylimit_in_service()) {
1591baa60983SHyman Huang(黄勇) continue;
1592baa60983SHyman Huang(黄勇) }
1593baa60983SHyman Huang(黄勇)
1594b4420f19SPeter Xu trace_kvm_dirty_ring_reaper("wakeup");
1595b4420f19SPeter Xu r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1596b4420f19SPeter Xu
1597195801d7SStefan Hajnoczi bql_lock();
15981667e2b9SHyman Huang(黄勇) kvm_dirty_ring_reap(s, NULL);
1599195801d7SStefan Hajnoczi bql_unlock();
1600b4420f19SPeter Xu
1601b4420f19SPeter Xu r->reaper_iteration++;
1602b4420f19SPeter Xu }
1603b4420f19SPeter Xu
1604c4d16d41SPeter Maydell g_assert_not_reached();
1605b4420f19SPeter Xu }
1606b4420f19SPeter Xu
160743a5e377SAkihiko Odaki static void kvm_dirty_ring_reaper_init(KVMState *s)
1608b4420f19SPeter Xu {
1609b4420f19SPeter Xu struct KVMDirtyRingReaper *r = &s->reaper;
1610b4420f19SPeter Xu
1611b4420f19SPeter Xu qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1612b4420f19SPeter Xu kvm_dirty_ring_reaper_thread,
1613b4420f19SPeter Xu s, QEMU_THREAD_JOINABLE);
1614b4420f19SPeter Xu }
1615b4420f19SPeter Xu
16163794cb94SGavin Shan static int kvm_dirty_ring_init(KVMState *s)
16173794cb94SGavin Shan {
16183794cb94SGavin Shan uint32_t ring_size = s->kvm_dirty_ring_size;
16193794cb94SGavin Shan uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
1620856e23a0SGavin Shan unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
16213794cb94SGavin Shan int ret;
16223794cb94SGavin Shan
16233794cb94SGavin Shan s->kvm_dirty_ring_size = 0;
16243794cb94SGavin Shan s->kvm_dirty_ring_bytes = 0;
16253794cb94SGavin Shan
16263794cb94SGavin Shan /* Bail if the dirty ring size isn't specified */
16273794cb94SGavin Shan if (!ring_size) {
16283794cb94SGavin Shan return 0;
16293794cb94SGavin Shan }
16303794cb94SGavin Shan
16313794cb94SGavin Shan /*
16323794cb94SGavin Shan * Read the max supported pages. Fall back to dirty logging mode
16333794cb94SGavin Shan * if the dirty ring isn't supported.
16343794cb94SGavin Shan */
1635856e23a0SGavin Shan ret = kvm_vm_check_extension(s, capability);
1636856e23a0SGavin Shan if (ret <= 0) {
1637856e23a0SGavin Shan capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
1638856e23a0SGavin Shan ret = kvm_vm_check_extension(s, capability);
1639856e23a0SGavin Shan }
1640856e23a0SGavin Shan
16413794cb94SGavin Shan if (ret <= 0) {
16423794cb94SGavin Shan warn_report("KVM dirty ring not available, using bitmap method");
16433794cb94SGavin Shan return 0;
16443794cb94SGavin Shan }
16453794cb94SGavin Shan
16463794cb94SGavin Shan if (ring_bytes > ret) {
16473794cb94SGavin Shan error_report("KVM dirty ring size %" PRIu32 " too big "
16483794cb94SGavin Shan "(maximum is %ld). Please use a smaller value.",
16493794cb94SGavin Shan ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
16503794cb94SGavin Shan return -EINVAL;
16513794cb94SGavin Shan }
16523794cb94SGavin Shan
1653856e23a0SGavin Shan ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
16543794cb94SGavin Shan if (ret) {
16553794cb94SGavin Shan error_report("Enabling of KVM dirty ring failed: %s. "
16563794cb94SGavin Shan "Suggested minimum value is 1024.", strerror(-ret));
16573794cb94SGavin Shan return -EIO;
16583794cb94SGavin Shan }
16593794cb94SGavin Shan
1660856e23a0SGavin Shan /* Enable the backup bitmap if it is supported */
1661856e23a0SGavin Shan ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
1662856e23a0SGavin Shan if (ret > 0) {
1663856e23a0SGavin Shan ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
1664856e23a0SGavin Shan if (ret) {
1665856e23a0SGavin Shan error_report("Enabling of KVM dirty ring's backup bitmap failed: "
1666856e23a0SGavin Shan "%s. ", strerror(-ret));
1667856e23a0SGavin Shan return -EIO;
1668856e23a0SGavin Shan }
1669856e23a0SGavin Shan
1670856e23a0SGavin Shan s->kvm_dirty_ring_with_bitmap = true;
1671856e23a0SGavin Shan }
1672856e23a0SGavin Shan
16733794cb94SGavin Shan s->kvm_dirty_ring_size = ring_size;
16743794cb94SGavin Shan s->kvm_dirty_ring_bytes = ring_bytes;
16753794cb94SGavin Shan
16763794cb94SGavin Shan return 0;
16773794cb94SGavin Shan }
16783794cb94SGavin Shan
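/*
 * Sizing example (illustrative): "-accel kvm,dirty-ring-size=4096"
 * gives ring_bytes = 4096 * sizeof(struct kvm_dirty_gfn), i.e.
 * 4096 * 16 = 64 KiB per vCPU with the current 16-byte entry, which
 * must fit within the byte limit KVM_CHECK_EXTENSION reported above.
 */
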
167992229a57SYang Zhong static void kvm_region_add(MemoryListener *listener,
168092229a57SYang Zhong MemoryRegionSection *section)
168192229a57SYang Zhong {
168292229a57SYang Zhong KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1683f39b7d2bSDavid Hildenbrand KVMMemoryUpdate *update;
168492229a57SYang Zhong
1685f39b7d2bSDavid Hildenbrand update = g_new0(KVMMemoryUpdate, 1);
1686f39b7d2bSDavid Hildenbrand update->section = *section;
1687f39b7d2bSDavid Hildenbrand
1688f39b7d2bSDavid Hildenbrand QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
168992229a57SYang Zhong }
169092229a57SYang Zhong
169192229a57SYang Zhong static void kvm_region_del(MemoryListener *listener,
169292229a57SYang Zhong MemoryRegionSection *section)
169392229a57SYang Zhong {
169492229a57SYang Zhong KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1695f39b7d2bSDavid Hildenbrand KVMMemoryUpdate *update;
169692229a57SYang Zhong
1697f39b7d2bSDavid Hildenbrand update = g_new0(KVMMemoryUpdate, 1);
1698f39b7d2bSDavid Hildenbrand update->section = *section;
1699f39b7d2bSDavid Hildenbrand
1700f39b7d2bSDavid Hildenbrand QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
1701f39b7d2bSDavid Hildenbrand }
1702f39b7d2bSDavid Hildenbrand
1703f39b7d2bSDavid Hildenbrand static void kvm_region_commit(MemoryListener *listener)
1704f39b7d2bSDavid Hildenbrand {
1705f39b7d2bSDavid Hildenbrand KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
1706f39b7d2bSDavid Hildenbrand listener);
1707f39b7d2bSDavid Hildenbrand KVMMemoryUpdate *u1, *u2;
1708f39b7d2bSDavid Hildenbrand bool need_inhibit = false;
1709f39b7d2bSDavid Hildenbrand
1710f39b7d2bSDavid Hildenbrand if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
1711f39b7d2bSDavid Hildenbrand QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1712f39b7d2bSDavid Hildenbrand return;
1713f39b7d2bSDavid Hildenbrand }
1714f39b7d2bSDavid Hildenbrand
1715f39b7d2bSDavid Hildenbrand /*
1716f39b7d2bSDavid Hildenbrand * We have to be careful when regions to add overlap with ranges to remove.
1717f39b7d2bSDavid Hildenbrand * We have to simulate atomic KVM memslot updates by making sure no ioctl()
1718f39b7d2bSDavid Hildenbrand * is currently active.
1719f39b7d2bSDavid Hildenbrand *
1720f39b7d2bSDavid Hildenbrand * The lists are ordered by address, so it's easy to find overlaps.
1721f39b7d2bSDavid Hildenbrand */
1722f39b7d2bSDavid Hildenbrand u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1723f39b7d2bSDavid Hildenbrand u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
1724f39b7d2bSDavid Hildenbrand while (u1 && u2) {
1725f39b7d2bSDavid Hildenbrand Range r1, r2;
1726f39b7d2bSDavid Hildenbrand
1727f39b7d2bSDavid Hildenbrand range_init_nofail(&r1, u1->section.offset_within_address_space,
1728f39b7d2bSDavid Hildenbrand int128_get64(u1->section.size));
1729f39b7d2bSDavid Hildenbrand range_init_nofail(&r2, u2->section.offset_within_address_space,
1730f39b7d2bSDavid Hildenbrand int128_get64(u2->section.size));
1731f39b7d2bSDavid Hildenbrand
1732f39b7d2bSDavid Hildenbrand if (range_overlaps_range(&r1, &r2)) {
1733f39b7d2bSDavid Hildenbrand need_inhibit = true;
1734f39b7d2bSDavid Hildenbrand break;
1735f39b7d2bSDavid Hildenbrand }
1736f39b7d2bSDavid Hildenbrand if (range_lob(&r1) < range_lob(&r2)) {
1737f39b7d2bSDavid Hildenbrand u1 = QSIMPLEQ_NEXT(u1, next);
1738f39b7d2bSDavid Hildenbrand } else {
1739f39b7d2bSDavid Hildenbrand u2 = QSIMPLEQ_NEXT(u2, next);
1740f39b7d2bSDavid Hildenbrand }
1741f39b7d2bSDavid Hildenbrand }
1742f39b7d2bSDavid Hildenbrand
1743f39b7d2bSDavid Hildenbrand kvm_slots_lock();
1744f39b7d2bSDavid Hildenbrand if (need_inhibit) {
1745f39b7d2bSDavid Hildenbrand accel_ioctl_inhibit_begin();
1746f39b7d2bSDavid Hildenbrand }
1747f39b7d2bSDavid Hildenbrand
1748f39b7d2bSDavid Hildenbrand /* Remove all memslots before adding the new ones. */
1749f39b7d2bSDavid Hildenbrand while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1750f39b7d2bSDavid Hildenbrand u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1751f39b7d2bSDavid Hildenbrand QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
1752f39b7d2bSDavid Hildenbrand
1753f39b7d2bSDavid Hildenbrand kvm_set_phys_mem(kml, &u1->section, false);
1754f39b7d2bSDavid Hildenbrand memory_region_unref(u1->section.mr);
1755f39b7d2bSDavid Hildenbrand
1756f39b7d2bSDavid Hildenbrand g_free(u1);
1757f39b7d2bSDavid Hildenbrand }
1758f39b7d2bSDavid Hildenbrand while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
1759f39b7d2bSDavid Hildenbrand u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
1760f39b7d2bSDavid Hildenbrand QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
1761f39b7d2bSDavid Hildenbrand
1762f39b7d2bSDavid Hildenbrand memory_region_ref(u1->section.mr);
1763f39b7d2bSDavid Hildenbrand kvm_set_phys_mem(kml, &u1->section, true);
1764f39b7d2bSDavid Hildenbrand
1765f39b7d2bSDavid Hildenbrand g_free(u1);
1766f39b7d2bSDavid Hildenbrand }
1767f39b7d2bSDavid Hildenbrand
1768f39b7d2bSDavid Hildenbrand if (need_inhibit) {
1769f39b7d2bSDavid Hildenbrand accel_ioctl_inhibit_end();
1770f39b7d2bSDavid Hildenbrand }
1771f39b7d2bSDavid Hildenbrand kvm_slots_unlock();
177292229a57SYang Zhong }
177392229a57SYang Zhong
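/*
 * Example of the overlap case handled above (hypothetical ranges):
 * growing a memslot arrives as a del/add pair over overlapping
 * ranges,
 *
 *     del: [0x100000000, +0x80000000)
 *     add: [0x100000000, +0xc0000000)
 *
 * and between the two KVM_SET_USER_MEMORY_REGION calls the range is
 * briefly unmapped, so a concurrently running vCPU ioctl could fault
 * on it; hence all accelerator ioctls are inhibited for the update.
 */
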
177492229a57SYang Zhong static void kvm_log_sync(MemoryListener *listener,
177592229a57SYang Zhong MemoryRegionSection *section)
177692229a57SYang Zhong {
177792229a57SYang Zhong KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
177892229a57SYang Zhong
1779a2f77862SPeter Xu kvm_slots_lock();
1780e65e5f50SPeter Xu kvm_physical_sync_dirty_bitmap(kml, section);
1781a2f77862SPeter Xu kvm_slots_unlock();
178292229a57SYang Zhong }
178392229a57SYang Zhong
17841e493be5SGavin Shan static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
1785b4420f19SPeter Xu {
1786b4420f19SPeter Xu KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1787b4420f19SPeter Xu KVMState *s = kvm_state;
1788b4420f19SPeter Xu KVMSlot *mem;
1789b4420f19SPeter Xu int i;
1790b4420f19SPeter Xu
1791b4420f19SPeter Xu /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1792b4420f19SPeter Xu kvm_dirty_ring_flush();
1793b4420f19SPeter Xu
1794b4420f19SPeter Xu kvm_slots_lock();
17955504a812SPeter Xu for (i = 0; i < kml->nr_slots_allocated; i++) {
1796b4420f19SPeter Xu mem = &kml->slots[i];
1797b4420f19SPeter Xu if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1798b4420f19SPeter Xu kvm_slot_sync_dirty_pages(mem);
1799b20cc776SGavin Shan
1800b20cc776SGavin Shan if (s->kvm_dirty_ring_with_bitmap && last_stage &&
1801b20cc776SGavin Shan kvm_slot_get_dirty_log(s, mem)) {
1802b20cc776SGavin Shan kvm_slot_sync_dirty_pages(mem);
1803b20cc776SGavin Shan }
1804b20cc776SGavin Shan
1805b4420f19SPeter Xu /*
1806b4420f19SPeter Xu * This is not needed by KVM_GET_DIRTY_LOG because the
1807b4420f19SPeter Xu * ioctl will unconditionally overwrite the whole region.
1808b4420f19SPeter Xu * However, the KVM dirty ring has no such side effect.
1809b4420f19SPeter Xu */
1810b4420f19SPeter Xu kvm_slot_reset_dirty_pages(mem);
1811b4420f19SPeter Xu }
1812b4420f19SPeter Xu }
1813b4420f19SPeter Xu kvm_slots_unlock();
1814b4420f19SPeter Xu }
1815b4420f19SPeter Xu
1816ff4aa114SPeter Xu static void kvm_log_clear(MemoryListener *listener,
1817ff4aa114SPeter Xu MemoryRegionSection *section)
1818ff4aa114SPeter Xu {
1819ff4aa114SPeter Xu KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1820ff4aa114SPeter Xu int r;
1821ff4aa114SPeter Xu
1822ff4aa114SPeter Xu r = kvm_physical_log_clear(kml, section);
1823ff4aa114SPeter Xu if (r < 0) {
1824ff4aa114SPeter Xu error_report_once("%s: kvm log clear failed: mr=%s "
1825ff4aa114SPeter Xu "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1826ff4aa114SPeter Xu section->mr->name, section->offset_within_region,
1827ff4aa114SPeter Xu int128_get64(section->size));
1828ff4aa114SPeter Xu abort();
1829ff4aa114SPeter Xu }
1830ff4aa114SPeter Xu }
1831ff4aa114SPeter Xu
183292229a57SYang Zhong static void kvm_mem_ioeventfd_add(MemoryListener *listener,
183392229a57SYang Zhong MemoryRegionSection *section,
183492229a57SYang Zhong bool match_data, uint64_t data,
183592229a57SYang Zhong EventNotifier *e)
183692229a57SYang Zhong {
183792229a57SYang Zhong int fd = event_notifier_get_fd(e);
183892229a57SYang Zhong int r;
183992229a57SYang Zhong
184092229a57SYang Zhong r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
184192229a57SYang Zhong data, true, int128_get64(section->size),
184292229a57SYang Zhong match_data);
184392229a57SYang Zhong if (r < 0) {
1844e346bcbfSYury Kotov fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1845e346bcbfSYury Kotov __func__, strerror(-r), -r);
184692229a57SYang Zhong abort();
184792229a57SYang Zhong }
184892229a57SYang Zhong }
184992229a57SYang Zhong
185092229a57SYang Zhong static void kvm_mem_ioeventfd_del(MemoryListener *listener,
185192229a57SYang Zhong MemoryRegionSection *section,
185292229a57SYang Zhong bool match_data, uint64_t data,
185392229a57SYang Zhong EventNotifier *e)
185492229a57SYang Zhong {
185592229a57SYang Zhong int fd = event_notifier_get_fd(e);
185692229a57SYang Zhong int r;
185792229a57SYang Zhong
185892229a57SYang Zhong r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
185992229a57SYang Zhong data, false, int128_get64(section->size),
186092229a57SYang Zhong match_data);
186192229a57SYang Zhong if (r < 0) {
1862e346bcbfSYury Kotov fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1863e346bcbfSYury Kotov __func__, strerror(-r), -r);
186492229a57SYang Zhong abort();
186592229a57SYang Zhong }
186692229a57SYang Zhong }
186792229a57SYang Zhong
186892229a57SYang Zhong static void kvm_io_ioeventfd_add(MemoryListener *listener,
186992229a57SYang Zhong MemoryRegionSection *section,
187092229a57SYang Zhong bool match_data, uint64_t data,
187192229a57SYang Zhong EventNotifier *e)
187292229a57SYang Zhong {
187392229a57SYang Zhong int fd = event_notifier_get_fd(e);
187492229a57SYang Zhong int r;
187592229a57SYang Zhong
187692229a57SYang Zhong r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
187792229a57SYang Zhong data, true, int128_get64(section->size),
187892229a57SYang Zhong match_data);
187992229a57SYang Zhong if (r < 0) {
1880e346bcbfSYury Kotov fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1881e346bcbfSYury Kotov __func__, strerror(-r), -r);
188292229a57SYang Zhong abort();
188392229a57SYang Zhong }
188492229a57SYang Zhong }
188592229a57SYang Zhong
188692229a57SYang Zhong static void kvm_io_ioeventfd_del(MemoryListener *listener,
188792229a57SYang Zhong MemoryRegionSection *section,
188892229a57SYang Zhong bool match_data, uint64_t data,
188992229a57SYang Zhong EventNotifier *e)
189092229a57SYang Zhong
189192229a57SYang Zhong {
189292229a57SYang Zhong int fd = event_notifier_get_fd(e);
189392229a57SYang Zhong int r;
189492229a57SYang Zhong
189592229a57SYang Zhong r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
189692229a57SYang Zhong data, false, int128_get64(section->size),
189792229a57SYang Zhong match_data);
189892229a57SYang Zhong if (r < 0) {
1899e346bcbfSYury Kotov fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1900e346bcbfSYury Kotov __func__, strerror(-r), -r);
190192229a57SYang Zhong abort();
190292229a57SYang Zhong }
190392229a57SYang Zhong }
190492229a57SYang Zhong
190592229a57SYang Zhong void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1906142518bdSPeter Xu AddressSpace *as, int as_id, const char *name)
190792229a57SYang Zhong {
190892229a57SYang Zhong int i;
190992229a57SYang Zhong
191092229a57SYang Zhong kml->as_id = as_id;
191192229a57SYang Zhong
19125504a812SPeter Xu kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
191392229a57SYang Zhong
1914f39b7d2bSDavid Hildenbrand QSIMPLEQ_INIT(&kml->transaction_add);
1915f39b7d2bSDavid Hildenbrand QSIMPLEQ_INIT(&kml->transaction_del);
1916f39b7d2bSDavid Hildenbrand
191792229a57SYang Zhong kml->listener.region_add = kvm_region_add;
191892229a57SYang Zhong kml->listener.region_del = kvm_region_del;
1919f39b7d2bSDavid Hildenbrand kml->listener.commit = kvm_region_commit;
192092229a57SYang Zhong kml->listener.log_start = kvm_log_start;
192192229a57SYang Zhong kml->listener.log_stop = kvm_log_stop;
19225369a36cSIsaku Yamahata kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
1923142518bdSPeter Xu kml->listener.name = name;
1924b4420f19SPeter Xu
1925b4420f19SPeter Xu if (s->kvm_dirty_ring_size) {
1926b4420f19SPeter Xu kml->listener.log_sync_global = kvm_log_sync_global;
1927b4420f19SPeter Xu } else {
192892229a57SYang Zhong kml->listener.log_sync = kvm_log_sync;
1929ff4aa114SPeter Xu kml->listener.log_clear = kvm_log_clear;
1930b4420f19SPeter Xu }
193192229a57SYang Zhong
193292229a57SYang Zhong memory_listener_register(&kml->listener, as);
19338072aae3SAlexey Kardashevskiy
19348072aae3SAlexey Kardashevskiy for (i = 0; i < s->nr_as; ++i) {
19358072aae3SAlexey Kardashevskiy if (!s->as[i].as) {
19368072aae3SAlexey Kardashevskiy s->as[i].as = as;
19378072aae3SAlexey Kardashevskiy s->as[i].ml = kml;
19388072aae3SAlexey Kardashevskiy break;
19398072aae3SAlexey Kardashevskiy }
19408072aae3SAlexey Kardashevskiy }
194192229a57SYang Zhong }
194292229a57SYang Zhong
194392229a57SYang Zhong static MemoryListener kvm_io_listener = {
1944142518bdSPeter Xu .name = "kvm-io",
19452cb81af0SPaolo Bonzini .coalesced_io_add = kvm_coalesce_pio_add,
19462cb81af0SPaolo Bonzini .coalesced_io_del = kvm_coalesce_pio_del,
194792229a57SYang Zhong .eventfd_add = kvm_io_ioeventfd_add,
194892229a57SYang Zhong .eventfd_del = kvm_io_ioeventfd_del,
19498be0461dSIsaku Yamahata .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
195092229a57SYang Zhong };
195192229a57SYang Zhong
195292229a57SYang Zhong int kvm_set_irq(KVMState *s, int irq, int level)
195392229a57SYang Zhong {
195492229a57SYang Zhong struct kvm_irq_level event;
195592229a57SYang Zhong int ret;
195692229a57SYang Zhong
195792229a57SYang Zhong assert(kvm_async_interrupts_enabled());
195892229a57SYang Zhong
195992229a57SYang Zhong event.level = level;
196092229a57SYang Zhong event.irq = irq;
196192229a57SYang Zhong ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
196292229a57SYang Zhong if (ret < 0) {
196392229a57SYang Zhong perror("kvm_set_irq");
196492229a57SYang Zhong abort();
196592229a57SYang Zhong }
196692229a57SYang Zhong
196792229a57SYang Zhong return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
196892229a57SYang Zhong }
196992229a57SYang Zhong
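/*
 * Usage sketch (illustrative): pulse an in-kernel irqchip line, e.g.
 * ISA IRQ 10.  When the kernel supports KVM_IRQ_LINE_STATUS, the
 * return value reports whether the interrupt was actually delivered:
 *
 *     kvm_set_irq(s, 10, 1);
 *     kvm_set_irq(s, 10, 0);
 */
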
197092229a57SYang Zhong #ifdef KVM_CAP_IRQ_ROUTING
197192229a57SYang Zhong typedef struct KVMMSIRoute {
197292229a57SYang Zhong struct kvm_irq_routing_entry kroute;
197392229a57SYang Zhong QTAILQ_ENTRY(KVMMSIRoute) entry;
197492229a57SYang Zhong } KVMMSIRoute;
197592229a57SYang Zhong
197692229a57SYang Zhong static void set_gsi(KVMState *s, unsigned int gsi)
197792229a57SYang Zhong {
197892229a57SYang Zhong set_bit(gsi, s->used_gsi_bitmap);
197992229a57SYang Zhong }
198092229a57SYang Zhong
198192229a57SYang Zhong static void clear_gsi(KVMState *s, unsigned int gsi)
198292229a57SYang Zhong {
198392229a57SYang Zhong clear_bit(gsi, s->used_gsi_bitmap);
198492229a57SYang Zhong }
198592229a57SYang Zhong
198692229a57SYang Zhong void kvm_init_irq_routing(KVMState *s)
198792229a57SYang Zhong {
1988cc5e719eSPaolo Bonzini int gsi_count;
198992229a57SYang Zhong
199092229a57SYang Zhong gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
199192229a57SYang Zhong if (gsi_count > 0) {
199292229a57SYang Zhong /* Round up so we can search ints using ffs */
199392229a57SYang Zhong s->used_gsi_bitmap = bitmap_new(gsi_count);
199492229a57SYang Zhong s->gsi_count = gsi_count;
199592229a57SYang Zhong }
199692229a57SYang Zhong
199792229a57SYang Zhong s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
199892229a57SYang Zhong s->nr_allocated_irq_routes = 0;
199992229a57SYang Zhong
200092229a57SYang Zhong kvm_arch_init_irq_routing(s);
200192229a57SYang Zhong }
200292229a57SYang Zhong
200392229a57SYang Zhong void kvm_irqchip_commit_routes(KVMState *s)
200492229a57SYang Zhong {
200592229a57SYang Zhong int ret;
200692229a57SYang Zhong
200792229a57SYang Zhong if (kvm_gsi_direct_mapping()) {
200892229a57SYang Zhong return;
200992229a57SYang Zhong }
201092229a57SYang Zhong
201192229a57SYang Zhong if (!kvm_gsi_routing_enabled()) {
201292229a57SYang Zhong return;
201392229a57SYang Zhong }
201492229a57SYang Zhong
201592229a57SYang Zhong s->irq_routes->flags = 0;
201692229a57SYang Zhong trace_kvm_irqchip_commit_routes();
201792229a57SYang Zhong ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
201892229a57SYang Zhong assert(ret == 0);
201992229a57SYang Zhong }
202092229a57SYang Zhong
202148663349SPaolo Bonzini void kvm_add_routing_entry(KVMState *s,
202292229a57SYang Zhong struct kvm_irq_routing_entry *entry)
202392229a57SYang Zhong {
202492229a57SYang Zhong struct kvm_irq_routing_entry *new;
202592229a57SYang Zhong int n, size;
202692229a57SYang Zhong
202792229a57SYang Zhong if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
202892229a57SYang Zhong n = s->nr_allocated_irq_routes * 2;
202992229a57SYang Zhong if (n < 64) {
203092229a57SYang Zhong n = 64;
203192229a57SYang Zhong }
203292229a57SYang Zhong size = sizeof(struct kvm_irq_routing);
203392229a57SYang Zhong size += n * sizeof(*new);
203492229a57SYang Zhong s->irq_routes = g_realloc(s->irq_routes, size);
203592229a57SYang Zhong s->nr_allocated_irq_routes = n;
203692229a57SYang Zhong }
203792229a57SYang Zhong n = s->irq_routes->nr++;
203892229a57SYang Zhong new = &s->irq_routes->entries[n];
203992229a57SYang Zhong
204092229a57SYang Zhong *new = *entry;
204192229a57SYang Zhong
204292229a57SYang Zhong set_gsi(s, entry->gsi);
204392229a57SYang Zhong }
204492229a57SYang Zhong
204592229a57SYang Zhong static int kvm_update_routing_entry(KVMState *s,
204692229a57SYang Zhong struct kvm_irq_routing_entry *new_entry)
204792229a57SYang Zhong {
204892229a57SYang Zhong struct kvm_irq_routing_entry *entry;
204992229a57SYang Zhong int n;
205092229a57SYang Zhong
205192229a57SYang Zhong for (n = 0; n < s->irq_routes->nr; n++) {
205292229a57SYang Zhong entry = &s->irq_routes->entries[n];
205392229a57SYang Zhong if (entry->gsi != new_entry->gsi) {
205492229a57SYang Zhong continue;
205592229a57SYang Zhong }
205692229a57SYang Zhong
205792229a57SYang Zhong if (!memcmp(entry, new_entry, sizeof *entry)) {
205892229a57SYang Zhong return 0;
205992229a57SYang Zhong }
206092229a57SYang Zhong
206192229a57SYang Zhong *entry = *new_entry;
206292229a57SYang Zhong
206392229a57SYang Zhong return 0;
206492229a57SYang Zhong }
206592229a57SYang Zhong
206692229a57SYang Zhong return -ESRCH;
206792229a57SYang Zhong }
206892229a57SYang Zhong
206992229a57SYang Zhong void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
207092229a57SYang Zhong {
207192229a57SYang Zhong struct kvm_irq_routing_entry e = {};
207292229a57SYang Zhong
207392229a57SYang Zhong assert(pin < s->gsi_count);
207492229a57SYang Zhong
207592229a57SYang Zhong e.gsi = irq;
207692229a57SYang Zhong e.type = KVM_IRQ_ROUTING_IRQCHIP;
207792229a57SYang Zhong e.flags = 0;
207892229a57SYang Zhong e.u.irqchip.irqchip = irqchip;
207992229a57SYang Zhong e.u.irqchip.pin = pin;
208092229a57SYang Zhong kvm_add_routing_entry(s, &e);
208192229a57SYang Zhong }
208292229a57SYang Zhong
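/*
 * Example (x86-flavoured, illustrative): wire GSI 0 to pin 0 of the
 * in-kernel PIC master, as per-target setup code would, and commit:
 *
 *     kvm_irqchip_add_irq_route(s, 0, KVM_IRQCHIP_PIC_MASTER, 0);
 *     ...
 *     kvm_irqchip_commit_routes(s);
 */
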
208392229a57SYang Zhong void kvm_irqchip_release_virq(KVMState *s, int virq)
208492229a57SYang Zhong {
208592229a57SYang Zhong struct kvm_irq_routing_entry *e;
208692229a57SYang Zhong int i;
208792229a57SYang Zhong
208892229a57SYang Zhong if (kvm_gsi_direct_mapping()) {
208992229a57SYang Zhong return;
209092229a57SYang Zhong }
209192229a57SYang Zhong
209292229a57SYang Zhong for (i = 0; i < s->irq_routes->nr; i++) {
209392229a57SYang Zhong e = &s->irq_routes->entries[i];
209492229a57SYang Zhong if (e->gsi == virq) {
209592229a57SYang Zhong s->irq_routes->nr--;
209692229a57SYang Zhong *e = s->irq_routes->entries[s->irq_routes->nr];
209792229a57SYang Zhong }
209892229a57SYang Zhong }
209992229a57SYang Zhong clear_gsi(s, virq);
210092229a57SYang Zhong kvm_arch_release_virq_post(virq);
210192229a57SYang Zhong trace_kvm_irqchip_release_virq(virq);
210292229a57SYang Zhong }
210392229a57SYang Zhong
21043607715aSDavid Gibson void kvm_irqchip_add_change_notifier(Notifier *n)
21053607715aSDavid Gibson {
21063607715aSDavid Gibson notifier_list_add(&kvm_irqchip_change_notifiers, n);
21073607715aSDavid Gibson }
21083607715aSDavid Gibson
21093607715aSDavid Gibson void kvm_irqchip_remove_change_notifier(Notifier *n)
21103607715aSDavid Gibson {
21113607715aSDavid Gibson notifier_remove(n);
21123607715aSDavid Gibson }
21133607715aSDavid Gibson
21143607715aSDavid Gibson void kvm_irqchip_change_notify(void)
21153607715aSDavid Gibson {
21163607715aSDavid Gibson notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
21173607715aSDavid Gibson }
21183607715aSDavid Gibson
211948663349SPaolo Bonzini int kvm_irqchip_get_virq(KVMState *s)
212092229a57SYang Zhong {
212192229a57SYang Zhong int next_virq;
212292229a57SYang Zhong
212392229a57SYang Zhong /* Return the lowest unused GSI in the bitmap */
212492229a57SYang Zhong next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
212592229a57SYang Zhong if (next_virq >= s->gsi_count) {
212692229a57SYang Zhong return -ENOSPC;
212792229a57SYang Zhong } else {
212892229a57SYang Zhong return next_virq;
212992229a57SYang Zhong }
213092229a57SYang Zhong }
213192229a57SYang Zhong
213292229a57SYang Zhong int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
213392229a57SYang Zhong {
213492229a57SYang Zhong struct kvm_msi msi;
213592229a57SYang Zhong
213692229a57SYang Zhong msi.address_lo = (uint32_t)msg.address;
213792229a57SYang Zhong msi.address_hi = msg.address >> 32;
213892229a57SYang Zhong msi.data = le32_to_cpu(msg.data);
213992229a57SYang Zhong msi.flags = 0;
214092229a57SYang Zhong memset(msi.pad, 0, sizeof(msi.pad));
214192229a57SYang Zhong
214292229a57SYang Zhong return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
214392229a57SYang Zhong }
214492229a57SYang Zhong
2145def4c557SLongpeng(Mike) int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
214692229a57SYang Zhong {
214792229a57SYang Zhong struct kvm_irq_routing_entry kroute = {};
214892229a57SYang Zhong int virq;
2149def4c557SLongpeng(Mike) KVMState *s = c->s;
215092229a57SYang Zhong MSIMessage msg = {0, 0};
215192229a57SYang Zhong
215288c725c7SCornelia Huck if (pci_available && dev) {
215392229a57SYang Zhong msg = pci_get_msi_message(dev, vector);
215492229a57SYang Zhong }
215592229a57SYang Zhong
215692229a57SYang Zhong if (kvm_gsi_direct_mapping()) {
215792229a57SYang Zhong return kvm_arch_msi_data_to_gsi(msg.data);
215892229a57SYang Zhong }
215992229a57SYang Zhong
216092229a57SYang Zhong if (!kvm_gsi_routing_enabled()) {
216192229a57SYang Zhong return -ENOSYS;
216292229a57SYang Zhong }
216392229a57SYang Zhong
216492229a57SYang Zhong virq = kvm_irqchip_get_virq(s);
216592229a57SYang Zhong if (virq < 0) {
216692229a57SYang Zhong return virq;
216792229a57SYang Zhong }
216892229a57SYang Zhong
216992229a57SYang Zhong kroute.gsi = virq;
217092229a57SYang Zhong kroute.type = KVM_IRQ_ROUTING_MSI;
217192229a57SYang Zhong kroute.flags = 0;
217292229a57SYang Zhong kroute.u.msi.address_lo = (uint32_t)msg.address;
217392229a57SYang Zhong kroute.u.msi.address_hi = msg.address >> 32;
217492229a57SYang Zhong kroute.u.msi.data = le32_to_cpu(msg.data);
217588c725c7SCornelia Huck if (pci_available && kvm_msi_devid_required()) {
217692229a57SYang Zhong kroute.flags = KVM_MSI_VALID_DEVID;
217792229a57SYang Zhong kroute.u.msi.devid = pci_requester_id(dev);
217892229a57SYang Zhong }
217992229a57SYang Zhong if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
218092229a57SYang Zhong kvm_irqchip_release_virq(s, virq);
218192229a57SYang Zhong return -EINVAL;
218292229a57SYang Zhong }
218392229a57SYang Zhong
2184e34f4d87SIgor Mammedov if (s->irq_routes->nr < s->gsi_count) {
218592229a57SYang Zhong trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
218692229a57SYang Zhong vector, virq);
218792229a57SYang Zhong
218892229a57SYang Zhong kvm_add_routing_entry(s, &kroute);
218992229a57SYang Zhong kvm_arch_add_msi_route_post(&kroute, vector, dev);
2190def4c557SLongpeng(Mike) c->changes++;
2191e34f4d87SIgor Mammedov } else {
2192e34f4d87SIgor Mammedov kvm_irqchip_release_virq(s, virq);
2193e34f4d87SIgor Mammedov return -ENOSPC;
2194e34f4d87SIgor Mammedov }
219592229a57SYang Zhong
219692229a57SYang Zhong return virq;
219792229a57SYang Zhong }
219892229a57SYang Zhong
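/*
 * Caller sketch (assuming the KVMRouteChange begin/commit helpers
 * declared in sysemu/kvm.h): batch several route additions and flush
 * the routing table to the kernel once:
 *
 *     KVMRouteChange c = kvm_irqchip_begin_route_changes(s);
 *     int virq = kvm_irqchip_add_msi_route(&c, vector, dev);
 *     kvm_irqchip_commit_route_changes(&c);
 */
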
219992229a57SYang Zhong int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
220092229a57SYang Zhong PCIDevice *dev)
220192229a57SYang Zhong {
220292229a57SYang Zhong struct kvm_irq_routing_entry kroute = {};
220392229a57SYang Zhong
220492229a57SYang Zhong if (kvm_gsi_direct_mapping()) {
220592229a57SYang Zhong return 0;
220692229a57SYang Zhong }
220792229a57SYang Zhong
220892229a57SYang Zhong if (!kvm_irqchip_in_kernel()) {
220992229a57SYang Zhong return -ENOSYS;
221092229a57SYang Zhong }
221192229a57SYang Zhong
221292229a57SYang Zhong kroute.gsi = virq;
221392229a57SYang Zhong kroute.type = KVM_IRQ_ROUTING_MSI;
221492229a57SYang Zhong kroute.flags = 0;
221592229a57SYang Zhong kroute.u.msi.address_lo = (uint32_t)msg.address;
221692229a57SYang Zhong kroute.u.msi.address_hi = msg.address >> 32;
221792229a57SYang Zhong kroute.u.msi.data = le32_to_cpu(msg.data);
221888c725c7SCornelia Huck if (pci_available && kvm_msi_devid_required()) {
221992229a57SYang Zhong kroute.flags = KVM_MSI_VALID_DEVID;
222092229a57SYang Zhong kroute.u.msi.devid = pci_requester_id(dev);
222192229a57SYang Zhong }
222292229a57SYang Zhong if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
222392229a57SYang Zhong return -EINVAL;
222492229a57SYang Zhong }
222592229a57SYang Zhong
222692229a57SYang Zhong trace_kvm_irqchip_update_msi_route(virq);
222792229a57SYang Zhong
222892229a57SYang Zhong return kvm_update_routing_entry(s, &kroute);
222992229a57SYang Zhong }
223092229a57SYang Zhong
2231ff66ba87SPeter Xu static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2232ff66ba87SPeter Xu EventNotifier *resample, int virq,
223392229a57SYang Zhong bool assign)
223492229a57SYang Zhong {
2235ff66ba87SPeter Xu int fd = event_notifier_get_fd(event);
2236ff66ba87SPeter Xu int rfd = resample ? event_notifier_get_fd(resample) : -1;
2237ff66ba87SPeter Xu
223892229a57SYang Zhong struct kvm_irqfd irqfd = {
223992229a57SYang Zhong .fd = fd,
224092229a57SYang Zhong .gsi = virq,
224192229a57SYang Zhong .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
224292229a57SYang Zhong };
224392229a57SYang Zhong
224492229a57SYang Zhong if (rfd != -1) {
2245c82d9d43SPeter Xu assert(assign);
2246c82d9d43SPeter Xu if (kvm_irqchip_is_split()) {
2247c82d9d43SPeter Xu /*
2248c82d9d43SPeter Xu * When the slow irqchip (e.g. IOAPIC) is in the
2249c82d9d43SPeter Xu * userspace, KVM kernel resamplefd will not work because
2250c82d9d43SPeter Xu * the EOI of the interrupt will be delivered to userspace
2251c82d9d43SPeter Xu * instead, so the KVM kernel resamplefd kick will be
2252c82d9d43SPeter Xu * skipped. The userspace here mimics what the kernel
2253c82d9d43SPeter Xu * provides with resamplefd, remember the resamplefd and
2254c82d9d43SPeter Xu * provides with resamplefd: remember the resamplefd and
2255c82d9d43SPeter Xu * kick it when we receive the EOI of this IRQ.
2256c82d9d43SPeter Xu * This is hackery because IOAPIC is mostly bypassed
2257c82d9d43SPeter Xu * (except EOI broadcasts) when irqfd is used. However
2258c82d9d43SPeter Xu * this can bring much performance back for split irqchip
2259c82d9d43SPeter Xu * with INTx IRQs (for VFIO, this gives 93% perf of the
2260c82d9d43SPeter Xu * full fast path, which is 46% perf boost comparing to
2261c82d9d43SPeter Xu * the INTx slow path).
2262c82d9d43SPeter Xu */
2263c82d9d43SPeter Xu kvm_resample_fd_insert(virq, resample);
2264c82d9d43SPeter Xu } else {
226592229a57SYang Zhong irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
226692229a57SYang Zhong irqfd.resamplefd = rfd;
226792229a57SYang Zhong }
2268c82d9d43SPeter Xu } else if (!assign) {
2269c82d9d43SPeter Xu if (kvm_irqchip_is_split()) {
2270c82d9d43SPeter Xu kvm_resample_fd_remove(virq);
2271c82d9d43SPeter Xu }
2272c82d9d43SPeter Xu }
227392229a57SYang Zhong
227492229a57SYang Zhong return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
227592229a57SYang Zhong }
227692229a57SYang Zhong
227792229a57SYang Zhong #else /* !KVM_CAP_IRQ_ROUTING */
227892229a57SYang Zhong
227992229a57SYang Zhong void kvm_init_irq_routing(KVMState *s)
228092229a57SYang Zhong {
228192229a57SYang Zhong }
228292229a57SYang Zhong
228392229a57SYang Zhong void kvm_irqchip_release_virq(KVMState *s, int virq)
228492229a57SYang Zhong {
228592229a57SYang Zhong }
228692229a57SYang Zhong
228792229a57SYang Zhong int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
228892229a57SYang Zhong {
228992229a57SYang Zhong abort();
229092229a57SYang Zhong }
229192229a57SYang Zhong
2292def4c557SLongpeng(Mike) int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
229392229a57SYang Zhong {
229492229a57SYang Zhong return -ENOSYS;
229592229a57SYang Zhong }
229692229a57SYang Zhong
229792229a57SYang Zhong int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
229892229a57SYang Zhong {
229992229a57SYang Zhong return -ENOSYS;
230092229a57SYang Zhong }
230192229a57SYang Zhong
230292229a57SYang Zhong int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
230392229a57SYang Zhong {
230492229a57SYang Zhong return -ENOSYS;
230592229a57SYang Zhong }
230692229a57SYang Zhong
2307ff66ba87SPeter Xu static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2308ff66ba87SPeter Xu EventNotifier *resample, int virq,
2309ff66ba87SPeter Xu bool assign)
231092229a57SYang Zhong {
231192229a57SYang Zhong abort();
231292229a57SYang Zhong }
231392229a57SYang Zhong
231492229a57SYang Zhong int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
231592229a57SYang Zhong {
231692229a57SYang Zhong return -ENOSYS;
231792229a57SYang Zhong }
231892229a57SYang Zhong #endif /* !KVM_CAP_IRQ_ROUTING */
231992229a57SYang Zhong
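/*
 * Editor's addition - illustrative sketch, not upstream code: a typical
 * MSI irqfd lifecycle built from the helpers in this file. Error
 * handling is omitted; 'vector', 'dev' and 'notifier' are assumed to
 * exist in the caller.
 *
 *     KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
 *     int virq = kvm_irqchip_add_msi_route(&c, vector, dev);
 *     kvm_irqchip_commit_route_changes(&c);
 *     kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, notifier, NULL, virq);
 *     ...
 *     kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, notifier, virq);
 *     kvm_irqchip_release_virq(kvm_state, virq);
 */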
232092229a57SYang Zhong int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
232192229a57SYang Zhong EventNotifier *rn, int virq)
232292229a57SYang Zhong {
2323ff66ba87SPeter Xu return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
232492229a57SYang Zhong }
232592229a57SYang Zhong
232692229a57SYang Zhong int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
232792229a57SYang Zhong int virq)
232892229a57SYang Zhong {
2329ff66ba87SPeter Xu return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
233092229a57SYang Zhong }
233192229a57SYang Zhong
233292229a57SYang Zhong int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
233392229a57SYang Zhong EventNotifier *rn, qemu_irq irq)
233492229a57SYang Zhong {
233592229a57SYang Zhong gpointer key, gsi;
233692229a57SYang Zhong gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
233792229a57SYang Zhong
233892229a57SYang Zhong if (!found) {
233992229a57SYang Zhong return -ENXIO;
234092229a57SYang Zhong }
234192229a57SYang Zhong return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
234292229a57SYang Zhong }
234392229a57SYang Zhong
234492229a57SYang Zhong int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
234592229a57SYang Zhong qemu_irq irq)
234692229a57SYang Zhong {
234792229a57SYang Zhong gpointer key, gsi;
234892229a57SYang Zhong gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
234992229a57SYang Zhong
235092229a57SYang Zhong if (!found) {
235192229a57SYang Zhong return -ENXIO;
235292229a57SYang Zhong }
235392229a57SYang Zhong return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
235492229a57SYang Zhong }
235592229a57SYang Zhong
235692229a57SYang Zhong void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
235792229a57SYang Zhong {
235892229a57SYang Zhong g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
235992229a57SYang Zhong }
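
/*
 * Editor's addition - illustrative sketch: board code registers the
 * qemu_irq -> GSI mapping once, after which devices can attach irqfds
 * by qemu_irq instead of by raw GSI number:
 *
 *     kvm_irqchip_set_qemuirq_gsi(kvm_state, irq, gsi);
 *     kvm_irqchip_add_irqfd_notifier(kvm_state, notifier, NULL, irq);
 */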
236092229a57SYang Zhong
23614376c40dSPaolo Bonzini static void kvm_irqchip_create(KVMState *s)
236292229a57SYang Zhong {
236392229a57SYang Zhong int ret;
236492229a57SYang Zhong
2365d1972be1SXiaoyao Li assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
236692229a57SYang Zhong if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
236792229a57SYang Zhong ;
236892229a57SYang Zhong } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
236992229a57SYang Zhong ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
237092229a57SYang Zhong if (ret < 0) {
237192229a57SYang Zhong fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
237292229a57SYang Zhong exit(1);
237392229a57SYang Zhong }
237492229a57SYang Zhong } else {
237592229a57SYang Zhong return;
237692229a57SYang Zhong }
237792229a57SYang Zhong
2378a788260bSPaolo Bonzini if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) {
2379a788260bSPaolo Bonzini fprintf(stderr, "kvm: irqfd not implemented\n");
2380a788260bSPaolo Bonzini exit(1);
2381a788260bSPaolo Bonzini }
2382a788260bSPaolo Bonzini
238392229a57SYang Zhong /* First probe and see if there's an arch-specific hook to create the
238492229a57SYang Zhong * in-kernel irqchip for us */
23854376c40dSPaolo Bonzini ret = kvm_arch_irqchip_create(s);
238692229a57SYang Zhong if (ret == 0) {
2387d1972be1SXiaoyao Li if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
238847c182feSCornelia Huck error_report("Split IRQ chip mode not supported.");
238992229a57SYang Zhong exit(1);
239092229a57SYang Zhong } else {
239192229a57SYang Zhong ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
239292229a57SYang Zhong }
239392229a57SYang Zhong }
239492229a57SYang Zhong if (ret < 0) {
239592229a57SYang Zhong fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
239692229a57SYang Zhong exit(1);
239792229a57SYang Zhong }
239892229a57SYang Zhong
239992229a57SYang Zhong kvm_kernel_irqchip = true;
240092229a57SYang Zhong /* If we have an in-kernel IRQ chip then we must have asynchronous
240192229a57SYang Zhong * interrupt delivery (though the reverse is not necessarily true)
240292229a57SYang Zhong */
240392229a57SYang Zhong kvm_async_interrupts_allowed = true;
240492229a57SYang Zhong kvm_halt_in_kernel_allowed = true;
240592229a57SYang Zhong
240692229a57SYang Zhong kvm_init_irq_routing(s);
240792229a57SYang Zhong
240892229a57SYang Zhong s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
240992229a57SYang Zhong }
241092229a57SYang Zhong
241192229a57SYang Zhong /* Find number of supported CPUs using the recommended
241292229a57SYang Zhong * procedure from the kernel API documentation to cope with
241392229a57SYang Zhong * older kernels that may be missing capabilities.
241492229a57SYang Zhong */
241592229a57SYang Zhong static int kvm_recommended_vcpus(KVMState *s)
241692229a57SYang Zhong {
241711748ba7SGreg Kurz int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
241892229a57SYang Zhong return (ret) ? ret : 4;
241992229a57SYang Zhong }
242092229a57SYang Zhong
242192229a57SYang Zhong static int kvm_max_vcpus(KVMState *s)
242292229a57SYang Zhong {
242392229a57SYang Zhong int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
242492229a57SYang Zhong return (ret) ? ret : kvm_recommended_vcpus(s);
242592229a57SYang Zhong }
242692229a57SYang Zhong
242792229a57SYang Zhong static int kvm_max_vcpu_id(KVMState *s)
242892229a57SYang Zhong {
242992229a57SYang Zhong int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
243092229a57SYang Zhong return (ret) ? ret : kvm_max_vcpus(s);
243192229a57SYang Zhong }
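
/*
 * Editor's note: the three probes above degrade gracefully on older
 * kernels: KVM_CAP_MAX_VCPU_ID -> KVM_CAP_MAX_VCPUS ->
 * KVM_CAP_NR_VCPUS -> 4, each step falling back to the next when the
 * extension check returns 0.
 */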
243292229a57SYang Zhong
243392229a57SYang Zhong bool kvm_vcpu_id_is_valid(int vcpu_id)
243492229a57SYang Zhong {
24354f7f5893SPhilippe Mathieu-Daudé KVMState *s = KVM_STATE(current_accel());
243692229a57SYang Zhong return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
243792229a57SYang Zhong }
243892229a57SYang Zhong
24397786ae40SHyman Huang(黄勇) bool kvm_dirty_ring_enabled(void)
24407786ae40SHyman Huang(黄勇) {
2441e65152d5SMasato Imai return kvm_state && kvm_state->kvm_dirty_ring_size;
24427786ae40SHyman Huang(黄勇) }
24437786ae40SHyman Huang(黄勇)
2444467ef823SPaolo Bonzini static void query_stats_cb(StatsResultList **result, StatsTarget target,
2445cf7405bcSPaolo Bonzini strList *names, strList *targets, Error **errp);
2446cc01a3f4SMark Kanda static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2447cc01a3f4SMark Kanda
24484a06a7ccSHyman Huang(黄勇) uint32_t kvm_dirty_ring_size(void)
24494a06a7ccSHyman Huang(黄勇) {
24504a06a7ccSHyman Huang(黄勇) return kvm_state->kvm_dirty_ring_size;
24514a06a7ccSHyman Huang(黄勇) }
24524a06a7ccSHyman Huang(黄勇)
245367388078SAni Sinha static int do_kvm_create_vm(MachineState *ms, int type)
245467388078SAni Sinha {
245567388078SAni Sinha KVMState *s;
245667388078SAni Sinha int ret;
245767388078SAni Sinha
245867388078SAni Sinha s = KVM_STATE(ms->accelerator);
245967388078SAni Sinha
246067388078SAni Sinha do {
246167388078SAni Sinha ret = kvm_ioctl(s, KVM_CREATE_VM, type);
246267388078SAni Sinha } while (ret == -EINTR);
246367388078SAni Sinha
246467388078SAni Sinha if (ret < 0) {
246567388078SAni Sinha error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));
246667388078SAni Sinha
246767388078SAni Sinha #ifdef TARGET_S390X
246867388078SAni Sinha if (ret == -EINVAL) {
246967388078SAni Sinha error_printf("Host kernel setup problem detected."
247067388078SAni Sinha " Please verify:\n");
247167388078SAni Sinha error_printf("- for kernels supporting the"
247267388078SAni Sinha " switch_amode or user_mode parameters, whether");
247367388078SAni Sinha error_printf(" user space is running in primary address space\n");
247467388078SAni Sinha error_printf("- for kernels supporting the vm.allocate_pgste"
247567388078SAni Sinha " sysctl, whether it is enabled\n");
247667388078SAni Sinha }
247767388078SAni Sinha #elif defined(TARGET_PPC)
247867388078SAni Sinha if (ret == -EINVAL) {
247967388078SAni Sinha error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
248067388078SAni Sinha (type == 2) ? "pr" : "hv");
248167388078SAni Sinha }
248267388078SAni Sinha #endif
248367388078SAni Sinha }
248467388078SAni Sinha
248567388078SAni Sinha return ret;
248667388078SAni Sinha }
248767388078SAni Sinha
248867388078SAni Sinha static int find_kvm_machine_type(MachineState *ms)
248967388078SAni Sinha {
249067388078SAni Sinha MachineClass *mc = MACHINE_GET_CLASS(ms);
249167388078SAni Sinha int type;
249267388078SAni Sinha
249367388078SAni Sinha if (object_property_find(OBJECT(current_machine), "kvm-type")) {
249467388078SAni Sinha g_autofree char *kvm_type;
249567388078SAni Sinha kvm_type = object_property_get_str(OBJECT(current_machine),
249667388078SAni Sinha "kvm-type",
249767388078SAni Sinha &error_abort);
249867388078SAni Sinha type = mc->kvm_type(ms, kvm_type);
249967388078SAni Sinha } else if (mc->kvm_type) {
250067388078SAni Sinha type = mc->kvm_type(ms, NULL);
250167388078SAni Sinha } else {
250267388078SAni Sinha type = kvm_arch_get_default_type(ms);
250367388078SAni Sinha }
250467388078SAni Sinha return type;
250567388078SAni Sinha }
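
/*
 * Editor's note - illustrative, assuming a target that defines the
 * "kvm-type" machine property (e.g. ppc64 pseries):
 *
 *     qemu-system-ppc64 -machine pseries,kvm-type=HV ...
 *
 * which reaches this function via mc->kvm_type(ms, "HV").
 */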
250667388078SAni Sinha
250728ed7f97SAni Sinha static int kvm_setup_dirty_ring(KVMState *s)
250828ed7f97SAni Sinha {
250928ed7f97SAni Sinha uint64_t dirty_log_manual_caps;
251028ed7f97SAni Sinha int ret;
251128ed7f97SAni Sinha
251228ed7f97SAni Sinha /*
251328ed7f97SAni Sinha * Enable KVM dirty ring if supported, otherwise fall back to
251428ed7f97SAni Sinha * dirty logging mode
251528ed7f97SAni Sinha */
251628ed7f97SAni Sinha ret = kvm_dirty_ring_init(s);
251728ed7f97SAni Sinha if (ret < 0) {
251828ed7f97SAni Sinha return ret;
251928ed7f97SAni Sinha }
252028ed7f97SAni Sinha
252128ed7f97SAni Sinha /*
252228ed7f97SAni Sinha * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
252328ed7f97SAni Sinha * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
252428ed7f97SAni Sinha * page is wr-protected initially, which conflicts with how the kvm dirty
252528ed7f97SAni Sinha * ring is used - the dirty ring requires all pages to be wr-protected at
252628ed7f97SAni Sinha * the very beginning. Enabling this feature for dirty ring causes data corruption.
252728ed7f97SAni Sinha *
252828ed7f97SAni Sinha * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
252928ed7f97SAni Sinha * we may expect a higher stall time when starting the migration. In the
253028ed7f97SAni Sinha * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
253128ed7f97SAni Sinha * instead of clearing dirty bit, it can be a way to explicitly wr-protect
253228ed7f97SAni Sinha * guest pages.
253328ed7f97SAni Sinha */
253428ed7f97SAni Sinha if (!s->kvm_dirty_ring_size) {
253528ed7f97SAni Sinha dirty_log_manual_caps =
253628ed7f97SAni Sinha kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
253728ed7f97SAni Sinha dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
253828ed7f97SAni Sinha KVM_DIRTY_LOG_INITIALLY_SET);
253928ed7f97SAni Sinha s->manual_dirty_log_protect = dirty_log_manual_caps;
254028ed7f97SAni Sinha if (dirty_log_manual_caps) {
254128ed7f97SAni Sinha ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
254228ed7f97SAni Sinha dirty_log_manual_caps);
254328ed7f97SAni Sinha if (ret) {
254428ed7f97SAni Sinha warn_report("Failed to enable capability flags %"PRIu64" of "
254528ed7f97SAni Sinha "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. "
254628ed7f97SAni Sinha "Falling back to the legacy mode.",
254728ed7f97SAni Sinha dirty_log_manual_caps);
254828ed7f97SAni Sinha s->manual_dirty_log_protect = 0;
254928ed7f97SAni Sinha }
255028ed7f97SAni Sinha }
255128ed7f97SAni Sinha }
255228ed7f97SAni Sinha
255328ed7f97SAni Sinha return 0;
255428ed7f97SAni Sinha }
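
/*
 * Editor's note - illustrative command line for the dirty ring path
 * above (dirty-ring-size is the per-vCPU ring size in entries and is
 * expected to be a power of two):
 *
 *     qemu-system-x86_64 -accel kvm,dirty-ring-size=4096 ...
 *
 * With dirty-ring-size left at 0, the legacy dirty-logging path (plus
 * optional KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2) is used instead.
 */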
255528ed7f97SAni Sinha
255692229a57SYang Zhong static int kvm_init(MachineState *ms)
255792229a57SYang Zhong {
255892229a57SYang Zhong MachineClass *mc = MACHINE_GET_CLASS(ms);
255992229a57SYang Zhong static const char upgrade_note[] =
256092229a57SYang Zhong "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
256192229a57SYang Zhong "(see http://sourceforge.net/projects/kvm).\n";
25622459d420SPhilippe Mathieu-Daudé const struct {
256392229a57SYang Zhong const char *name;
256492229a57SYang Zhong int num;
256592229a57SYang Zhong } num_cpus[] = {
25665cc8767dSLike Xu { "SMP", ms->smp.cpus },
25675cc8767dSLike Xu { "hotpluggable", ms->smp.max_cpus },
25682459d420SPhilippe Mathieu-Daudé { /* end of list */ }
256992229a57SYang Zhong }, *nc = num_cpus;
257092229a57SYang Zhong int soft_vcpus_limit, hard_vcpus_limit;
257192229a57SYang Zhong KVMState *s;
257292229a57SYang Zhong const KVMCapabilityInfo *missing_cap;
257392229a57SYang Zhong int ret;
25745e0d6590SAkihiko Odaki int type;
257592229a57SYang Zhong
2576a2f77862SPeter Xu qemu_mutex_init(&kml_slots_lock);
2577a2f77862SPeter Xu
257892229a57SYang Zhong s = KVM_STATE(ms->accelerator);
257992229a57SYang Zhong
258092229a57SYang Zhong /*
258192229a57SYang Zhong * On systems where the kernel can support different base page
258292229a57SYang Zhong * sizes, host page size may be different from TARGET_PAGE_SIZE,
258392229a57SYang Zhong * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum
258492229a57SYang Zhong * page size for the system though.
258592229a57SYang Zhong */
25868e3b0cbbSMarc-André Lureau assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
258792229a57SYang Zhong
258892229a57SYang Zhong s->sigmask_len = 8;
2589a27dd2deSEmanuele Giuseppe Esposito accel_blocker_init();
259092229a57SYang Zhong
25911e1e4879SPaolo Bonzini #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
259292229a57SYang Zhong QTAILQ_INIT(&s->kvm_sw_breakpoints);
259392229a57SYang Zhong #endif
259492229a57SYang Zhong QLIST_INIT(&s->kvm_parked_vcpus);
2595aef158b0SDaan De Meyer s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
259692229a57SYang Zhong if (s->fd == -1) {
2597804dfbe3SAni Sinha error_report("Could not access KVM kernel module: %m");
259892229a57SYang Zhong ret = -errno;
259992229a57SYang Zhong goto err;
260092229a57SYang Zhong }
260192229a57SYang Zhong
260292229a57SYang Zhong ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
260392229a57SYang Zhong if (ret < KVM_API_VERSION) {
260492229a57SYang Zhong if (ret >= 0) {
260592229a57SYang Zhong ret = -EINVAL;
260692229a57SYang Zhong }
2607804dfbe3SAni Sinha error_report("kvm version too old");
260892229a57SYang Zhong goto err;
260992229a57SYang Zhong }
261092229a57SYang Zhong
261192229a57SYang Zhong if (ret > KVM_API_VERSION) {
261292229a57SYang Zhong ret = -EINVAL;
2613804dfbe3SAni Sinha error_report("kvm version not supported");
261492229a57SYang Zhong goto err;
261592229a57SYang Zhong }
261692229a57SYang Zhong
261792229a57SYang Zhong kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2618943c7428SPeter Xu s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
261992229a57SYang Zhong
262092229a57SYang Zhong /* If unspecified, use the default value */
2621943c7428SPeter Xu if (!s->nr_slots_max) {
2622b34a908cSPeter Xu s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
262392229a57SYang Zhong }
262492229a57SYang Zhong
262567388078SAni Sinha type = find_kvm_machine_type(ms);
2626bc3e41a0SAkihiko Odaki if (type < 0) {
2627bc3e41a0SAkihiko Odaki ret = -EINVAL;
2628bc3e41a0SAkihiko Odaki goto err;
2629bc3e41a0SAkihiko Odaki }
2630bc3e41a0SAkihiko Odaki
263167388078SAni Sinha ret = do_kvm_create_vm(ms, type);
263292229a57SYang Zhong if (ret < 0) {
263392229a57SYang Zhong goto err;
263492229a57SYang Zhong }
263592229a57SYang Zhong
263692229a57SYang Zhong s->vmfd = ret;
263711748ba7SGreg Kurz
263860de433dSPaolo Bonzini s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
263960de433dSPaolo Bonzini if (s->nr_as <= 1) {
264060de433dSPaolo Bonzini s->nr_as = 1;
264160de433dSPaolo Bonzini }
264260de433dSPaolo Bonzini s->as = g_new0(struct KVMAs, s->nr_as);
264360de433dSPaolo Bonzini
264411748ba7SGreg Kurz /* check the vcpu limits */
264511748ba7SGreg Kurz soft_vcpus_limit = kvm_recommended_vcpus(s);
264611748ba7SGreg Kurz hard_vcpus_limit = kvm_max_vcpus(s);
264711748ba7SGreg Kurz
264811748ba7SGreg Kurz while (nc->name) {
264911748ba7SGreg Kurz if (nc->num > soft_vcpus_limit) {
265011748ba7SGreg Kurz warn_report("Number of %s cpus requested (%d) exceeds "
265111748ba7SGreg Kurz "the recommended cpus supported by KVM (%d)",
265211748ba7SGreg Kurz nc->name, nc->num, soft_vcpus_limit);
265311748ba7SGreg Kurz
265411748ba7SGreg Kurz if (nc->num > hard_vcpus_limit) {
2655804dfbe3SAni Sinha error_report("Number of %s cpus requested (%d) exceeds "
2656804dfbe3SAni Sinha "the maximum cpus supported by KVM (%d)",
265711748ba7SGreg Kurz nc->name, nc->num, hard_vcpus_limit);
265811748ba7SGreg Kurz exit(1);
265911748ba7SGreg Kurz }
266011748ba7SGreg Kurz }
266111748ba7SGreg Kurz nc++;
266211748ba7SGreg Kurz }
266311748ba7SGreg Kurz
266492229a57SYang Zhong missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
266592229a57SYang Zhong if (!missing_cap) {
266692229a57SYang Zhong missing_cap =
266792229a57SYang Zhong kvm_check_extension_list(s, kvm_arch_required_capabilities);
266892229a57SYang Zhong }
266992229a57SYang Zhong if (missing_cap) {
267092229a57SYang Zhong ret = -EINVAL;
2671804dfbe3SAni Sinha error_report("kvm does not support %s", missing_cap->name);
2672804dfbe3SAni Sinha error_printf("%s", upgrade_note);
267392229a57SYang Zhong goto err;
267492229a57SYang Zhong }
267592229a57SYang Zhong
267692229a57SYang Zhong s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2677e6d34aeeSPeng Hao s->coalesced_pio = s->coalesced_mmio &&
2678e6d34aeeSPeng Hao kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
267992229a57SYang Zhong
268028ed7f97SAni Sinha ret = kvm_setup_dirty_ring(s);
26813794cb94SGavin Shan if (ret < 0) {
2682b4420f19SPeter Xu goto err;
2683b4420f19SPeter Xu }
2684b4420f19SPeter Xu
268592229a57SYang Zhong #ifdef KVM_CAP_VCPU_EVENTS
268692229a57SYang Zhong s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
268792229a57SYang Zhong #endif
2688ebbfef2fSLiran Alon s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2689ebbfef2fSLiran Alon
269092229a57SYang Zhong s->irq_set_ioctl = KVM_IRQ_LINE;
269192229a57SYang Zhong if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
269292229a57SYang Zhong s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
269392229a57SYang Zhong }
269492229a57SYang Zhong
269592229a57SYang Zhong kvm_readonly_mem_allowed =
269664e0e63eSTom Dohrmann (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
269792229a57SYang Zhong
269892229a57SYang Zhong kvm_resamplefds_allowed =
269992229a57SYang Zhong (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
270092229a57SYang Zhong
270192229a57SYang Zhong kvm_vm_attributes_allowed =
270292229a57SYang Zhong (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
270392229a57SYang Zhong
27041e1e4879SPaolo Bonzini #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
270512bc5b4cSMaxim Levitsky kvm_has_guest_debug =
270612bc5b4cSMaxim Levitsky (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
270712bc5b4cSMaxim Levitsky #endif
270812bc5b4cSMaxim Levitsky
270912bc5b4cSMaxim Levitsky kvm_sstep_flags = 0;
271012bc5b4cSMaxim Levitsky if (kvm_has_guest_debug) {
271112bc5b4cSMaxim Levitsky kvm_sstep_flags = SSTEP_ENABLE;
2712fd2ddd16SMaxim Levitsky
27131e1e4879SPaolo Bonzini #if defined TARGET_KVM_HAVE_GUEST_DEBUG
2714fd2ddd16SMaxim Levitsky int guest_debug_flags =
2715fd2ddd16SMaxim Levitsky kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2716fd2ddd16SMaxim Levitsky
2717fd2ddd16SMaxim Levitsky if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2718fd2ddd16SMaxim Levitsky kvm_sstep_flags |= SSTEP_NOIRQ;
2719fd2ddd16SMaxim Levitsky }
2720fd2ddd16SMaxim Levitsky #endif
272112bc5b4cSMaxim Levitsky }
272212bc5b4cSMaxim Levitsky
272392229a57SYang Zhong kvm_state = s;
272492229a57SYang Zhong
272592229a57SYang Zhong ret = kvm_arch_init(ms, s);
272692229a57SYang Zhong if (ret < 0) {
272792229a57SYang Zhong goto err;
272892229a57SYang Zhong }
272992229a57SYang Zhong
2730586d708cSPaolo Bonzini kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
2731586d708cSPaolo Bonzini kvm_guest_memfd_supported =
2732586d708cSPaolo Bonzini kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
2733586d708cSPaolo Bonzini kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
2734586d708cSPaolo Bonzini (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
2735586d708cSPaolo Bonzini
2736d1972be1SXiaoyao Li if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2737d1972be1SXiaoyao Li s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2738d1972be1SXiaoyao Li }
2739d1972be1SXiaoyao Li
27406b552b9bSDongjiu Geng qemu_register_reset(kvm_unpoison_all, NULL);
2741*2dc65296SMaciej S. Szmigiero qemu_register_reset(kvm_reset_parked_vcpus, s);
27426b552b9bSDongjiu Geng
274311bc4a13SPaolo Bonzini if (s->kernel_irqchip_allowed) {
27444376c40dSPaolo Bonzini kvm_irqchip_create(s);
274592229a57SYang Zhong }
274692229a57SYang Zhong
274792229a57SYang Zhong s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
274892229a57SYang Zhong s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2749e6d34aeeSPeng Hao s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2750e6d34aeeSPeng Hao s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
275192229a57SYang Zhong
275292229a57SYang Zhong kvm_memory_listener_register(s, &s->memory_listener,
2753142518bdSPeter Xu &address_space_memory, 0, "kvm-memory");
275492229a57SYang Zhong memory_listener_register(&kvm_io_listener,
275592229a57SYang Zhong &address_space_io);
275692229a57SYang Zhong
275762dd4edaSGreg Kurz s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2758f5948942SAlex Williamson if (!s->sync_mmu) {
2759956b109fSDavid Hildenbrand ret = ram_block_discard_disable(true);
2760956b109fSDavid Hildenbrand assert(!ret);
2761f5948942SAlex Williamson }
2762b4420f19SPeter Xu
2763b4420f19SPeter Xu if (s->kvm_dirty_ring_size) {
276443a5e377SAkihiko Odaki kvm_dirty_ring_reaper_init(s);
2765b4420f19SPeter Xu }
2766b4420f19SPeter Xu
2767cc01a3f4SMark Kanda if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2768068cc51dSPaolo Bonzini add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2769068cc51dSPaolo Bonzini query_stats_schemas_cb);
2770cc01a3f4SMark Kanda }
2771cc01a3f4SMark Kanda
277292229a57SYang Zhong return 0;
277392229a57SYang Zhong
277492229a57SYang Zhong err:
277592229a57SYang Zhong assert(ret < 0);
277692229a57SYang Zhong if (s->vmfd >= 0) {
277792229a57SYang Zhong close(s->vmfd);
277892229a57SYang Zhong }
277992229a57SYang Zhong if (s->fd != -1) {
278092229a57SYang Zhong close(s->fd);
278192229a57SYang Zhong }
27824625742cSAkihiko Odaki g_free(s->as);
278392229a57SYang Zhong g_free(s->memory_listener.slots);
278492229a57SYang Zhong
278592229a57SYang Zhong return ret;
278692229a57SYang Zhong }
278792229a57SYang Zhong
278892229a57SYang Zhong void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
278992229a57SYang Zhong {
279092229a57SYang Zhong s->sigmask_len = sigmask_len;
279192229a57SYang Zhong }
279292229a57SYang Zhong
279392229a57SYang Zhong static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
279492229a57SYang Zhong int size, uint32_t count)
279592229a57SYang Zhong {
279692229a57SYang Zhong int i;
279792229a57SYang Zhong uint8_t *ptr = data;
279892229a57SYang Zhong
279992229a57SYang Zhong for (i = 0; i < count; i++) {
280092229a57SYang Zhong address_space_rw(&address_space_io, port, attrs,
280192229a57SYang Zhong ptr, size,
280292229a57SYang Zhong direction == KVM_EXIT_IO_OUT);
280392229a57SYang Zhong ptr += size;
280492229a57SYang Zhong }
280592229a57SYang Zhong }
280692229a57SYang Zhong
280792229a57SYang Zhong static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
280892229a57SYang Zhong {
2809aacec9aeSPaolo Bonzini int i;
2810aacec9aeSPaolo Bonzini
281192229a57SYang Zhong fprintf(stderr, "KVM internal error. Suberror: %d\n",
281292229a57SYang Zhong run->internal.suberror);
281392229a57SYang Zhong
281492229a57SYang Zhong for (i = 0; i < run->internal.ndata; ++i) {
281556567da3SDavid Edmondson fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
281692229a57SYang Zhong i, (uint64_t)run->internal.data[i]);
281792229a57SYang Zhong }
281892229a57SYang Zhong if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
281992229a57SYang Zhong fprintf(stderr, "emulation failure\n");
282092229a57SYang Zhong if (!kvm_arch_stop_on_emulation_error(cpu)) {
282190c84c56SMarkus Armbruster cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
282292229a57SYang Zhong return EXCP_INTERRUPT;
282392229a57SYang Zhong }
282492229a57SYang Zhong }
282592229a57SYang Zhong /* FIXME: Should trigger a QMP message to let management know
282692229a57SYang Zhong * something went wrong.
282792229a57SYang Zhong */
282892229a57SYang Zhong return -1;
282992229a57SYang Zhong }
283092229a57SYang Zhong
283192229a57SYang Zhong void kvm_flush_coalesced_mmio_buffer(void)
283292229a57SYang Zhong {
283392229a57SYang Zhong KVMState *s = kvm_state;
283492229a57SYang Zhong
2835fe6bda58SGavin Shan if (!s || s->coalesced_flush_in_progress) {
283692229a57SYang Zhong return;
283792229a57SYang Zhong }
283892229a57SYang Zhong
283992229a57SYang Zhong s->coalesced_flush_in_progress = true;
284092229a57SYang Zhong
284192229a57SYang Zhong if (s->coalesced_mmio_ring) {
284292229a57SYang Zhong struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
284392229a57SYang Zhong while (ring->first != ring->last) {
284492229a57SYang Zhong struct kvm_coalesced_mmio *ent;
284592229a57SYang Zhong
284692229a57SYang Zhong ent = &ring->coalesced_mmio[ring->first];
284792229a57SYang Zhong
2848e6d34aeeSPeng Hao if (ent->pio == 1) {
284919f70347SPeter Maydell address_space_write(&address_space_io, ent->phys_addr,
2850e6d34aeeSPeng Hao MEMTXATTRS_UNSPECIFIED, ent->data,
285119f70347SPeter Maydell ent->len);
2852e6d34aeeSPeng Hao } else {
285392229a57SYang Zhong cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2854e6d34aeeSPeng Hao }
285592229a57SYang Zhong smp_wmb();
285692229a57SYang Zhong ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
285792229a57SYang Zhong }
285892229a57SYang Zhong }
285992229a57SYang Zhong
286092229a57SYang Zhong s->coalesced_flush_in_progress = false;
286192229a57SYang Zhong }
286292229a57SYang Zhong
286392229a57SYang Zhong static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
286492229a57SYang Zhong {
28655c3131c3SPaolo Bonzini if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2866a1676bb3SJulia Suvorova Error *err = NULL;
2867a1676bb3SJulia Suvorova int ret = kvm_arch_get_registers(cpu, &err);
28687191f24cSAkihiko Odaki if (ret) {
2869a1676bb3SJulia Suvorova if (err) {
2870a1676bb3SJulia Suvorova error_reportf_err(err, "Failed to synchronize CPU state: ");
2871a1676bb3SJulia Suvorova } else {
28727191f24cSAkihiko Odaki error_report("Failed to get registers: %s", strerror(-ret));
2873a1676bb3SJulia Suvorova }
2874a1676bb3SJulia Suvorova
28757191f24cSAkihiko Odaki cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
28767191f24cSAkihiko Odaki vm_stop(RUN_STATE_INTERNAL_ERROR);
28777191f24cSAkihiko Odaki }
28787191f24cSAkihiko Odaki
287999f31832SSergio Andres Gomez Del Real cpu->vcpu_dirty = true;
288092229a57SYang Zhong }
288192229a57SYang Zhong }
288292229a57SYang Zhong
288392229a57SYang Zhong void kvm_cpu_synchronize_state(CPUState *cpu)
288492229a57SYang Zhong {
28855c3131c3SPaolo Bonzini if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
288692229a57SYang Zhong run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
288792229a57SYang Zhong }
288892229a57SYang Zhong }
288992229a57SYang Zhong
289092229a57SYang Zhong static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
289192229a57SYang Zhong {
2892a1676bb3SJulia Suvorova Error *err = NULL;
2893a1676bb3SJulia Suvorova int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
28947191f24cSAkihiko Odaki if (ret) {
2895a1676bb3SJulia Suvorova if (err) {
2896a1676bb3SJulia Suvorova error_reportf_err(err, "Restoring registers after reset: ");
2897a1676bb3SJulia Suvorova } else {
2898a1676bb3SJulia Suvorova error_report("Failed to put registers after reset: %s",
2899a1676bb3SJulia Suvorova strerror(-ret));
2900a1676bb3SJulia Suvorova }
29017191f24cSAkihiko Odaki cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
29027191f24cSAkihiko Odaki vm_stop(RUN_STATE_INTERNAL_ERROR);
29037191f24cSAkihiko Odaki }
29047191f24cSAkihiko Odaki
290599f31832SSergio Andres Gomez Del Real cpu->vcpu_dirty = false;
290692229a57SYang Zhong }
290792229a57SYang Zhong
290892229a57SYang Zhong void kvm_cpu_synchronize_post_reset(CPUState *cpu)
290992229a57SYang Zhong {
291092229a57SYang Zhong run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
291192229a57SYang Zhong }
291292229a57SYang Zhong
291392229a57SYang Zhong static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
291492229a57SYang Zhong {
2915a1676bb3SJulia Suvorova Error *err = NULL;
2916a1676bb3SJulia Suvorova int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);
29177191f24cSAkihiko Odaki if (ret) {
2918a1676bb3SJulia Suvorova if (err) {
2919a1676bb3SJulia Suvorova error_reportf_err(err, "Putting registers after init: ");
2920a1676bb3SJulia Suvorova } else {
2921a1676bb3SJulia Suvorova error_report("Failed to put registers after init: %s",
2922a1676bb3SJulia Suvorova strerror(-ret));
2923a1676bb3SJulia Suvorova }
29247191f24cSAkihiko Odaki exit(1);
29257191f24cSAkihiko Odaki }
29267191f24cSAkihiko Odaki
292799f31832SSergio Andres Gomez Del Real cpu->vcpu_dirty = false;
292892229a57SYang Zhong }
292992229a57SYang Zhong
293092229a57SYang Zhong void kvm_cpu_synchronize_post_init(CPUState *cpu)
293192229a57SYang Zhong {
29325c3131c3SPaolo Bonzini if (!kvm_state->guest_state_protected) {
29335c3131c3SPaolo Bonzini /*
29345c3131c3SPaolo Bonzini * This runs before the machine_init_done notifiers, and is the last
29355c3131c3SPaolo Bonzini * opportunity to synchronize the state of confidential guests.
29365c3131c3SPaolo Bonzini */
293792229a57SYang Zhong run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
293892229a57SYang Zhong }
29395c3131c3SPaolo Bonzini }
294092229a57SYang Zhong
294192229a57SYang Zhong static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
294292229a57SYang Zhong {
294399f31832SSergio Andres Gomez Del Real cpu->vcpu_dirty = true;
294492229a57SYang Zhong }
294592229a57SYang Zhong
294692229a57SYang Zhong void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
294792229a57SYang Zhong {
294892229a57SYang Zhong run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
294992229a57SYang Zhong }
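
/*
 * Editor's summary of the vcpu_dirty protocol used by the helpers
 * above:
 *  - kvm_cpu_synchronize_state() pulls registers out of KVM and marks
 *    the QEMU copy dirty (authoritative).
 *  - ..._post_reset() and ..._post_init() push registers into KVM and
 *    mark the QEMU copy clean.
 *  - ..._pre_loadvm() only marks the QEMU copy dirty, so the freshly
 *    loaded state is pushed before the next KVM_RUN in kvm_cpu_exec().
 */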
295092229a57SYang Zhong
295192229a57SYang Zhong #ifdef KVM_HAVE_MCE_INJECTION
295292229a57SYang Zhong static __thread void *pending_sigbus_addr;
295392229a57SYang Zhong static __thread int pending_sigbus_code;
295492229a57SYang Zhong static __thread bool have_sigbus_pending;
295592229a57SYang Zhong #endif
295692229a57SYang Zhong
295792229a57SYang Zhong static void kvm_cpu_kick(CPUState *cpu)
295892229a57SYang Zhong {
2959d73415a3SStefan Hajnoczi qatomic_set(&cpu->kvm_run->immediate_exit, 1);
296092229a57SYang Zhong }
296192229a57SYang Zhong
296292229a57SYang Zhong static void kvm_cpu_kick_self(void)
296392229a57SYang Zhong {
296492229a57SYang Zhong if (kvm_immediate_exit) {
296592229a57SYang Zhong kvm_cpu_kick(current_cpu);
296692229a57SYang Zhong } else {
296792229a57SYang Zhong qemu_cpu_kick_self();
296892229a57SYang Zhong }
296992229a57SYang Zhong }
297092229a57SYang Zhong
297192229a57SYang Zhong static void kvm_eat_signals(CPUState *cpu)
297292229a57SYang Zhong {
297392229a57SYang Zhong struct timespec ts = { 0, 0 };
297492229a57SYang Zhong siginfo_t siginfo;
297592229a57SYang Zhong sigset_t waitset;
297692229a57SYang Zhong sigset_t chkset;
297792229a57SYang Zhong int r;
297892229a57SYang Zhong
297992229a57SYang Zhong if (kvm_immediate_exit) {
2980d73415a3SStefan Hajnoczi qatomic_set(&cpu->kvm_run->immediate_exit, 0);
298192229a57SYang Zhong /* Write kvm_run->immediate_exit before the cpu->exit_request
298292229a57SYang Zhong * write in kvm_cpu_exec.
298392229a57SYang Zhong */
298492229a57SYang Zhong smp_wmb();
298592229a57SYang Zhong return;
298692229a57SYang Zhong }
298792229a57SYang Zhong
298892229a57SYang Zhong sigemptyset(&waitset);
298992229a57SYang Zhong sigaddset(&waitset, SIG_IPI);
299092229a57SYang Zhong
299192229a57SYang Zhong do {
299292229a57SYang Zhong r = sigtimedwait(&waitset, &siginfo, &ts);
299392229a57SYang Zhong if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
299492229a57SYang Zhong perror("sigtimedwait");
299592229a57SYang Zhong exit(1);
299692229a57SYang Zhong }
299792229a57SYang Zhong
299892229a57SYang Zhong r = sigpending(&chkset);
299992229a57SYang Zhong if (r == -1) {
300092229a57SYang Zhong perror("sigpending");
300192229a57SYang Zhong exit(1);
300292229a57SYang Zhong }
300392229a57SYang Zhong } while (sigismember(&chkset, SIG_IPI));
300492229a57SYang Zhong }
300592229a57SYang Zhong
3006c15e5684SChao Peng int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
3007c15e5684SChao Peng {
3008c15e5684SChao Peng MemoryRegionSection section;
3009c15e5684SChao Peng ram_addr_t offset;
3010c15e5684SChao Peng MemoryRegion *mr;
3011c15e5684SChao Peng RAMBlock *rb;
3012c15e5684SChao Peng void *addr;
3013c15e5684SChao Peng int ret = -1;
3014c15e5684SChao Peng
3015c15e5684SChao Peng trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
3016c15e5684SChao Peng
3017c15e5684SChao Peng if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
3018c15e5684SChao Peng !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
3019c15e5684SChao Peng return -1;
3020c15e5684SChao Peng }
3021c15e5684SChao Peng
3022c15e5684SChao Peng if (!size) {
3023c15e5684SChao Peng return -1;
3024c15e5684SChao Peng }
3025c15e5684SChao Peng
3026c15e5684SChao Peng section = memory_region_find(get_system_memory(), start, size);
3027c15e5684SChao Peng mr = section.mr;
3028c15e5684SChao Peng if (!mr) {
3029565f4768SIsaku Yamahata /*
3030565f4768SIsaku Yamahata * Ignore converting non-assigned region to shared.
3031565f4768SIsaku Yamahata *
3032565f4768SIsaku Yamahata * TDX requires vMMIO region to be shared to inject #VE to guest.
3033565f4768SIsaku Yamahata * OVMF conservatively issues MapGPA(shared) on the 32bit PCI MMIO
3034565f4768SIsaku Yamahata * region, and the vIO-APIC 0xFEC00000 4K page.
3035565f4768SIsaku Yamahata * OVMF assigns the 32bit PCI MMIO region to
3036565f4768SIsaku Yamahata * [top of low memory: typically 2GB=0x80000000, 0xFC000000)
3037565f4768SIsaku Yamahata */
3038565f4768SIsaku Yamahata if (!to_private) {
3039565f4768SIsaku Yamahata return 0;
3040565f4768SIsaku Yamahata }
3041c15e5684SChao Peng return -1;
3042c15e5684SChao Peng }
3043c15e5684SChao Peng
3044c15e5684SChao Peng if (!memory_region_has_guest_memfd(mr)) {
3045c5d9425eSIsaku Yamahata /*
3046c5d9425eSIsaku Yamahata * Because vMMIO region must be shared, guest TD may convert vMMIO
3047c5d9425eSIsaku Yamahata * region to shared explicitly. Don't complain about such a case. See
3048c5d9425eSIsaku Yamahata * memory_region_type() for checking if the region is MMIO region.
3049c5d9425eSIsaku Yamahata */
3050c5d9425eSIsaku Yamahata if (!to_private &&
3051c5d9425eSIsaku Yamahata !memory_region_is_ram(mr) &&
3052c5d9425eSIsaku Yamahata !memory_region_is_ram_device(mr) &&
3053c5d9425eSIsaku Yamahata !memory_region_is_rom(mr) &&
3054c5d9425eSIsaku Yamahata !memory_region_is_romd(mr)) {
3055c5d9425eSIsaku Yamahata ret = 0;
3056c5d9425eSIsaku Yamahata } else {
3057c5d9425eSIsaku Yamahata error_report("Convert non guest_memfd backed memory region "
3058c15e5684SChao Peng "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
3059c15e5684SChao Peng start, size, to_private ? "private" : "shared");
3060c5d9425eSIsaku Yamahata }
3061c15e5684SChao Peng goto out_unref;
3062c15e5684SChao Peng }
3063c15e5684SChao Peng
3064c15e5684SChao Peng if (to_private) {
3065c15e5684SChao Peng ret = kvm_set_memory_attributes_private(start, size);
3066c15e5684SChao Peng } else {
3067c15e5684SChao Peng ret = kvm_set_memory_attributes_shared(start, size);
3068c15e5684SChao Peng }
3069c15e5684SChao Peng if (ret) {
3070c15e5684SChao Peng goto out_unref;
3071c15e5684SChao Peng }
3072c15e5684SChao Peng
3073c15e5684SChao Peng addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
3074c15e5684SChao Peng rb = qemu_ram_block_from_host(addr, false, &offset);
3075c15e5684SChao Peng
3076c15e5684SChao Peng if (to_private) {
3077c15e5684SChao Peng if (rb->page_size != qemu_real_host_page_size()) {
3078c15e5684SChao Peng /*
3079c15e5684SChao Peng * shared memory is backed by hugetlb, which is supposed to be
3080c15e5684SChao Peng * pre-allocated and doesn't need to be discarded
3081c15e5684SChao Peng */
3082c15e5684SChao Peng goto out_unref;
3083c15e5684SChao Peng }
3084c15e5684SChao Peng ret = ram_block_discard_range(rb, offset, size);
3085c15e5684SChao Peng } else {
3086c15e5684SChao Peng ret = ram_block_discard_guest_memfd_range(rb, offset, size);
3087c15e5684SChao Peng }
3088c15e5684SChao Peng
3089c15e5684SChao Peng out_unref:
3090c15e5684SChao Peng memory_region_unref(mr);
3091c15e5684SChao Peng return ret;
3092c15e5684SChao Peng }
3093c15e5684SChao Peng
309492229a57SYang Zhong int kvm_cpu_exec(CPUState *cpu)
309592229a57SYang Zhong {
309692229a57SYang Zhong struct kvm_run *run = cpu->kvm_run;
309792229a57SYang Zhong int ret, run_ret;
309892229a57SYang Zhong
30999cdfb1e3SJai Arora trace_kvm_cpu_exec();
310092229a57SYang Zhong
310192229a57SYang Zhong if (kvm_arch_process_async_events(cpu)) {
3102d73415a3SStefan Hajnoczi qatomic_set(&cpu->exit_request, 0);
310392229a57SYang Zhong return EXCP_HLT;
310492229a57SYang Zhong }
310592229a57SYang Zhong
3106195801d7SStefan Hajnoczi bql_unlock();
310792229a57SYang Zhong cpu_exec_start(cpu);
310892229a57SYang Zhong
310992229a57SYang Zhong do {
311092229a57SYang Zhong MemTxAttrs attrs;
311192229a57SYang Zhong
311299f31832SSergio Andres Gomez Del Real if (cpu->vcpu_dirty) {
3113a1676bb3SJulia Suvorova Error *err = NULL;
3114a1676bb3SJulia Suvorova ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
31157191f24cSAkihiko Odaki if (ret) {
3116a1676bb3SJulia Suvorova if (err) {
3117a1676bb3SJulia Suvorova error_reportf_err(err, "Putting registers before KVM_RUN: ");
3118a1676bb3SJulia Suvorova } else {
31197191f24cSAkihiko Odaki error_report("Failed to put registers before KVM_RUN: %s",
31207191f24cSAkihiko Odaki strerror(-ret));
3121a1676bb3SJulia Suvorova }
31227191f24cSAkihiko Odaki ret = -1;
31237191f24cSAkihiko Odaki break;
31247191f24cSAkihiko Odaki }
31257191f24cSAkihiko Odaki
312699f31832SSergio Andres Gomez Del Real cpu->vcpu_dirty = false;
312792229a57SYang Zhong }
312892229a57SYang Zhong
312992229a57SYang Zhong kvm_arch_pre_run(cpu, run);
3130d73415a3SStefan Hajnoczi if (qatomic_read(&cpu->exit_request)) {
31319cdfb1e3SJai Arora trace_kvm_interrupt_exit_request();
313292229a57SYang Zhong /*
313392229a57SYang Zhong * KVM requires us to reenter the kernel after IO exits to complete
313492229a57SYang Zhong * instruction emulation. This self-signal will ensure that we
313592229a57SYang Zhong * leave ASAP again.
313692229a57SYang Zhong */
313792229a57SYang Zhong kvm_cpu_kick_self();
313892229a57SYang Zhong }
313992229a57SYang Zhong
314092229a57SYang Zhong /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
314192229a57SYang Zhong * Matching barrier in kvm_eat_signals.
314292229a57SYang Zhong */
314392229a57SYang Zhong smp_rmb();
314492229a57SYang Zhong
314592229a57SYang Zhong run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
314692229a57SYang Zhong
314792229a57SYang Zhong attrs = kvm_arch_post_run(cpu, run);
314892229a57SYang Zhong
314992229a57SYang Zhong #ifdef KVM_HAVE_MCE_INJECTION
315092229a57SYang Zhong if (unlikely(have_sigbus_pending)) {
3151195801d7SStefan Hajnoczi bql_lock();
315292229a57SYang Zhong kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
315392229a57SYang Zhong pending_sigbus_addr);
315492229a57SYang Zhong have_sigbus_pending = false;
3155195801d7SStefan Hajnoczi bql_unlock();
315692229a57SYang Zhong }
315792229a57SYang Zhong #endif
315892229a57SYang Zhong
315992229a57SYang Zhong if (run_ret < 0) {
316092229a57SYang Zhong if (run_ret == -EINTR || run_ret == -EAGAIN) {
31619cdfb1e3SJai Arora trace_kvm_io_window_exit();
316292229a57SYang Zhong kvm_eat_signals(cpu);
316392229a57SYang Zhong ret = EXCP_INTERRUPT;
316492229a57SYang Zhong break;
316592229a57SYang Zhong }
3166c15e5684SChao Peng if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
316792229a57SYang Zhong fprintf(stderr, "error: kvm run failed %s\n",
316892229a57SYang Zhong strerror(-run_ret));
316992229a57SYang Zhong #ifdef TARGET_PPC
317092229a57SYang Zhong if (run_ret == -EBUSY) {
317192229a57SYang Zhong fprintf(stderr,
317292229a57SYang Zhong "This is probably because your SMT is enabled.\n"
317392229a57SYang Zhong "VCPUs can only run on primary threads with all "
317492229a57SYang Zhong "secondary threads offline.\n");
317592229a57SYang Zhong }
317692229a57SYang Zhong #endif
317792229a57SYang Zhong ret = -1;
317892229a57SYang Zhong break;
317992229a57SYang Zhong }
3180c15e5684SChao Peng }
318192229a57SYang Zhong
318292229a57SYang Zhong trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
318392229a57SYang Zhong switch (run->exit_reason) {
318492229a57SYang Zhong case KVM_EXIT_IO:
318592229a57SYang Zhong /* Called outside BQL */
318692229a57SYang Zhong kvm_handle_io(run->io.port, attrs,
318792229a57SYang Zhong (uint8_t *)run + run->io.data_offset,
318892229a57SYang Zhong run->io.direction,
318992229a57SYang Zhong run->io.size,
319092229a57SYang Zhong run->io.count);
319192229a57SYang Zhong ret = 0;
319292229a57SYang Zhong break;
319392229a57SYang Zhong case KVM_EXIT_MMIO:
319492229a57SYang Zhong /* Called outside BQL */
319592229a57SYang Zhong address_space_rw(&address_space_memory,
319692229a57SYang Zhong run->mmio.phys_addr, attrs,
319792229a57SYang Zhong run->mmio.data,
319892229a57SYang Zhong run->mmio.len,
319992229a57SYang Zhong run->mmio.is_write);
320092229a57SYang Zhong ret = 0;
320192229a57SYang Zhong break;
320292229a57SYang Zhong case KVM_EXIT_IRQ_WINDOW_OPEN:
320392229a57SYang Zhong ret = EXCP_INTERRUPT;
320492229a57SYang Zhong break;
320592229a57SYang Zhong case KVM_EXIT_SHUTDOWN:
320692229a57SYang Zhong qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
320792229a57SYang Zhong ret = EXCP_INTERRUPT;
320892229a57SYang Zhong break;
320992229a57SYang Zhong case KVM_EXIT_UNKNOWN:
321092229a57SYang Zhong fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
321192229a57SYang Zhong (uint64_t)run->hw.hardware_exit_reason);
321292229a57SYang Zhong ret = -1;
321392229a57SYang Zhong break;
321492229a57SYang Zhong case KVM_EXIT_INTERNAL_ERROR:
321592229a57SYang Zhong ret = kvm_handle_internal_error(cpu, run);
321692229a57SYang Zhong break;
3217b4420f19SPeter Xu case KVM_EXIT_DIRTY_RING_FULL:
3218b4420f19SPeter Xu /*
3219b4420f19SPeter Xu * We shouldn't continue if the dirty ring of this vcpu is
3220b4420f19SPeter Xu * still full. Got kicked by KVM_RESET_DIRTY_RINGS.
3221b4420f19SPeter Xu */
3222b4420f19SPeter Xu trace_kvm_dirty_ring_full(cpu->cpu_index);
3223195801d7SStefan Hajnoczi bql_lock();
3224baa60983SHyman Huang(黄勇) /*
3225baa60983SHyman Huang(黄勇) * We throttle a vCPU by making it sleep once it exits from the
3226baa60983SHyman Huang(黄勇) * kernel due to a full dirty ring. In the dirtylimit scenario,
3227baa60983SHyman Huang(黄勇) * reaping all vCPUs after a single vCPU's dirty ring gets full
3228baa60983SHyman Huang(黄勇) * would miss that sleep, so just reap the vCPU whose ring is full.
3229baa60983SHyman Huang(黄勇) */
3230baa60983SHyman Huang(黄勇) if (dirtylimit_in_service()) {
3231baa60983SHyman Huang(黄勇) kvm_dirty_ring_reap(kvm_state, cpu);
3232baa60983SHyman Huang(黄勇) } else {
32331667e2b9SHyman Huang(黄勇) kvm_dirty_ring_reap(kvm_state, NULL);
3234baa60983SHyman Huang(黄勇) }
3235195801d7SStefan Hajnoczi bql_unlock();
3236baa60983SHyman Huang(黄勇) dirtylimit_vcpu_execute(cpu);
3237b4420f19SPeter Xu ret = 0;
3238b4420f19SPeter Xu break;
323992229a57SYang Zhong case KVM_EXIT_SYSTEM_EVENT:
32409cdfb1e3SJai Arora trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
324192229a57SYang Zhong switch (run->system_event.type) {
324292229a57SYang Zhong case KVM_SYSTEM_EVENT_SHUTDOWN:
324392229a57SYang Zhong qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
324492229a57SYang Zhong ret = EXCP_INTERRUPT;
324592229a57SYang Zhong break;
324692229a57SYang Zhong case KVM_SYSTEM_EVENT_RESET:
324792229a57SYang Zhong qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
324892229a57SYang Zhong ret = EXCP_INTERRUPT;
324992229a57SYang Zhong break;
325092229a57SYang Zhong case KVM_SYSTEM_EVENT_CRASH:
325192229a57SYang Zhong kvm_cpu_synchronize_state(cpu);
3252195801d7SStefan Hajnoczi bql_lock();
325392229a57SYang Zhong qemu_system_guest_panicked(cpu_get_crash_info(cpu));
3254195801d7SStefan Hajnoczi bql_unlock();
325592229a57SYang Zhong ret = 0;
325692229a57SYang Zhong break;
325792229a57SYang Zhong default:
325892229a57SYang Zhong ret = kvm_arch_handle_exit(cpu, run);
325992229a57SYang Zhong break;
326092229a57SYang Zhong }
326192229a57SYang Zhong break;
3262c15e5684SChao Peng case KVM_EXIT_MEMORY_FAULT:
3263c15e5684SChao Peng trace_kvm_memory_fault(run->memory_fault.gpa,
3264c15e5684SChao Peng run->memory_fault.size,
3265c15e5684SChao Peng run->memory_fault.flags);
3266c15e5684SChao Peng if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
3267c15e5684SChao Peng error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
3268c15e5684SChao Peng (uint64_t)run->memory_fault.flags);
3269c15e5684SChao Peng ret = -1;
3270c15e5684SChao Peng break;
3271c15e5684SChao Peng }
3272c15e5684SChao Peng ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
3273c15e5684SChao Peng run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
3274c15e5684SChao Peng break;
327592229a57SYang Zhong default:
327692229a57SYang Zhong ret = kvm_arch_handle_exit(cpu, run);
327792229a57SYang Zhong break;
327892229a57SYang Zhong }
327992229a57SYang Zhong } while (ret == 0);
328092229a57SYang Zhong
328192229a57SYang Zhong cpu_exec_end(cpu);
3282195801d7SStefan Hajnoczi bql_lock();
328392229a57SYang Zhong
328492229a57SYang Zhong if (ret < 0) {
328590c84c56SMarkus Armbruster cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
328692229a57SYang Zhong vm_stop(RUN_STATE_INTERNAL_ERROR);
328792229a57SYang Zhong }
328892229a57SYang Zhong
3289d73415a3SStefan Hajnoczi qatomic_set(&cpu->exit_request, 0);
329092229a57SYang Zhong return ret;
329192229a57SYang Zhong }
329292229a57SYang Zhong
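/*
 * Editor's note - the four ioctl wrappers below differ only in which
 * fd they target and which accel blocker hooks they wrap around the
 * call, e.g. (all taken from this file):
 *
 *     kvm_ioctl(s, KVM_GET_API_VERSION, 0);             system fd
 *     kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);              VM fd
 *     kvm_vcpu_ioctl(cpu, KVM_RUN, 0);                  vcpu fd
 *     kvm_device_ioctl(fd, KVM_HAS_DEVICE_ATTR, &attr); device fd
 *
 * Each returns -errno on failure instead of setting errno.
 */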
32936a8703aeSJohannes Stoelp int kvm_ioctl(KVMState *s, unsigned long type, ...)
329492229a57SYang Zhong {
329592229a57SYang Zhong int ret;
329692229a57SYang Zhong void *arg;
329792229a57SYang Zhong va_list ap;
329892229a57SYang Zhong
329992229a57SYang Zhong va_start(ap, type);
330092229a57SYang Zhong arg = va_arg(ap, void *);
330192229a57SYang Zhong va_end(ap);
330292229a57SYang Zhong
330392229a57SYang Zhong trace_kvm_ioctl(type, arg);
330492229a57SYang Zhong ret = ioctl(s->fd, type, arg);
330592229a57SYang Zhong if (ret == -1) {
330692229a57SYang Zhong ret = -errno;
330792229a57SYang Zhong }
330892229a57SYang Zhong return ret;
330992229a57SYang Zhong }
331092229a57SYang Zhong
33116a8703aeSJohannes Stoelp int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
331292229a57SYang Zhong {
331392229a57SYang Zhong int ret;
331492229a57SYang Zhong void *arg;
331592229a57SYang Zhong va_list ap;
331692229a57SYang Zhong
331792229a57SYang Zhong va_start(ap, type);
331892229a57SYang Zhong arg = va_arg(ap, void *);
331992229a57SYang Zhong va_end(ap);
332092229a57SYang Zhong
332192229a57SYang Zhong trace_kvm_vm_ioctl(type, arg);
3322a27dd2deSEmanuele Giuseppe Esposito accel_ioctl_begin();
332392229a57SYang Zhong ret = ioctl(s->vmfd, type, arg);
3324a27dd2deSEmanuele Giuseppe Esposito accel_ioctl_end();
332592229a57SYang Zhong if (ret == -1) {
332692229a57SYang Zhong ret = -errno;
332792229a57SYang Zhong }
332892229a57SYang Zhong return ret;
332992229a57SYang Zhong }
333092229a57SYang Zhong
33316a8703aeSJohannes Stoelp int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
333292229a57SYang Zhong {
333392229a57SYang Zhong int ret;
333492229a57SYang Zhong void *arg;
333592229a57SYang Zhong va_list ap;
333692229a57SYang Zhong
333792229a57SYang Zhong va_start(ap, type);
333892229a57SYang Zhong arg = va_arg(ap, void *);
333992229a57SYang Zhong va_end(ap);
334092229a57SYang Zhong
334192229a57SYang Zhong trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3342a27dd2deSEmanuele Giuseppe Esposito accel_cpu_ioctl_begin(cpu);
334392229a57SYang Zhong ret = ioctl(cpu->kvm_fd, type, arg);
3344a27dd2deSEmanuele Giuseppe Esposito accel_cpu_ioctl_end(cpu);
334592229a57SYang Zhong if (ret == -1) {
334692229a57SYang Zhong ret = -errno;
334792229a57SYang Zhong }
334892229a57SYang Zhong return ret;
334992229a57SYang Zhong }
335092229a57SYang Zhong
33516a8703aeSJohannes Stoelp int kvm_device_ioctl(int fd, unsigned long type, ...)
335292229a57SYang Zhong {
335392229a57SYang Zhong int ret;
335492229a57SYang Zhong void *arg;
335592229a57SYang Zhong va_list ap;
335692229a57SYang Zhong
335792229a57SYang Zhong va_start(ap, type);
335892229a57SYang Zhong arg = va_arg(ap, void *);
335992229a57SYang Zhong va_end(ap);
336092229a57SYang Zhong
336192229a57SYang Zhong trace_kvm_device_ioctl(fd, type, arg);
3362a27dd2deSEmanuele Giuseppe Esposito accel_ioctl_begin();
336392229a57SYang Zhong ret = ioctl(fd, type, arg);
3364a27dd2deSEmanuele Giuseppe Esposito accel_ioctl_end();
336592229a57SYang Zhong if (ret == -1) {
336692229a57SYang Zhong ret = -errno;
336792229a57SYang Zhong }
336892229a57SYang Zhong return ret;
336992229a57SYang Zhong }
337092229a57SYang Zhong
337192229a57SYang Zhong int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
337292229a57SYang Zhong {
337392229a57SYang Zhong int ret;
337492229a57SYang Zhong struct kvm_device_attr attribute = {
337592229a57SYang Zhong .group = group,
337692229a57SYang Zhong .attr = attr,
337792229a57SYang Zhong };
337892229a57SYang Zhong
337992229a57SYang Zhong if (!kvm_vm_attributes_allowed) {
338092229a57SYang Zhong return 0;
338192229a57SYang Zhong }
338292229a57SYang Zhong
338392229a57SYang Zhong ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
338492229a57SYang Zhong /* kvm returns 0 on success for HAS_DEVICE_ATTR */
338592229a57SYang Zhong return ret ? 0 : 1;
338692229a57SYang Zhong }
338792229a57SYang Zhong
338892229a57SYang Zhong int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
338992229a57SYang Zhong {
339092229a57SYang Zhong struct kvm_device_attr attribute = {
339192229a57SYang Zhong .group = group,
339292229a57SYang Zhong .attr = attr,
339392229a57SYang Zhong .flags = 0,
339492229a57SYang Zhong };
339592229a57SYang Zhong
339692229a57SYang Zhong return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
339792229a57SYang Zhong }
339892229a57SYang Zhong
339992229a57SYang Zhong int kvm_device_access(int fd, int group, uint64_t attr,
340092229a57SYang Zhong void *val, bool write, Error **errp)
340192229a57SYang Zhong {
340292229a57SYang Zhong struct kvm_device_attr kvmattr;
340392229a57SYang Zhong int err;
340492229a57SYang Zhong
340592229a57SYang Zhong kvmattr.flags = 0;
340692229a57SYang Zhong kvmattr.group = group;
340792229a57SYang Zhong kvmattr.attr = attr;
340892229a57SYang Zhong kvmattr.addr = (uintptr_t)val;
340992229a57SYang Zhong
341092229a57SYang Zhong err = kvm_device_ioctl(fd,
341192229a57SYang Zhong write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
341292229a57SYang Zhong &kvmattr);
341392229a57SYang Zhong if (err < 0) {
341492229a57SYang Zhong error_setg_errno(errp, -err,
341592229a57SYang Zhong "KVM_%s_DEVICE_ATTR failed: Group %d "
341692229a57SYang Zhong "attr 0x%016" PRIx64,
341792229a57SYang Zhong write ? "SET" : "GET", group, attr);
341892229a57SYang Zhong }
341992229a57SYang Zhong return err;
342092229a57SYang Zhong }
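
/*
 * Usage sketch (editor's addition; 'dev_fd', 'group' and 'attr' are
 * hypothetical placeholders):
 *
 *     uint64_t val;
 *     if (kvm_device_check_attr(dev_fd, group, attr)) {
 *         kvm_device_access(dev_fd, group, attr, &val, false,
 *                           &error_abort);
 *     }
 */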
342192229a57SYang Zhong
342262dd4edaSGreg Kurz bool kvm_has_sync_mmu(void)
342392229a57SYang Zhong {
342462dd4edaSGreg Kurz return kvm_state->sync_mmu;
342592229a57SYang Zhong }
342692229a57SYang Zhong
342792229a57SYang Zhong int kvm_has_vcpu_events(void)
342892229a57SYang Zhong {
342992229a57SYang Zhong return kvm_state->vcpu_events;
343092229a57SYang Zhong }
343192229a57SYang Zhong
3432ebbfef2fSLiran Alon int kvm_max_nested_state_length(void)
3433ebbfef2fSLiran Alon {
3434ebbfef2fSLiran Alon return kvm_state->max_nested_state_len;
3435ebbfef2fSLiran Alon }
3436ebbfef2fSLiran Alon
343792229a57SYang Zhong int kvm_has_gsi_routing(void)
343892229a57SYang Zhong {
343992229a57SYang Zhong #ifdef KVM_CAP_IRQ_ROUTING
344092229a57SYang Zhong return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
344192229a57SYang Zhong #else
344292229a57SYang Zhong return false;
344392229a57SYang Zhong #endif
344492229a57SYang Zhong }
344592229a57SYang Zhong
34465d721b78SAlexander Graf bool kvm_arm_supports_user_irq(void)
34475d721b78SAlexander Graf {
34485d721b78SAlexander Graf return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
34495d721b78SAlexander Graf }
34505d721b78SAlexander Graf
34511e1e4879SPaolo Bonzini #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
3452b67be03eSAnton Johansson struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
345392229a57SYang Zhong {
345492229a57SYang Zhong struct kvm_sw_breakpoint *bp;
345592229a57SYang Zhong
345692229a57SYang Zhong QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
345792229a57SYang Zhong if (bp->pc == pc) {
345892229a57SYang Zhong return bp;
345992229a57SYang Zhong }
346092229a57SYang Zhong }
346192229a57SYang Zhong return NULL;
346292229a57SYang Zhong }
346392229a57SYang Zhong
346492229a57SYang Zhong int kvm_sw_breakpoints_active(CPUState *cpu)
346592229a57SYang Zhong {
346692229a57SYang Zhong return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
346792229a57SYang Zhong }
346892229a57SYang Zhong
346992229a57SYang Zhong struct kvm_set_guest_debug_data {
347092229a57SYang Zhong struct kvm_guest_debug dbg;
347192229a57SYang Zhong int err;
347292229a57SYang Zhong };
347392229a57SYang Zhong
347492229a57SYang Zhong static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
347592229a57SYang Zhong {
347692229a57SYang Zhong struct kvm_set_guest_debug_data *dbg_data =
347792229a57SYang Zhong (struct kvm_set_guest_debug_data *) data.host_ptr;
347892229a57SYang Zhong
347992229a57SYang Zhong dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
348092229a57SYang Zhong &dbg_data->dbg);
348192229a57SYang Zhong }
348292229a57SYang Zhong
348392229a57SYang Zhong int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
348492229a57SYang Zhong {
348592229a57SYang Zhong struct kvm_set_guest_debug_data data;
348692229a57SYang Zhong
348792229a57SYang Zhong data.dbg.control = reinject_trap;
348892229a57SYang Zhong
348992229a57SYang Zhong if (cpu->singlestep_enabled) {
349092229a57SYang Zhong data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3491fd2ddd16SMaxim Levitsky
3492fd2ddd16SMaxim Levitsky if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3493fd2ddd16SMaxim Levitsky data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3494fd2ddd16SMaxim Levitsky }
349592229a57SYang Zhong }
349692229a57SYang Zhong kvm_arch_update_guest_debug(cpu, &data.dbg);
349792229a57SYang Zhong
349892229a57SYang Zhong run_on_cpu(cpu, kvm_invoke_set_guest_debug,
349992229a57SYang Zhong RUN_ON_CPU_HOST_PTR(&data));
350092229a57SYang Zhong return data.err;
350192229a57SYang Zhong }
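
/*
 * Worked example (a sketch; the SSTEP_* flags come from gdbstub/enums.h):
 * with cpu->singlestep_enabled == (SSTEP_ENABLE | SSTEP_NOIRQ),
 *
 *     kvm_update_guest_debug(cpu, 0);
 *
 * sends dbg.control == KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP |
 * KVM_GUESTDBG_BLOCKIRQ (plus whatever kvm_arch_update_guest_debug()
 * adds) to KVM_SET_GUEST_DEBUG on the vCPU's own thread.
 */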
350292229a57SYang Zhong
3503a48e7d9eSAlex Bennée bool kvm_supports_guest_debug(void)
3504a48e7d9eSAlex Bennée {
3505a48e7d9eSAlex Bennée /* probed during kvm_init() */
3506a48e7d9eSAlex Bennée return kvm_has_guest_debug;
3507a48e7d9eSAlex Bennée }
3508a48e7d9eSAlex Bennée
350955b5b8e9SPhilippe Mathieu-Daudé int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
351092229a57SYang Zhong {
351192229a57SYang Zhong struct kvm_sw_breakpoint *bp;
351292229a57SYang Zhong int err;
351392229a57SYang Zhong
351492229a57SYang Zhong if (type == GDB_BREAKPOINT_SW) {
351592229a57SYang Zhong bp = kvm_find_sw_breakpoint(cpu, addr);
351692229a57SYang Zhong if (bp) {
351792229a57SYang Zhong bp->use_count++;
351892229a57SYang Zhong return 0;
351992229a57SYang Zhong }
352092229a57SYang Zhong
3521b21e2380SMarkus Armbruster bp = g_new(struct kvm_sw_breakpoint, 1);
352292229a57SYang Zhong bp->pc = addr;
352392229a57SYang Zhong bp->use_count = 1;
352492229a57SYang Zhong err = kvm_arch_insert_sw_breakpoint(cpu, bp);
352592229a57SYang Zhong if (err) {
352692229a57SYang Zhong g_free(bp);
352792229a57SYang Zhong return err;
352892229a57SYang Zhong }
352992229a57SYang Zhong
353092229a57SYang Zhong QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
353192229a57SYang Zhong } else {
353292229a57SYang Zhong err = kvm_arch_insert_hw_breakpoint(addr, len, type);
353392229a57SYang Zhong if (err) {
353492229a57SYang Zhong return err;
353592229a57SYang Zhong }
353692229a57SYang Zhong }
353792229a57SYang Zhong
353892229a57SYang Zhong CPU_FOREACH(cpu) {
353992229a57SYang Zhong err = kvm_update_guest_debug(cpu, 0);
354092229a57SYang Zhong if (err) {
354192229a57SYang Zhong return err;
354292229a57SYang Zhong }
354392229a57SYang Zhong }
354492229a57SYang Zhong return 0;
354592229a57SYang Zhong }
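
/*
 * Usage sketch (hypothetical gdbstub-style caller; 'addr' is an
 * illustrative guest virtual address):
 *
 *     kvm_insert_breakpoint(cpu, GDB_BREAKPOINT_SW, addr, 1);
 *     ...
 *     kvm_remove_breakpoint(cpu, GDB_BREAKPOINT_SW, addr, 1);
 *
 * Software breakpoints are refcounted per pc, so balanced insert/remove
 * pairs leave no breakpoint behind.
 */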
354692229a57SYang Zhong
354755b5b8e9SPhilippe Mathieu-Daudé int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
354892229a57SYang Zhong {
354992229a57SYang Zhong struct kvm_sw_breakpoint *bp;
355092229a57SYang Zhong int err;
355192229a57SYang Zhong
355292229a57SYang Zhong if (type == GDB_BREAKPOINT_SW) {
355392229a57SYang Zhong bp = kvm_find_sw_breakpoint(cpu, addr);
355492229a57SYang Zhong if (!bp) {
355592229a57SYang Zhong return -ENOENT;
355692229a57SYang Zhong }
355792229a57SYang Zhong
355892229a57SYang Zhong if (bp->use_count > 1) {
355992229a57SYang Zhong bp->use_count--;
356092229a57SYang Zhong return 0;
356192229a57SYang Zhong }
356292229a57SYang Zhong
356392229a57SYang Zhong err = kvm_arch_remove_sw_breakpoint(cpu, bp);
356492229a57SYang Zhong if (err) {
356592229a57SYang Zhong return err;
356692229a57SYang Zhong }
356792229a57SYang Zhong
356892229a57SYang Zhong QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
356992229a57SYang Zhong g_free(bp);
357092229a57SYang Zhong } else {
357192229a57SYang Zhong err = kvm_arch_remove_hw_breakpoint(addr, len, type);
357292229a57SYang Zhong if (err) {
357392229a57SYang Zhong return err;
357492229a57SYang Zhong }
357592229a57SYang Zhong }
357692229a57SYang Zhong
357792229a57SYang Zhong CPU_FOREACH(cpu) {
357892229a57SYang Zhong err = kvm_update_guest_debug(cpu, 0);
357992229a57SYang Zhong if (err) {
358092229a57SYang Zhong return err;
358192229a57SYang Zhong }
358292229a57SYang Zhong }
358392229a57SYang Zhong return 0;
358492229a57SYang Zhong }
358592229a57SYang Zhong
358692229a57SYang Zhong void kvm_remove_all_breakpoints(CPUState *cpu)
358792229a57SYang Zhong {
358892229a57SYang Zhong struct kvm_sw_breakpoint *bp, *next;
358992229a57SYang Zhong KVMState *s = cpu->kvm_state;
359092229a57SYang Zhong CPUState *tmpcpu;
359192229a57SYang Zhong
359292229a57SYang Zhong QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
359392229a57SYang Zhong if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
359492229a57SYang Zhong /* Try harder to find a CPU that currently sees the breakpoint. */
359592229a57SYang Zhong CPU_FOREACH(tmpcpu) {
359692229a57SYang Zhong if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
359792229a57SYang Zhong break;
359892229a57SYang Zhong }
359992229a57SYang Zhong }
360092229a57SYang Zhong }
360192229a57SYang Zhong QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
360292229a57SYang Zhong g_free(bp);
360392229a57SYang Zhong }
360492229a57SYang Zhong kvm_arch_remove_all_hw_breakpoints();
360592229a57SYang Zhong
360692229a57SYang Zhong CPU_FOREACH(cpu) {
360792229a57SYang Zhong kvm_update_guest_debug(cpu, 0);
360892229a57SYang Zhong }
360992229a57SYang Zhong }
361092229a57SYang Zhong
36111e1e4879SPaolo Bonzini #endif /* TARGET_KVM_HAVE_GUEST_DEBUG */
361292229a57SYang Zhong
361392229a57SYang Zhong static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
361492229a57SYang Zhong {
361592229a57SYang Zhong KVMState *s = kvm_state;
361692229a57SYang Zhong struct kvm_signal_mask *sigmask;
361792229a57SYang Zhong int r;
361892229a57SYang Zhong
361992229a57SYang Zhong sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
362092229a57SYang Zhong
362192229a57SYang Zhong sigmask->len = s->sigmask_len;
362292229a57SYang Zhong memcpy(sigmask->sigset, sigset, sizeof(*sigset));
362392229a57SYang Zhong r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
362492229a57SYang Zhong g_free(sigmask);
362592229a57SYang Zhong
362692229a57SYang Zhong return r;
362792229a57SYang Zhong }
362892229a57SYang Zhong
362992229a57SYang Zhong static void kvm_ipi_signal(int sig)
363092229a57SYang Zhong {
363192229a57SYang Zhong if (current_cpu) {
363292229a57SYang Zhong assert(kvm_immediate_exit);
363392229a57SYang Zhong kvm_cpu_kick(current_cpu);
363492229a57SYang Zhong }
363592229a57SYang Zhong }
363692229a57SYang Zhong
363792229a57SYang Zhong void kvm_init_cpu_signals(CPUState *cpu)
363892229a57SYang Zhong {
363992229a57SYang Zhong int r;
364092229a57SYang Zhong sigset_t set;
364192229a57SYang Zhong struct sigaction sigact;
364292229a57SYang Zhong
364392229a57SYang Zhong memset(&sigact, 0, sizeof(sigact));
364492229a57SYang Zhong sigact.sa_handler = kvm_ipi_signal;
364592229a57SYang Zhong sigaction(SIG_IPI, &sigact, NULL);
364692229a57SYang Zhong
364792229a57SYang Zhong pthread_sigmask(SIG_BLOCK, NULL, &set);
364892229a57SYang Zhong #if defined KVM_HAVE_MCE_INJECTION
364992229a57SYang Zhong sigdelset(&set, SIGBUS);
365092229a57SYang Zhong pthread_sigmask(SIG_SETMASK, &set, NULL);
365192229a57SYang Zhong #endif
365292229a57SYang Zhong sigdelset(&set, SIG_IPI);
365392229a57SYang Zhong if (kvm_immediate_exit) {
365492229a57SYang Zhong r = pthread_sigmask(SIG_SETMASK, &set, NULL);
365592229a57SYang Zhong } else {
365692229a57SYang Zhong r = kvm_set_signal_mask(cpu, &set);
365792229a57SYang Zhong }
365892229a57SYang Zhong if (r) {
365992229a57SYang Zhong fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
366092229a57SYang Zhong exit(1);
366192229a57SYang Zhong }
366292229a57SYang Zhong }
366392229a57SYang Zhong
366492229a57SYang Zhong /* Called asynchronously in VCPU thread. */
366592229a57SYang Zhong int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
366692229a57SYang Zhong {
366792229a57SYang Zhong #ifdef KVM_HAVE_MCE_INJECTION
366892229a57SYang Zhong if (have_sigbus_pending) {
366992229a57SYang Zhong return 1;
367092229a57SYang Zhong }
367192229a57SYang Zhong have_sigbus_pending = true;
367292229a57SYang Zhong pending_sigbus_addr = addr;
367392229a57SYang Zhong pending_sigbus_code = code;
3674d73415a3SStefan Hajnoczi qatomic_set(&cpu->exit_request, 1);
367592229a57SYang Zhong return 0;
367692229a57SYang Zhong #else
367792229a57SYang Zhong return 1;
367892229a57SYang Zhong #endif
367992229a57SYang Zhong }
368092229a57SYang Zhong
368192229a57SYang Zhong /* Called synchronously (via signalfd) in main thread. */
368292229a57SYang Zhong int kvm_on_sigbus(int code, void *addr)
368392229a57SYang Zhong {
368492229a57SYang Zhong #ifdef KVM_HAVE_MCE_INJECTION
368592229a57SYang Zhong /* An action-required MCE kills the process if SIGBUS is blocked, and
368692229a57SYang Zhong * SIGBUS is blocked in the I/O thread, where we handle MCE via signalfd.
368792229a57SYang Zhong * Only action-optional events can therefore reach this function.
368892229a57SYang Zhong */
368992229a57SYang Zhong assert(code != BUS_MCEERR_AR);
369092229a57SYang Zhong kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
369192229a57SYang Zhong return 0;
369292229a57SYang Zhong #else
369392229a57SYang Zhong return 1;
369492229a57SYang Zhong #endif
369592229a57SYang Zhong }
369692229a57SYang Zhong
369792229a57SYang Zhong int kvm_create_device(KVMState *s, uint64_t type, bool test)
369892229a57SYang Zhong {
369992229a57SYang Zhong int ret;
370092229a57SYang Zhong struct kvm_create_device create_dev;
370192229a57SYang Zhong
370292229a57SYang Zhong create_dev.type = type;
370392229a57SYang Zhong create_dev.fd = -1;
370492229a57SYang Zhong create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
370592229a57SYang Zhong
370692229a57SYang Zhong if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
370792229a57SYang Zhong return -ENOTSUP;
370892229a57SYang Zhong }
370992229a57SYang Zhong
371092229a57SYang Zhong ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
371192229a57SYang Zhong if (ret) {
371292229a57SYang Zhong return ret;
371392229a57SYang Zhong }
371492229a57SYang Zhong
371592229a57SYang Zhong return test ? 0 : create_dev.fd;
371692229a57SYang Zhong }
371792229a57SYang Zhong
371892229a57SYang Zhong bool kvm_device_supported(int vmfd, uint64_t type)
371992229a57SYang Zhong {
372092229a57SYang Zhong struct kvm_create_device create_dev = {
372192229a57SYang Zhong .type = type,
372292229a57SYang Zhong .fd = -1,
372392229a57SYang Zhong .flags = KVM_CREATE_DEVICE_TEST,
372492229a57SYang Zhong };
372592229a57SYang Zhong
372692229a57SYang Zhong if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
372792229a57SYang Zhong return false;
372892229a57SYang Zhong }
372992229a57SYang Zhong
373092229a57SYang Zhong return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
373192229a57SYang Zhong }
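
/*
 * A minimal sketch (the device type is illustrative; availability
 * depends on the host kernel and architecture):
 *
 *     if (kvm_device_supported(s->vmfd, KVM_DEV_TYPE_VFIO)) {
 *         int dev_fd = kvm_create_device(s, KVM_DEV_TYPE_VFIO, false);
 *         ...
 *     }
 *
 * With test=true, kvm_create_device() only checks whether the device
 * could be created and returns 0 instead of a file descriptor.
 */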
373292229a57SYang Zhong
373392229a57SYang Zhong int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
373492229a57SYang Zhong {
373592229a57SYang Zhong struct kvm_one_reg reg;
373692229a57SYang Zhong int r;
373792229a57SYang Zhong
373892229a57SYang Zhong reg.id = id;
373992229a57SYang Zhong reg.addr = (uintptr_t) source;
374092229a57SYang Zhong r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
374192229a57SYang Zhong if (r) {
374292229a57SYang Zhong trace_kvm_failed_reg_set(id, strerror(-r));
374392229a57SYang Zhong }
374492229a57SYang Zhong return r;
374592229a57SYang Zhong }
374692229a57SYang Zhong
374792229a57SYang Zhong int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
374892229a57SYang Zhong {
374992229a57SYang Zhong struct kvm_one_reg reg;
375092229a57SYang Zhong int r;
375192229a57SYang Zhong
375292229a57SYang Zhong reg.id = id;
375392229a57SYang Zhong reg.addr = (uintptr_t) target;
375492229a57SYang Zhong r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
375592229a57SYang Zhong if (r) {
375692229a57SYang Zhong trace_kvm_failed_reg_get(id, strerror(-r));
375792229a57SYang Zhong }
375892229a57SYang Zhong return r;
375992229a57SYang Zhong }
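
/*
 * Usage sketch for the ONE_REG accessors ('reg_id' is a placeholder;
 * real ids are arch-specific KVM_REG_* encodings):
 *
 *     uint64_t v;
 *     if (kvm_get_one_reg(cs, reg_id, &v) == 0) {
 *         kvm_set_one_reg(cs, reg_id, &v);
 *     }
 *
 * The register width is encoded in the id's KVM_REG_SIZE_* field, so
 * the caller must pass storage that matches it.
 */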
376092229a57SYang Zhong
37618072aae3SAlexey Kardashevskiy static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
37628072aae3SAlexey Kardashevskiy hwaddr start_addr, hwaddr size)
37638072aae3SAlexey Kardashevskiy {
37648072aae3SAlexey Kardashevskiy KVMState *kvm = KVM_STATE(ms->accelerator);
37658072aae3SAlexey Kardashevskiy int i;
37668072aae3SAlexey Kardashevskiy
37678072aae3SAlexey Kardashevskiy for (i = 0; i < kvm->nr_as; ++i) {
37688072aae3SAlexey Kardashevskiy if (kvm->as[i].as == as && kvm->as[i].ml) {
3769023ae9a8SIgor Mammedov size = MIN(kvm_max_slot_size, size);
37708072aae3SAlexey Kardashevskiy return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
37718072aae3SAlexey Kardashevskiy start_addr, size);
37728072aae3SAlexey Kardashevskiy }
37738072aae3SAlexey Kardashevskiy }
37748072aae3SAlexey Kardashevskiy
37758072aae3SAlexey Kardashevskiy return false;
37768072aae3SAlexey Kardashevskiy }
37778072aae3SAlexey Kardashevskiy
377823b0898eSPaolo Bonzini static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
377923b0898eSPaolo Bonzini const char *name, void *opaque,
378023b0898eSPaolo Bonzini Error **errp)
378123b0898eSPaolo Bonzini {
378223b0898eSPaolo Bonzini KVMState *s = KVM_STATE(obj);
378323b0898eSPaolo Bonzini int64_t value = s->kvm_shadow_mem;
378423b0898eSPaolo Bonzini
378523b0898eSPaolo Bonzini visit_type_int(v, name, &value, errp);
378623b0898eSPaolo Bonzini }
378723b0898eSPaolo Bonzini
378823b0898eSPaolo Bonzini static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
378923b0898eSPaolo Bonzini const char *name, void *opaque,
379023b0898eSPaolo Bonzini Error **errp)
379123b0898eSPaolo Bonzini {
379223b0898eSPaolo Bonzini KVMState *s = KVM_STATE(obj);
379323b0898eSPaolo Bonzini int64_t value;
379423b0898eSPaolo Bonzini
379570cbae42SPaolo Bonzini if (s->fd != -1) {
379670cbae42SPaolo Bonzini error_setg(errp, "Cannot set properties after the accelerator has been initialized");
379770cbae42SPaolo Bonzini return;
379870cbae42SPaolo Bonzini }
379970cbae42SPaolo Bonzini
3800668f62ecSMarkus Armbruster if (!visit_type_int(v, name, &value, errp)) {
380123b0898eSPaolo Bonzini return;
380223b0898eSPaolo Bonzini }
380323b0898eSPaolo Bonzini
380423b0898eSPaolo Bonzini s->kvm_shadow_mem = value;
380523b0898eSPaolo Bonzini }
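
/*
 * These accessors back the "kvm-shadow-mem" accelerator property, set
 * e.g. as (illustrative command line):
 *
 *     qemu-system-x86_64 -accel kvm,kvm-shadow-mem=16777216 ...
 */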
380623b0898eSPaolo Bonzini
380711bc4a13SPaolo Bonzini static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
380811bc4a13SPaolo Bonzini const char *name, void *opaque,
380911bc4a13SPaolo Bonzini Error **errp)
381011bc4a13SPaolo Bonzini {
381111bc4a13SPaolo Bonzini KVMState *s = KVM_STATE(obj);
381211bc4a13SPaolo Bonzini OnOffSplit mode;
381311bc4a13SPaolo Bonzini
381470cbae42SPaolo Bonzini if (s->fd != -1) {
381570cbae42SPaolo Bonzini error_setg(errp, "Cannot set properties after the accelerator has been initialized");
381670cbae42SPaolo Bonzini return;
381770cbae42SPaolo Bonzini }
381870cbae42SPaolo Bonzini
381914217038SMarkus Armbruster if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
382011bc4a13SPaolo Bonzini return;
382114217038SMarkus Armbruster }
382211bc4a13SPaolo Bonzini switch (mode) {
382311bc4a13SPaolo Bonzini case ON_OFF_SPLIT_ON:
382411bc4a13SPaolo Bonzini s->kernel_irqchip_allowed = true;
382511bc4a13SPaolo Bonzini s->kernel_irqchip_required = true;
3826d1972be1SXiaoyao Li s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
382711bc4a13SPaolo Bonzini break;
382811bc4a13SPaolo Bonzini case ON_OFF_SPLIT_OFF:
382911bc4a13SPaolo Bonzini s->kernel_irqchip_allowed = false;
383011bc4a13SPaolo Bonzini s->kernel_irqchip_required = false;
3831d1972be1SXiaoyao Li s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
383211bc4a13SPaolo Bonzini break;
383311bc4a13SPaolo Bonzini case ON_OFF_SPLIT_SPLIT:
383411bc4a13SPaolo Bonzini s->kernel_irqchip_allowed = true;
383511bc4a13SPaolo Bonzini s->kernel_irqchip_required = true;
3836d1972be1SXiaoyao Li s->kernel_irqchip_split = ON_OFF_AUTO_ON;
383711bc4a13SPaolo Bonzini break;
383811bc4a13SPaolo Bonzini default:
383911bc4a13SPaolo Bonzini /* The value was checked in visit_type_OnOffSplit() above. If
384011bc4a13SPaolo Bonzini * we get here, then something is wrong in QEMU.
384111bc4a13SPaolo Bonzini */
384211bc4a13SPaolo Bonzini abort();
384311bc4a13SPaolo Bonzini }
384411bc4a13SPaolo Bonzini }
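
/*
 * The property parsed above is set e.g. as (illustrative):
 *
 *     qemu-system-x86_64 -accel kvm,kernel-irqchip=split ...
 *
 * Both "on" and "split" require an in-kernel irqchip; "split" keeps
 * part of the interrupt controller emulation (e.g. IOAPIC/PIC on x86)
 * in userspace.
 */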
384511bc4a13SPaolo Bonzini
38464376c40dSPaolo Bonzini bool kvm_kernel_irqchip_allowed(void)
38474376c40dSPaolo Bonzini {
384811bc4a13SPaolo Bonzini return kvm_state->kernel_irqchip_allowed;
38494376c40dSPaolo Bonzini }
38504376c40dSPaolo Bonzini
38514376c40dSPaolo Bonzini bool kvm_kernel_irqchip_required(void)
38524376c40dSPaolo Bonzini {
385311bc4a13SPaolo Bonzini return kvm_state->kernel_irqchip_required;
38544376c40dSPaolo Bonzini }
38554376c40dSPaolo Bonzini
38564376c40dSPaolo Bonzini bool kvm_kernel_irqchip_split(void)
38574376c40dSPaolo Bonzini {
3858d1972be1SXiaoyao Li return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
38594376c40dSPaolo Bonzini }
38604376c40dSPaolo Bonzini
38612ea5cb0aSPeter Xu static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
38622ea5cb0aSPeter Xu const char *name, void *opaque,
38632ea5cb0aSPeter Xu Error **errp)
38642ea5cb0aSPeter Xu {
38652ea5cb0aSPeter Xu KVMState *s = KVM_STATE(obj);
38662ea5cb0aSPeter Xu uint32_t value = s->kvm_dirty_ring_size;
38672ea5cb0aSPeter Xu
38682ea5cb0aSPeter Xu visit_type_uint32(v, name, &value, errp);
38692ea5cb0aSPeter Xu }
38702ea5cb0aSPeter Xu
38712ea5cb0aSPeter Xu static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
38722ea5cb0aSPeter Xu const char *name, void *opaque,
38732ea5cb0aSPeter Xu Error **errp)
38742ea5cb0aSPeter Xu {
38752ea5cb0aSPeter Xu KVMState *s = KVM_STATE(obj);
38762ea5cb0aSPeter Xu uint32_t value;
38772ea5cb0aSPeter Xu
38782ea5cb0aSPeter Xu if (s->fd != -1) {
38792ea5cb0aSPeter Xu error_setg(errp, "Cannot set properties after the accelerator has been initialized");
38802ea5cb0aSPeter Xu return;
38812ea5cb0aSPeter Xu }
38822ea5cb0aSPeter Xu
3883d1c81c34SMarkus Armbruster if (!visit_type_uint32(v, name, &value, errp)) {
38842ea5cb0aSPeter Xu return;
38852ea5cb0aSPeter Xu }
38862ea5cb0aSPeter Xu if (value & (value - 1)) {
38872ea5cb0aSPeter Xu error_setg(errp, "dirty-ring-size must be a power of two");
38882ea5cb0aSPeter Xu return;
38892ea5cb0aSPeter Xu }
38902ea5cb0aSPeter Xu
38912ea5cb0aSPeter Xu s->kvm_dirty_ring_size = value;
38922ea5cb0aSPeter Xu }
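
/*
 * Example (illustrative): enable a 4096-entry per-vCPU dirty ring.
 * The value must be a power of two, as checked above:
 *
 *     qemu-system-x86_64 -accel kvm,dirty-ring-size=4096 ...
 */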
38932ea5cb0aSPeter Xu
3894aef158b0SDaan De Meyer static char *kvm_get_device(Object *obj,
3895aef158b0SDaan De Meyer Error **errp G_GNUC_UNUSED)
3896aef158b0SDaan De Meyer {
3897aef158b0SDaan De Meyer KVMState *s = KVM_STATE(obj);
3898aef158b0SDaan De Meyer
3899aef158b0SDaan De Meyer return g_strdup(s->device);
3900aef158b0SDaan De Meyer }
3901aef158b0SDaan De Meyer
3902aef158b0SDaan De Meyer static void kvm_set_device(Object *obj,
3903aef158b0SDaan De Meyer const char *value,
3904aef158b0SDaan De Meyer Error **errp G_GNUC_UNUSED)
3905aef158b0SDaan De Meyer {
3906aef158b0SDaan De Meyer KVMState *s = KVM_STATE(obj);
3907aef158b0SDaan De Meyer
3908aef158b0SDaan De Meyer g_free(s->device);
3909aef158b0SDaan De Meyer s->device = g_strdup(value);
3910aef158b0SDaan De Meyer }
3911aef158b0SDaan De Meyer
39120418f908SAnthony Harivel static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
39130418f908SAnthony Harivel {
39140418f908SAnthony Harivel KVMState *s = KVM_STATE(obj);
39150418f908SAnthony Harivel s->msr_energy.enable = value;
39160418f908SAnthony Harivel }
39170418f908SAnthony Harivel
39180418f908SAnthony Harivel static void kvm_set_kvm_rapl_socket_path(Object *obj,
39190418f908SAnthony Harivel const char *str,
39200418f908SAnthony Harivel Error **errp)
39210418f908SAnthony Harivel {
39220418f908SAnthony Harivel KVMState *s = KVM_STATE(obj);
39230418f908SAnthony Harivel g_free(s->msr_energy.socket_path);
39240418f908SAnthony Harivel s->msr_energy.socket_path = g_strdup(str);
39250418f908SAnthony Harivel }
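
/*
 * Illustrative invocation for the two RAPL properties (the socket path
 * is a placeholder for wherever the helper daemon listens):
 *
 *     qemu-system-x86_64 -accel kvm,rapl=on,\
 *         rapl-helper-socket=/run/qemu-vmsr-helper.sock ...
 */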
39260418f908SAnthony Harivel
392723b0898eSPaolo Bonzini static void kvm_accel_instance_init(Object *obj)
392823b0898eSPaolo Bonzini {
392923b0898eSPaolo Bonzini KVMState *s = KVM_STATE(obj);
393023b0898eSPaolo Bonzini
393170cbae42SPaolo Bonzini s->fd = -1;
393270cbae42SPaolo Bonzini s->vmfd = -1;
393323b0898eSPaolo Bonzini s->kvm_shadow_mem = -1;
3934d1972be1SXiaoyao Li s->kernel_irqchip_allowed = true;
3935d1972be1SXiaoyao Li s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
39362ea5cb0aSPeter Xu /* The KVM dirty ring is disabled by default */
39372ea5cb0aSPeter Xu s->kvm_dirty_ring_size = 0;
3938b20cc776SGavin Shan s->kvm_dirty_ring_with_bitmap = false;
3939c8f2eb5dSShameer Kolothum s->kvm_eager_split_size = 0;
3940e2e69f6bSChenyi Qiang s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
3941e2e69f6bSChenyi Qiang s->notify_window = 0;
394261491cf4SDavid Woodhouse s->xen_version = 0;
39436f43f2eeSDavid Woodhouse s->xen_gnttab_max_frames = 64;
3944e16aff4cSDavid Woodhouse s->xen_evtchn_max_pirq = 256;
3945aef158b0SDaan De Meyer s->device = NULL;
39460418f908SAnthony Harivel s->msr_energy.enable = false;
394723b0898eSPaolo Bonzini }
394823b0898eSPaolo Bonzini
39493b7a9388SAlex Bennée /**
39503b7a9388SAlex Bennée * kvm_gdbstub_sstep_flags():
39513b7a9388SAlex Bennée *
39523b7a9388SAlex Bennée * Returns: SSTEP_* flags that KVM supports for guest debug. The
39533b7a9388SAlex Bennée * support is probed during kvm_init().
39543b7a9388SAlex Bennée */
39553b7a9388SAlex Bennée static int kvm_gdbstub_sstep_flags(void)
39563b7a9388SAlex Bennée {
39573b7a9388SAlex Bennée return kvm_sstep_flags;
39583b7a9388SAlex Bennée }
39593b7a9388SAlex Bennée
396092229a57SYang Zhong static void kvm_accel_class_init(ObjectClass *oc, void *data)
396192229a57SYang Zhong {
396292229a57SYang Zhong AccelClass *ac = ACCEL_CLASS(oc);
396392229a57SYang Zhong ac->name = "KVM";
396492229a57SYang Zhong ac->init_machine = kvm_init;
39658072aae3SAlexey Kardashevskiy ac->has_memory = kvm_accel_has_memory;
396692229a57SYang Zhong ac->allowed = &kvm_allowed;
39673b7a9388SAlex Bennée ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;
396823b0898eSPaolo Bonzini
396911bc4a13SPaolo Bonzini object_class_property_add(oc, "kernel-irqchip", "on|off|split",
397011bc4a13SPaolo Bonzini NULL, kvm_set_kernel_irqchip,
3971d2623129SMarkus Armbruster NULL, NULL);
397211bc4a13SPaolo Bonzini object_class_property_set_description(oc, "kernel-irqchip",
39737eecec7dSMarkus Armbruster "Configure KVM in-kernel irqchip");
397411bc4a13SPaolo Bonzini
397523b0898eSPaolo Bonzini object_class_property_add(oc, "kvm-shadow-mem", "int",
397623b0898eSPaolo Bonzini kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
3977d2623129SMarkus Armbruster NULL, NULL);
397823b0898eSPaolo Bonzini object_class_property_set_description(oc, "kvm-shadow-mem",
39797eecec7dSMarkus Armbruster "KVM shadow MMU size");
39802ea5cb0aSPeter Xu
39812ea5cb0aSPeter Xu object_class_property_add(oc, "dirty-ring-size", "uint32",
39822ea5cb0aSPeter Xu kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
39832ea5cb0aSPeter Xu NULL, NULL);
39842ea5cb0aSPeter Xu object_class_property_set_description(oc, "dirty-ring-size",
39852ea5cb0aSPeter Xu "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
39863dba0a33SPaolo Bonzini
3987aef158b0SDaan De Meyer object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device);
3988aef158b0SDaan De Meyer object_class_property_set_description(oc, "device",
3989aef158b0SDaan De Meyer "Path to the device node to use (default: /dev/kvm)");
3990aef158b0SDaan De Meyer
39910418f908SAnthony Harivel object_class_property_add_bool(oc, "rapl",
39920418f908SAnthony Harivel NULL,
39930418f908SAnthony Harivel kvm_set_kvm_rapl);
39940418f908SAnthony Harivel object_class_property_set_description(oc, "rapl",
39950418f908SAnthony Harivel "Allow energy-related MSRs for the RAPL interface in the guest");
39960418f908SAnthony Harivel
39970418f908SAnthony Harivel object_class_property_add_str(oc, "rapl-helper-socket", NULL,
39980418f908SAnthony Harivel kvm_set_kvm_rapl_socket_path);
39990418f908SAnthony Harivel object_class_property_set_description(oc, "rapl-helper-socket",
40000418f908SAnthony Harivel "Socket path for communicating with the Virtual MSR helper daemon");
40010418f908SAnthony Harivel
40023dba0a33SPaolo Bonzini kvm_arch_accel_class_init(oc);
400392229a57SYang Zhong }
400492229a57SYang Zhong
400592229a57SYang Zhong static const TypeInfo kvm_accel_type = {
400692229a57SYang Zhong .name = TYPE_KVM_ACCEL,
400792229a57SYang Zhong .parent = TYPE_ACCEL,
400823b0898eSPaolo Bonzini .instance_init = kvm_accel_instance_init,
400992229a57SYang Zhong .class_init = kvm_accel_class_init,
401092229a57SYang Zhong .instance_size = sizeof(KVMState),
401192229a57SYang Zhong };
401292229a57SYang Zhong
401392229a57SYang Zhong static void kvm_type_init(void)
401492229a57SYang Zhong {
401592229a57SYang Zhong type_register_static(&kvm_accel_type);
401692229a57SYang Zhong }
401792229a57SYang Zhong
401892229a57SYang Zhong type_init(kvm_type_init);
4019cc01a3f4SMark Kanda
4020cc01a3f4SMark Kanda typedef struct StatsArgs {
4021cc01a3f4SMark Kanda union StatsResultsType {
4022cc01a3f4SMark Kanda StatsResultList **stats;
4023cc01a3f4SMark Kanda StatsSchemaList **schema;
4024cc01a3f4SMark Kanda } result;
4025cf7405bcSPaolo Bonzini strList *names;
4026cc01a3f4SMark Kanda Error **errp;
4027cc01a3f4SMark Kanda } StatsArgs;
4028cc01a3f4SMark Kanda
4029cc01a3f4SMark Kanda static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
4030cc01a3f4SMark Kanda uint64_t *stats_data,
4031cc01a3f4SMark Kanda StatsList *stats_list,
4032cc01a3f4SMark Kanda Error **errp)
4033cc01a3f4SMark Kanda {
4034cc01a3f4SMark Kanda
4035cc01a3f4SMark Kanda Stats *stats;
4036cc01a3f4SMark Kanda uint64List *val_list = NULL;
4037cc01a3f4SMark Kanda
4038cc01a3f4SMark Kanda /* Only add stats that we understand. */
4039cc01a3f4SMark Kanda switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
4040cc01a3f4SMark Kanda case KVM_STATS_TYPE_CUMULATIVE:
4041cc01a3f4SMark Kanda case KVM_STATS_TYPE_INSTANT:
4042cc01a3f4SMark Kanda case KVM_STATS_TYPE_PEAK:
4043cc01a3f4SMark Kanda case KVM_STATS_TYPE_LINEAR_HIST:
4044cc01a3f4SMark Kanda case KVM_STATS_TYPE_LOG_HIST:
4045cc01a3f4SMark Kanda break;
4046cc01a3f4SMark Kanda default:
4047cc01a3f4SMark Kanda return stats_list;
4048cc01a3f4SMark Kanda }
4049cc01a3f4SMark Kanda
4050cc01a3f4SMark Kanda switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
4051cc01a3f4SMark Kanda case KVM_STATS_UNIT_NONE:
4052cc01a3f4SMark Kanda case KVM_STATS_UNIT_BYTES:
4053cc01a3f4SMark Kanda case KVM_STATS_UNIT_CYCLES:
4054cc01a3f4SMark Kanda case KVM_STATS_UNIT_SECONDS:
4055105bb7cdSPaolo Bonzini case KVM_STATS_UNIT_BOOLEAN:
4056cc01a3f4SMark Kanda break;
4057cc01a3f4SMark Kanda default:
4058cc01a3f4SMark Kanda return stats_list;
4059cc01a3f4SMark Kanda }
4060cc01a3f4SMark Kanda
4061cc01a3f4SMark Kanda switch (pdesc->flags & KVM_STATS_BASE_MASK) {
4062cc01a3f4SMark Kanda case KVM_STATS_BASE_POW10:
4063cc01a3f4SMark Kanda case KVM_STATS_BASE_POW2:
4064cc01a3f4SMark Kanda break;
4065cc01a3f4SMark Kanda default:
4066cc01a3f4SMark Kanda return stats_list;
4067cc01a3f4SMark Kanda }
4068cc01a3f4SMark Kanda
4069cc01a3f4SMark Kanda /* Alloc and populate data list */
4070cc01a3f4SMark Kanda stats = g_new0(Stats, 1);
4071cc01a3f4SMark Kanda stats->name = g_strdup(pdesc->name);
407244fd9cf6SZhao Liu stats->value = g_new0(StatsValue, 1);
4073cc01a3f4SMark Kanda
4074105bb7cdSPaolo Bonzini if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
4075105bb7cdSPaolo Bonzini stats->value->u.boolean = *stats_data;
4076105bb7cdSPaolo Bonzini stats->value->type = QTYPE_QBOOL;
4077105bb7cdSPaolo Bonzini } else if (pdesc->size == 1) {
4078cc01a3f4SMark Kanda stats->value->u.scalar = *stats_data;
4079cc01a3f4SMark Kanda stats->value->type = QTYPE_QNUM;
4080cc01a3f4SMark Kanda } else {
4081cc01a3f4SMark Kanda int i;
4082cc01a3f4SMark Kanda for (i = 0; i < pdesc->size; i++) {
4083cc01a3f4SMark Kanda QAPI_LIST_PREPEND(val_list, stats_data[i]);
4084cc01a3f4SMark Kanda }
4085cc01a3f4SMark Kanda stats->value->u.list = val_list;
4086cc01a3f4SMark Kanda stats->value->type = QTYPE_QLIST;
4087cc01a3f4SMark Kanda }
4088cc01a3f4SMark Kanda
4089cc01a3f4SMark Kanda QAPI_LIST_PREPEND(stats_list, stats);
4090cc01a3f4SMark Kanda return stats_list;
4091cc01a3f4SMark Kanda }
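
/*
 * Encoding summary for the entries built above: BOOLEAN-unit stats
 * become QTYPE_QBOOL, single-element stats become QTYPE_QNUM scalars,
 * and multi-element stats (e.g. histogram buckets) become a QTYPE_QLIST
 * of uint64 values.
 */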
4092cc01a3f4SMark Kanda
4093cc01a3f4SMark Kanda static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
4094cc01a3f4SMark Kanda StatsSchemaValueList *list,
4095cc01a3f4SMark Kanda Error **errp)
4096cc01a3f4SMark Kanda {
4097cc01a3f4SMark Kanda StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
4098cc01a3f4SMark Kanda schema_entry->value = g_new0(StatsSchemaValue, 1);
4099cc01a3f4SMark Kanda
4100cc01a3f4SMark Kanda switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
4101cc01a3f4SMark Kanda case KVM_STATS_TYPE_CUMULATIVE:
4102cc01a3f4SMark Kanda schema_entry->value->type = STATS_TYPE_CUMULATIVE;
4103cc01a3f4SMark Kanda break;
4104cc01a3f4SMark Kanda case KVM_STATS_TYPE_INSTANT:
4105cc01a3f4SMark Kanda schema_entry->value->type = STATS_TYPE_INSTANT;
4106cc01a3f4SMark Kanda break;
4107cc01a3f4SMark Kanda case KVM_STATS_TYPE_PEAK:
4108cc01a3f4SMark Kanda schema_entry->value->type = STATS_TYPE_PEAK;
4109cc01a3f4SMark Kanda break;
4110cc01a3f4SMark Kanda case KVM_STATS_TYPE_LINEAR_HIST:
4111cc01a3f4SMark Kanda schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
4112cc01a3f4SMark Kanda schema_entry->value->bucket_size = pdesc->bucket_size;
4113cc01a3f4SMark Kanda schema_entry->value->has_bucket_size = true;
4114cc01a3f4SMark Kanda break;
4115cc01a3f4SMark Kanda case KVM_STATS_TYPE_LOG_HIST:
4116cc01a3f4SMark Kanda schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
4117cc01a3f4SMark Kanda break;
4118cc01a3f4SMark Kanda default:
4119cc01a3f4SMark Kanda goto exit;
4120cc01a3f4SMark Kanda }
4121cc01a3f4SMark Kanda
4122cc01a3f4SMark Kanda switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
4123cc01a3f4SMark Kanda case KVM_STATS_UNIT_NONE:
4124cc01a3f4SMark Kanda break;
4125105bb7cdSPaolo Bonzini case KVM_STATS_UNIT_BOOLEAN:
4126105bb7cdSPaolo Bonzini schema_entry->value->has_unit = true;
4127105bb7cdSPaolo Bonzini schema_entry->value->unit = STATS_UNIT_BOOLEAN;
4128105bb7cdSPaolo Bonzini break;
4129cc01a3f4SMark Kanda case KVM_STATS_UNIT_BYTES:
4130cc01a3f4SMark Kanda schema_entry->value->has_unit = true;
4131cc01a3f4SMark Kanda schema_entry->value->unit = STATS_UNIT_BYTES;
4132cc01a3f4SMark Kanda break;
4133cc01a3f4SMark Kanda case KVM_STATS_UNIT_CYCLES:
4134cc01a3f4SMark Kanda schema_entry->value->has_unit = true;
4135cc01a3f4SMark Kanda schema_entry->value->unit = STATS_UNIT_CYCLES;
4136cc01a3f4SMark Kanda break;
4137cc01a3f4SMark Kanda case KVM_STATS_UNIT_SECONDS:
4138cc01a3f4SMark Kanda schema_entry->value->has_unit = true;
4139cc01a3f4SMark Kanda schema_entry->value->unit = STATS_UNIT_SECONDS;
4140cc01a3f4SMark Kanda break;
4141cc01a3f4SMark Kanda default:
4142cc01a3f4SMark Kanda goto exit;
4143cc01a3f4SMark Kanda }
4144cc01a3f4SMark Kanda
4145cc01a3f4SMark Kanda schema_entry->value->exponent = pdesc->exponent;
4146cc01a3f4SMark Kanda if (pdesc->exponent) {
4147cc01a3f4SMark Kanda switch (pdesc->flags & KVM_STATS_BASE_MASK) {
4148cc01a3f4SMark Kanda case KVM_STATS_BASE_POW10:
4149cc01a3f4SMark Kanda schema_entry->value->has_base = true;
4150cc01a3f4SMark Kanda schema_entry->value->base = 10;
4151cc01a3f4SMark Kanda break;
4152cc01a3f4SMark Kanda case KVM_STATS_BASE_POW2:
4153cc01a3f4SMark Kanda schema_entry->value->has_base = true;
4154cc01a3f4SMark Kanda schema_entry->value->base = 2;
4155cc01a3f4SMark Kanda break;
4156cc01a3f4SMark Kanda default:
4157cc01a3f4SMark Kanda goto exit;
4158cc01a3f4SMark Kanda }
4159cc01a3f4SMark Kanda }
4160cc01a3f4SMark Kanda
4161cc01a3f4SMark Kanda schema_entry->value->name = g_strdup(pdesc->name);
4162cc01a3f4SMark Kanda schema_entry->next = list;
4163cc01a3f4SMark Kanda return schema_entry;
4164cc01a3f4SMark Kanda exit:
4165cc01a3f4SMark Kanda g_free(schema_entry->value);
4166cc01a3f4SMark Kanda g_free(schema_entry);
4167cc01a3f4SMark Kanda return list;
4168cc01a3f4SMark Kanda }
4169cc01a3f4SMark Kanda
4170cc01a3f4SMark Kanda /* Cached stats descriptors */
4171cc01a3f4SMark Kanda typedef struct StatsDescriptors {
4172cc01a3f4SMark Kanda const char *ident; /* cache key, currently the StatsTarget */
4173cc01a3f4SMark Kanda struct kvm_stats_desc *kvm_stats_desc;
417421adec30SPaolo Bonzini struct kvm_stats_header kvm_stats_header;
4175cc01a3f4SMark Kanda QTAILQ_ENTRY(StatsDescriptors) next;
4176cc01a3f4SMark Kanda } StatsDescriptors;
4177cc01a3f4SMark Kanda
4178cc01a3f4SMark Kanda static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
4179cc01a3f4SMark Kanda QTAILQ_HEAD_INITIALIZER(stats_descriptors);
4180cc01a3f4SMark Kanda
4181cc01a3f4SMark Kanda /*
4182cc01a3f4SMark Kanda * Return the descriptors for 'target': either ones already cached, or
4183cc01a3f4SMark Kanda * ones freshly read from 'stats_fd'.
4184cc01a3f4SMark Kanda */
4185cc01a3f4SMark Kanda static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
4186cc01a3f4SMark Kanda Error **errp)
4187cc01a3f4SMark Kanda {
4188cc01a3f4SMark Kanda StatsDescriptors *descriptors;
4189cc01a3f4SMark Kanda const char *ident;
4190cc01a3f4SMark Kanda struct kvm_stats_desc *kvm_stats_desc;
4191cc01a3f4SMark Kanda struct kvm_stats_header *kvm_stats_header;
4192cc01a3f4SMark Kanda size_t size_desc;
4193cc01a3f4SMark Kanda ssize_t ret;
4194cc01a3f4SMark Kanda
4195cc01a3f4SMark Kanda ident = StatsTarget_str(target);
4196cc01a3f4SMark Kanda QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
4197cc01a3f4SMark Kanda if (g_str_equal(descriptors->ident, ident)) {
4198cc01a3f4SMark Kanda return descriptors;
4199cc01a3f4SMark Kanda }
4200cc01a3f4SMark Kanda }
4201cc01a3f4SMark Kanda
4202cc01a3f4SMark Kanda descriptors = g_new0(StatsDescriptors, 1);
4203cc01a3f4SMark Kanda
4204cc01a3f4SMark Kanda /* Read stats header */
420521adec30SPaolo Bonzini kvm_stats_header = &descriptors->kvm_stats_header;
42063b6f4852SMarcelo Tosatti ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
4207cc01a3f4SMark Kanda if (ret != sizeof(*kvm_stats_header)) {
4208cc01a3f4SMark Kanda error_setg(errp, "KVM stats: failed to read stats header: "
4209cc01a3f4SMark Kanda "expected %zu actual %zu",
4210cc01a3f4SMark Kanda sizeof(*kvm_stats_header), ret);
4211f696b74bSMiaoqian Lin g_free(descriptors);
4212cc01a3f4SMark Kanda return NULL;
4213cc01a3f4SMark Kanda }
4214cc01a3f4SMark Kanda size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4215cc01a3f4SMark Kanda
4216cc01a3f4SMark Kanda /* Read stats descriptors */
4217cc01a3f4SMark Kanda kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
4218cc01a3f4SMark Kanda ret = pread(stats_fd, kvm_stats_desc,
4219cc01a3f4SMark Kanda size_desc * kvm_stats_header->num_desc,
4220cc01a3f4SMark Kanda kvm_stats_header->desc_offset);
4221cc01a3f4SMark Kanda
4222cc01a3f4SMark Kanda if (ret != size_desc * kvm_stats_header->num_desc) {
4223cc01a3f4SMark Kanda error_setg(errp, "KVM stats: failed to read stats descriptors: "
4224cc01a3f4SMark Kanda "expected %zu actual %zu",
4225cc01a3f4SMark Kanda size_desc * kvm_stats_header->num_desc, ret);
4226cc01a3f4SMark Kanda g_free(descriptors);
4227cc01a3f4SMark Kanda g_free(kvm_stats_desc);
4228cc01a3f4SMark Kanda return NULL;
4229cc01a3f4SMark Kanda }
4230cc01a3f4SMark Kanda descriptors->kvm_stats_desc = kvm_stats_desc;
4231cc01a3f4SMark Kanda descriptors->ident = ident;
4232cc01a3f4SMark Kanda QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
4233cc01a3f4SMark Kanda return descriptors;
4234cc01a3f4SMark Kanda }
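
/*
 * Layout of a KVM binary stats fd, as consumed above and below (the
 * offsets come from struct kvm_stats_header):
 *
 *     offset 0           struct kvm_stats_header
 *     desc_offset        num_desc descriptors, each occupying
 *                        sizeof(struct kvm_stats_desc) + name_size bytes
 *     data_offset        the raw uint64 stats values
 */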
4235cc01a3f4SMark Kanda
4236cc01a3f4SMark Kanda static void query_stats(StatsResultList **result, StatsTarget target,
42373b6f4852SMarcelo Tosatti strList *names, int stats_fd, CPUState *cpu,
42383b6f4852SMarcelo Tosatti Error **errp)
4239cc01a3f4SMark Kanda {
4240cc01a3f4SMark Kanda struct kvm_stats_desc *kvm_stats_desc;
4241cc01a3f4SMark Kanda struct kvm_stats_header *kvm_stats_header;
4242cc01a3f4SMark Kanda StatsDescriptors *descriptors;
4243cc01a3f4SMark Kanda g_autofree uint64_t *stats_data = NULL;
4244cc01a3f4SMark Kanda struct kvm_stats_desc *pdesc;
4245cc01a3f4SMark Kanda StatsList *stats_list = NULL;
4246cc01a3f4SMark Kanda size_t size_desc, size_data = 0;
4247cc01a3f4SMark Kanda ssize_t ret;
4248cc01a3f4SMark Kanda int i;
4249cc01a3f4SMark Kanda
4250cc01a3f4SMark Kanda descriptors = find_stats_descriptors(target, stats_fd, errp);
4251cc01a3f4SMark Kanda if (!descriptors) {
4252cc01a3f4SMark Kanda return;
4253cc01a3f4SMark Kanda }
4254cc01a3f4SMark Kanda
425521adec30SPaolo Bonzini kvm_stats_header = &descriptors->kvm_stats_header;
4256cc01a3f4SMark Kanda kvm_stats_desc = descriptors->kvm_stats_desc;
4257cc01a3f4SMark Kanda size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4258cc01a3f4SMark Kanda
4259cc01a3f4SMark Kanda /* Tally the total size of the stats data */
4260cc01a3f4SMark Kanda for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4261cc01a3f4SMark Kanda pdesc = (void *)kvm_stats_desc + i * size_desc;
4262cc01a3f4SMark Kanda size_data += pdesc->size * sizeof(*stats_data);
4263cc01a3f4SMark Kanda }
4264cc01a3f4SMark Kanda
4265cc01a3f4SMark Kanda stats_data = g_malloc0(size_data);
4266cc01a3f4SMark Kanda ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);
4267cc01a3f4SMark Kanda
4268cc01a3f4SMark Kanda if (ret != size_data) {
4269cc01a3f4SMark Kanda error_setg(errp, "KVM stats: failed to read data: "
4270cc01a3f4SMark Kanda "expected %zu actual %zu", size_data, ret);
4271cc01a3f4SMark Kanda return;
4272cc01a3f4SMark Kanda }
4273cc01a3f4SMark Kanda
4274cc01a3f4SMark Kanda for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4275cc01a3f4SMark Kanda uint64_t *stats;
4276cc01a3f4SMark Kanda pdesc = (void *)kvm_stats_desc + i * size_desc;
4277cc01a3f4SMark Kanda
4278cc01a3f4SMark Kanda /* Add entry to the list */
4279cc01a3f4SMark Kanda stats = (void *)stats_data + pdesc->offset;
4280cf7405bcSPaolo Bonzini if (!apply_str_list_filter(pdesc->name, names)) {
4281cf7405bcSPaolo Bonzini continue;
4282cf7405bcSPaolo Bonzini }
4283cc01a3f4SMark Kanda stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
4284cc01a3f4SMark Kanda }
4285cc01a3f4SMark Kanda
4286cc01a3f4SMark Kanda if (!stats_list) {
4287cc01a3f4SMark Kanda return;
4288cc01a3f4SMark Kanda }
4289cc01a3f4SMark Kanda
4290cc01a3f4SMark Kanda switch (target) {
4291cc01a3f4SMark Kanda case STATS_TARGET_VM:
4292cc01a3f4SMark Kanda add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
4293cc01a3f4SMark Kanda break;
4294cc01a3f4SMark Kanda case STATS_TARGET_VCPU:
4295cc01a3f4SMark Kanda add_stats_entry(result, STATS_PROVIDER_KVM,
42963b6f4852SMarcelo Tosatti cpu->parent_obj.canonical_path,
4297cc01a3f4SMark Kanda stats_list);
4298cc01a3f4SMark Kanda break;
4299cc01a3f4SMark Kanda default:
4300d12dd9c7SPeter Maydell g_assert_not_reached();
4301cc01a3f4SMark Kanda }
4302cc01a3f4SMark Kanda }
4303cc01a3f4SMark Kanda
4304cc01a3f4SMark Kanda static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
4305cc01a3f4SMark Kanda int stats_fd, Error **errp)
4306cc01a3f4SMark Kanda {
4307cc01a3f4SMark Kanda struct kvm_stats_desc *kvm_stats_desc;
4308cc01a3f4SMark Kanda struct kvm_stats_header *kvm_stats_header;
4309cc01a3f4SMark Kanda StatsDescriptors *descriptors;
4310cc01a3f4SMark Kanda struct kvm_stats_desc *pdesc;
4311cc01a3f4SMark Kanda StatsSchemaValueList *stats_list = NULL;
4312cc01a3f4SMark Kanda size_t size_desc;
4313cc01a3f4SMark Kanda int i;
4314cc01a3f4SMark Kanda
4315cc01a3f4SMark Kanda descriptors = find_stats_descriptors(target, stats_fd, errp);
4316cc01a3f4SMark Kanda if (!descriptors) {
4317cc01a3f4SMark Kanda return;
4318cc01a3f4SMark Kanda }
4319cc01a3f4SMark Kanda
432021adec30SPaolo Bonzini kvm_stats_header = &descriptors->kvm_stats_header;
4321cc01a3f4SMark Kanda kvm_stats_desc = descriptors->kvm_stats_desc;
4322cc01a3f4SMark Kanda size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;
4323cc01a3f4SMark Kanda
4324cc01a3f4SMark Kanda /* Build a schema entry for each descriptor */
4325cc01a3f4SMark Kanda for (i = 0; i < kvm_stats_header->num_desc; ++i) {
4326cc01a3f4SMark Kanda pdesc = (void *)kvm_stats_desc + i * size_desc;
4327cc01a3f4SMark Kanda stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
4328cc01a3f4SMark Kanda }
4329cc01a3f4SMark Kanda
4330cc01a3f4SMark Kanda add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
4331cc01a3f4SMark Kanda }
4332cc01a3f4SMark Kanda
43333b6f4852SMarcelo Tosatti static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4334cc01a3f4SMark Kanda {
43353b6f4852SMarcelo Tosatti int stats_fd = cpu->kvm_vcpu_stats_fd;
4336cc01a3f4SMark Kanda Error *local_err = NULL;
4337cc01a3f4SMark Kanda
4338cc01a3f4SMark Kanda if (stats_fd == -1) {
4339cc01a3f4SMark Kanda error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4340cc01a3f4SMark Kanda error_propagate(kvm_stats_args->errp, local_err);
4341cc01a3f4SMark Kanda return;
4342cc01a3f4SMark Kanda }
4343cf7405bcSPaolo Bonzini query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
43443b6f4852SMarcelo Tosatti kvm_stats_args->names, stats_fd, cpu,
43453b6f4852SMarcelo Tosatti kvm_stats_args->errp);
4346cc01a3f4SMark Kanda }
4347cc01a3f4SMark Kanda
43483b6f4852SMarcelo Tosatti static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
4349cc01a3f4SMark Kanda {
43503b6f4852SMarcelo Tosatti int stats_fd = cpu->kvm_vcpu_stats_fd;
4351cc01a3f4SMark Kanda Error *local_err = NULL;
4352cc01a3f4SMark Kanda
4353cc01a3f4SMark Kanda if (stats_fd == -1) {
4354cc01a3f4SMark Kanda error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
4355cc01a3f4SMark Kanda error_propagate(kvm_stats_args->errp, local_err);
4356cc01a3f4SMark Kanda return;
4357cc01a3f4SMark Kanda }
4358cc01a3f4SMark Kanda query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
4359cc01a3f4SMark Kanda kvm_stats_args->errp);
4360cc01a3f4SMark Kanda }
4361cc01a3f4SMark Kanda
4362467ef823SPaolo Bonzini static void query_stats_cb(StatsResultList **result, StatsTarget target,
4363cf7405bcSPaolo Bonzini strList *names, strList *targets, Error **errp)
4364cc01a3f4SMark Kanda {
4365cc01a3f4SMark Kanda KVMState *s = kvm_state;
4366cc01a3f4SMark Kanda CPUState *cpu;
4367cc01a3f4SMark Kanda int stats_fd;
4368cc01a3f4SMark Kanda
4369cc01a3f4SMark Kanda switch (target) {
4370cc01a3f4SMark Kanda case STATS_TARGET_VM:
4371cc01a3f4SMark Kanda {
4372cc01a3f4SMark Kanda stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4373cc01a3f4SMark Kanda if (stats_fd == -1) {
4374cc01a3f4SMark Kanda error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4375cc01a3f4SMark Kanda return;
4376cc01a3f4SMark Kanda }
43773b6f4852SMarcelo Tosatti query_stats(result, target, names, stats_fd, NULL, errp);
4378cc01a3f4SMark Kanda close(stats_fd);
4379cc01a3f4SMark Kanda break;
4380cc01a3f4SMark Kanda }
4381cc01a3f4SMark Kanda case STATS_TARGET_VCPU:
4382cc01a3f4SMark Kanda {
4383cc01a3f4SMark Kanda StatsArgs stats_args;
4384cc01a3f4SMark Kanda stats_args.result.stats = result;
4385cf7405bcSPaolo Bonzini stats_args.names = names;
4386cc01a3f4SMark Kanda stats_args.errp = errp;
4387cc01a3f4SMark Kanda CPU_FOREACH(cpu) {
4388467ef823SPaolo Bonzini if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
4389467ef823SPaolo Bonzini continue;
4390467ef823SPaolo Bonzini }
43913b6f4852SMarcelo Tosatti query_stats_vcpu(cpu, &stats_args);
4392cc01a3f4SMark Kanda }
4393cc01a3f4SMark Kanda break;
4394cc01a3f4SMark Kanda }
4395cc01a3f4SMark Kanda default:
4396cc01a3f4SMark Kanda break;
4397cc01a3f4SMark Kanda }
4398cc01a3f4SMark Kanda }
4399cc01a3f4SMark Kanda
4400cc01a3f4SMark Kanda void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
4401cc01a3f4SMark Kanda {
4402cc01a3f4SMark Kanda StatsArgs stats_args;
4403cc01a3f4SMark Kanda KVMState *s = kvm_state;
4404cc01a3f4SMark Kanda int stats_fd;
4405cc01a3f4SMark Kanda
4406cc01a3f4SMark Kanda stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
4407cc01a3f4SMark Kanda if (stats_fd == -1) {
4408cc01a3f4SMark Kanda error_setg_errno(errp, errno, "KVM stats: ioctl failed");
4409cc01a3f4SMark Kanda return;
4410cc01a3f4SMark Kanda }
4411cc01a3f4SMark Kanda query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
4412cc01a3f4SMark Kanda close(stats_fd);
4413cc01a3f4SMark Kanda
4414a9197ad2SPaolo Bonzini if (first_cpu) {
4415cc01a3f4SMark Kanda stats_args.result.schema = result;
4416cc01a3f4SMark Kanda stats_args.errp = errp;
44173b6f4852SMarcelo Tosatti query_stats_schema_vcpu(first_cpu, &stats_args);
4418cc01a3f4SMark Kanda }
4419a9197ad2SPaolo Bonzini }
44205c3131c3SPaolo Bonzini
44215c3131c3SPaolo Bonzini void kvm_mark_guest_state_protected(void)
44225c3131c3SPaolo Bonzini {
44235c3131c3SPaolo Bonzini kvm_state->guest_state_protected = true;
44245c3131c3SPaolo Bonzini }
442515f7a80cSXiaoyao Li
442615f7a80cSXiaoyao Li int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
442715f7a80cSXiaoyao Li {
442815f7a80cSXiaoyao Li int fd;
442915f7a80cSXiaoyao Li struct kvm_create_guest_memfd guest_memfd = {
443015f7a80cSXiaoyao Li .size = size,
443115f7a80cSXiaoyao Li .flags = flags,
443215f7a80cSXiaoyao Li };
443315f7a80cSXiaoyao Li
443415f7a80cSXiaoyao Li if (!kvm_guest_memfd_supported) {
443515f7a80cSXiaoyao Li error_setg(errp, "KVM does not support guest_memfd");
443615f7a80cSXiaoyao Li return -1;
443715f7a80cSXiaoyao Li }
443815f7a80cSXiaoyao Li
443915f7a80cSXiaoyao Li fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
444015f7a80cSXiaoyao Li if (fd < 0) {
444115f7a80cSXiaoyao Li error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
444215f7a80cSXiaoyao Li return -1;
444315f7a80cSXiaoyao Li }
444415f7a80cSXiaoyao Li
444515f7a80cSXiaoyao Li return fd;
444615f7a80cSXiaoyao Li }
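
/*
 * Usage sketch (hypothetical caller; the size and flags are
 * illustrative, and MiB is assumed from qemu/units.h):
 *
 *     Error *err = NULL;
 *     int fd = kvm_create_guest_memfd(64 * MiB, 0, &err);
 *     if (fd < 0) {
 *         error_report_err(err);
 *     }
 */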