1 /*
2 * QEMU KVM support
3 *
4 * Copyright IBM, Corp. 2008
5 * Red Hat, Inc. 2008
6 *
7 * Authors:
8 * Anthony Liguori <aliguori@us.ibm.com>
9 * Glauber Costa <gcosta@redhat.com>
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
13 *
14 */
15
16 #include "qemu/osdep.h"
17 #include <sys/ioctl.h>
18 #include <poll.h>
19
20 #include <linux/kvm.h>
21
22 #include "qemu/atomic.h"
23 #include "qemu/option.h"
24 #include "qemu/config-file.h"
25 #include "qemu/error-report.h"
26 #include "qapi/error.h"
27 #include "hw/pci/msi.h"
28 #include "hw/pci/msix.h"
29 #include "hw/s390x/adapter.h"
30 #include "gdbstub/enums.h"
31 #include "sysemu/kvm_int.h"
32 #include "sysemu/runstate.h"
33 #include "sysemu/cpus.h"
34 #include "sysemu/accel-blocker.h"
35 #include "qemu/bswap.h"
36 #include "exec/memory.h"
37 #include "exec/ram_addr.h"
38 #include "qemu/event_notifier.h"
39 #include "qemu/main-loop.h"
40 #include "trace.h"
41 #include "hw/irq.h"
42 #include "qapi/visitor.h"
43 #include "qapi/qapi-types-common.h"
44 #include "qapi/qapi-visit-common.h"
45 #include "sysemu/reset.h"
46 #include "qemu/guest-random.h"
47 #include "sysemu/hw_accel.h"
48 #include "kvm-cpus.h"
49 #include "sysemu/dirtylimit.h"
50 #include "qemu/range.h"
51
52 #include "hw/boards.h"
53 #include "sysemu/stats.h"
54
55 /* This check must be after config-host.h is included */
56 #ifdef CONFIG_EVENTFD
57 #include <sys/eventfd.h>
58 #endif
59
60 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
61 * need to use the real host PAGE_SIZE, as that's what KVM will use.
62 */
63 #ifdef PAGE_SIZE
64 #undef PAGE_SIZE
65 #endif
66 #define PAGE_SIZE qemu_real_host_page_size()
67
68 #ifndef KVM_GUESTDBG_BLOCKIRQ
69 #define KVM_GUESTDBG_BLOCKIRQ 0
70 #endif
71
72 /* Default number of memslots to allocate when the VM starts */
73 #define KVM_MEMSLOTS_NR_ALLOC_DEFAULT 16
74 /* Default max allowed memslots if kernel reported nothing */
75 #define KVM_MEMSLOTS_NR_MAX_DEFAULT 32
76
77 struct KVMParkedVcpu {
78 unsigned long vcpu_id;
79 int kvm_fd;
80 QLIST_ENTRY(KVMParkedVcpu) node;
81 };
82
83 KVMState *kvm_state;
84 bool kvm_kernel_irqchip;
85 bool kvm_split_irqchip;
86 bool kvm_async_interrupts_allowed;
87 bool kvm_halt_in_kernel_allowed;
88 bool kvm_resamplefds_allowed;
89 bool kvm_msi_via_irqfd_allowed;
90 bool kvm_gsi_routing_allowed;
91 bool kvm_gsi_direct_mapping;
92 bool kvm_allowed;
93 bool kvm_readonly_mem_allowed;
94 bool kvm_vm_attributes_allowed;
95 bool kvm_msi_use_devid;
96 static bool kvm_has_guest_debug;
97 static int kvm_sstep_flags;
98 static bool kvm_immediate_exit;
99 static uint64_t kvm_supported_memory_attributes;
100 static bool kvm_guest_memfd_supported;
101 static hwaddr kvm_max_slot_size = ~0;
102
103 static const KVMCapabilityInfo kvm_required_capabilites[] = {
104 KVM_CAP_INFO(USER_MEMORY),
105 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
106 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
107 KVM_CAP_INFO(INTERNAL_ERROR_DATA),
108 KVM_CAP_INFO(IOEVENTFD),
109 KVM_CAP_INFO(IOEVENTFD_ANY_LENGTH),
110 KVM_CAP_LAST_INFO
111 };
112
113 static NotifierList kvm_irqchip_change_notifiers =
114 NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
115
116 struct KVMResampleFd {
117 int gsi;
118 EventNotifier *resample_event;
119 QLIST_ENTRY(KVMResampleFd) node;
120 };
121 typedef struct KVMResampleFd KVMResampleFd;
122
123 /*
124 * Only used with split irqchip where we need to do the resample fd
125 * kick for the kernel from userspace.
126 */
127 static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
128 QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
129
130 static QemuMutex kml_slots_lock;
131
132 #define kvm_slots_lock() qemu_mutex_lock(&kml_slots_lock)
133 #define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock)
134
135 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
136
137 static inline void kvm_resample_fd_remove(int gsi)
138 {
139 KVMResampleFd *rfd;
140
141 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
142 if (rfd->gsi == gsi) {
143 QLIST_REMOVE(rfd, node);
144 g_free(rfd);
145 break;
146 }
147 }
148 }
149
150 static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
151 {
152 KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
153
154 rfd->gsi = gsi;
155 rfd->resample_event = event;
156
157 QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
158 }
159
160 void kvm_resample_fd_notify(int gsi)
161 {
162 KVMResampleFd *rfd;
163
164 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
165 if (rfd->gsi == gsi) {
166 event_notifier_set(rfd->resample_event);
167 trace_kvm_resample_fd_notify(gsi);
168 return;
169 }
170 }
171 }
172
173 /**
174 * kvm_slots_grow(): Grow the slots[] array in the KVMMemoryListener
175 *
176  * @kml: The KVMMemoryListener* whose slots[] array should be grown
177  * @nr_slots_new: The new size of the slots[] array
178 *
179 * Returns: True if the array grows larger, false otherwise.
180 */
181 static bool kvm_slots_grow(KVMMemoryListener *kml, unsigned int nr_slots_new)
182 {
183 unsigned int i, cur = kml->nr_slots_allocated;
184 KVMSlot *slots;
185
186 if (nr_slots_new > kvm_state->nr_slots_max) {
187 nr_slots_new = kvm_state->nr_slots_max;
188 }
189
190 if (cur >= nr_slots_new) {
191 /* Big enough, no need to grow, or we reached max */
192 return false;
193 }
194
195 if (cur == 0) {
196 slots = g_new0(KVMSlot, nr_slots_new);
197 } else {
198 assert(kml->slots);
199 slots = g_renew(KVMSlot, kml->slots, nr_slots_new);
200 /*
201          * g_renew() doesn't initialize the extended buffer; however, KVM
202          * memslots require their fields (pointers, memory_size, etc.) to
203          * be zero-initialized.
204 */
205 memset(&slots[cur], 0x0, sizeof(slots[0]) * (nr_slots_new - cur));
206 }
207
208 for (i = cur; i < nr_slots_new; i++) {
209 slots[i].slot = i;
210 }
211
212 kml->slots = slots;
213 kml->nr_slots_allocated = nr_slots_new;
214 trace_kvm_slots_grow(cur, nr_slots_new);
215
216 return true;
217 }
218
219 static bool kvm_slots_double(KVMMemoryListener *kml)
220 {
221 return kvm_slots_grow(kml, kml->nr_slots_allocated * 2);
222 }
223
224 unsigned int kvm_get_max_memslots(void)
225 {
226 KVMState *s = KVM_STATE(current_accel());
227
228 return s->nr_slots_max;
229 }
230
231 unsigned int kvm_get_free_memslots(void)
232 {
233 unsigned int used_slots = 0;
234 KVMState *s = kvm_state;
235 int i;
236
237 kvm_slots_lock();
238 for (i = 0; i < s->nr_as; i++) {
239 if (!s->as[i].ml) {
240 continue;
241 }
242 used_slots = MAX(used_slots, s->as[i].ml->nr_slots_used);
243 }
244 kvm_slots_unlock();
245
246 return s->nr_slots_max - used_slots;
247 }
248
249 /* Called with KVMMemoryListener.slots_lock held */
250 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
251 {
252 unsigned int n;
253 int i;
254
255 for (i = 0; i < kml->nr_slots_allocated; i++) {
256 if (kml->slots[i].memory_size == 0) {
257 return &kml->slots[i];
258 }
259 }
260
261 /*
262 * If no free slots, try to grow first by doubling. Cache the old size
263 * here to avoid another round of search: if the grow succeeded, it
264 * means slots[] now must have the existing "n" slots occupied,
265 * followed by one or more free slots starting from slots[n].
266 */
267 n = kml->nr_slots_allocated;
268 if (kvm_slots_double(kml)) {
269 return &kml->slots[n];
270 }
271
272 return NULL;
273 }
274
275 /* Called with KVMMemoryListener.slots_lock held */
276 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
277 {
278 KVMSlot *slot = kvm_get_free_slot(kml);
279
280 if (slot) {
281 return slot;
282 }
283
284 fprintf(stderr, "%s: no free slot available\n", __func__);
285 abort();
286 }
287
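/* Return the slot that exactly matches [start_addr, start_addr + size), or NULL. */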
288 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
289 hwaddr start_addr,
290 hwaddr size)
291 {
292 int i;
293
294 for (i = 0; i < kml->nr_slots_allocated; i++) {
295 KVMSlot *mem = &kml->slots[i];
296
297 if (start_addr == mem->start_addr && size == mem->memory_size) {
298 return mem;
299 }
300 }
301
302 return NULL;
303 }
304
305 /*
306 * Calculate and align the start address and the size of the section.
307 * Return the size. If the size is 0, the aligned section is empty.
308 */
309 static hwaddr kvm_align_section(MemoryRegionSection *section,
310 hwaddr *start)
311 {
312 hwaddr size = int128_get64(section->size);
313 hwaddr delta, aligned;
314
315     /* KVM works in page-size chunks, but this function may be called with a
316        sub-page size and an unaligned start address. Round the start address up
317        to the next page boundary and truncate the size down to the previous one. */
318 aligned = ROUND_UP(section->offset_within_address_space,
319 qemu_real_host_page_size());
320 delta = aligned - section->offset_within_address_space;
321 *start = aligned;
322 if (delta > size) {
323 return 0;
324 }
325
326 return (size - delta) & qemu_real_host_page_mask();
327 }
328
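/*
 * Translate a host userspace address inside a registered slot back to its
 * guest physical address. Returns 1 if found, 0 otherwise.
 */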
329 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
330 hwaddr *phys_addr)
331 {
332 KVMMemoryListener *kml = &s->memory_listener;
333 int i, ret = 0;
334
335 kvm_slots_lock();
336 for (i = 0; i < kml->nr_slots_allocated; i++) {
337 KVMSlot *mem = &kml->slots[i];
338
339 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
340 *phys_addr = mem->start_addr + (ram - mem->ram);
341 ret = 1;
342 break;
343 }
344 }
345 kvm_slots_unlock();
346
347 return ret;
348 }
349
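/*
 * Program one KVMSlot into the kernel via KVM_SET_USER_MEMORY_REGION(2),
 * deleting and re-creating the slot first when KVM_MEM_READONLY is toggled.
 */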
350 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
351 {
352 KVMState *s = kvm_state;
353 struct kvm_userspace_memory_region2 mem;
354 int ret;
355
356 mem.slot = slot->slot | (kml->as_id << 16);
357 mem.guest_phys_addr = slot->start_addr;
358 mem.userspace_addr = (unsigned long)slot->ram;
359 mem.flags = slot->flags;
360 mem.guest_memfd = slot->guest_memfd;
361 mem.guest_memfd_offset = slot->guest_memfd_offset;
362
363 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
364 /* Set the slot size to 0 before setting the slot to the desired
365 * value. This is needed based on KVM commit 75d61fbc. */
366 mem.memory_size = 0;
367
368 if (kvm_guest_memfd_supported) {
369 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
370 } else {
371 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
372 }
373 if (ret < 0) {
374 goto err;
375 }
376 }
377 mem.memory_size = slot->memory_size;
378 if (kvm_guest_memfd_supported) {
379 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem);
380 } else {
381 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
382 }
383 slot->old_flags = mem.flags;
384 err:
385 trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags,
386 mem.guest_phys_addr, mem.memory_size,
387 mem.userspace_addr, mem.guest_memfd,
388 mem.guest_memfd_offset, ret);
389 if (ret < 0) {
390 if (kvm_guest_memfd_supported) {
391 error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d,"
392 " start=0x%" PRIx64 ", size=0x%" PRIx64 ","
393 " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 ","
394 " guest_memfd_offset=0x%" PRIx64 ": %s",
395 __func__, mem.slot, slot->start_addr,
396 (uint64_t)mem.memory_size, mem.flags,
397 mem.guest_memfd, (uint64_t)mem.guest_memfd_offset,
398 strerror(errno));
399 } else {
400 error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
401 " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
402 __func__, mem.slot, slot->start_addr,
403 (uint64_t)mem.memory_size, strerror(errno));
404 }
405 }
406 return ret;
407 }
408
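/*
 * Record the vCPU's KVM fd on the parked list so that a later
 * kvm_create_vcpu() for the same vcpu_id can reuse it.
 */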
409 void kvm_park_vcpu(CPUState *cpu)
410 {
411 struct KVMParkedVcpu *vcpu;
412
413 trace_kvm_park_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
414
415 vcpu = g_malloc0(sizeof(*vcpu));
416 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
417 vcpu->kvm_fd = cpu->kvm_fd;
418 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
419 }
420
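/*
 * Remove the vCPU with @vcpu_id from the parked list and return its fd,
 * or -ENOENT if no such vCPU is parked.
 */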
421 int kvm_unpark_vcpu(KVMState *s, unsigned long vcpu_id)
422 {
423 struct KVMParkedVcpu *cpu;
424 int kvm_fd = -ENOENT;
425
426 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
427 if (cpu->vcpu_id == vcpu_id) {
428 QLIST_REMOVE(cpu, node);
429 kvm_fd = cpu->kvm_fd;
430 g_free(cpu);
431 break;
432 }
433 }
434
435 trace_kvm_unpark_vcpu(vcpu_id, kvm_fd > 0 ? "unparked" : "!found parked");
436
437 return kvm_fd;
438 }
439
440 int kvm_create_vcpu(CPUState *cpu)
441 {
442 unsigned long vcpu_id = kvm_arch_vcpu_id(cpu);
443 KVMState *s = kvm_state;
444 int kvm_fd;
445
446     /* check if the KVM vCPU already exists but is parked */
447 kvm_fd = kvm_unpark_vcpu(s, vcpu_id);
448 if (kvm_fd < 0) {
449 /* vCPU not parked: create a new KVM vCPU */
450 kvm_fd = kvm_vm_ioctl(s, KVM_CREATE_VCPU, vcpu_id);
451 if (kvm_fd < 0) {
452 error_report("KVM_CREATE_VCPU IOCTL failed for vCPU %lu", vcpu_id);
453 return kvm_fd;
454 }
455 }
456
457 cpu->kvm_fd = kvm_fd;
458 cpu->kvm_state = s;
459 cpu->vcpu_dirty = true;
460 cpu->dirty_pages = 0;
461 cpu->throttle_us_per_full = 0;
462
463 trace_kvm_create_vcpu(cpu->cpu_index, vcpu_id, kvm_fd);
464
465 return 0;
466 }
467
468 int kvm_create_and_park_vcpu(CPUState *cpu)
469 {
470 int ret = 0;
471
472 ret = kvm_create_vcpu(cpu);
473 if (!ret) {
474 kvm_park_vcpu(cpu);
475 }
476
477 return ret;
478 }
479
480 static int do_kvm_destroy_vcpu(CPUState *cpu)
481 {
482 KVMState *s = kvm_state;
483 int mmap_size;
484 int ret = 0;
485
486 trace_kvm_destroy_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
487
488 ret = kvm_arch_destroy_vcpu(cpu);
489 if (ret < 0) {
490 goto err;
491 }
492
493 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
494 if (mmap_size < 0) {
495 ret = mmap_size;
496 trace_kvm_failed_get_vcpu_mmap_size();
497 goto err;
498 }
499
500 ret = munmap(cpu->kvm_run, mmap_size);
501 if (ret < 0) {
502 goto err;
503 }
504
505 if (cpu->kvm_dirty_gfns) {
506 ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
507 if (ret < 0) {
508 goto err;
509 }
510 }
511
512 kvm_park_vcpu(cpu);
513 err:
514 return ret;
515 }
516
517 void kvm_destroy_vcpu(CPUState *cpu)
518 {
519 if (do_kvm_destroy_vcpu(cpu) < 0) {
520 error_report("kvm_destroy_vcpu failed");
521 exit(EXIT_FAILURE);
522 }
523 }
524
525 int kvm_init_vcpu(CPUState *cpu, Error **errp)
526 {
527 KVMState *s = kvm_state;
528 int mmap_size;
529 int ret;
530
531 trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
532
533 ret = kvm_create_vcpu(cpu);
534 if (ret < 0) {
535 error_setg_errno(errp, -ret,
536 "kvm_init_vcpu: kvm_create_vcpu failed (%lu)",
537 kvm_arch_vcpu_id(cpu));
538 goto err;
539 }
540
541 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
542 if (mmap_size < 0) {
543 ret = mmap_size;
544 error_setg_errno(errp, -mmap_size,
545 "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
546 goto err;
547 }
548
549 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
550 cpu->kvm_fd, 0);
551 if (cpu->kvm_run == MAP_FAILED) {
552 ret = -errno;
553 error_setg_errno(errp, ret,
554 "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
555 kvm_arch_vcpu_id(cpu));
556 goto err;
557 }
558
559 if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
560 s->coalesced_mmio_ring =
561 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
562 }
563
564 if (s->kvm_dirty_ring_size) {
565 /* Use MAP_SHARED to share pages with the kernel */
566 cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
567 PROT_READ | PROT_WRITE, MAP_SHARED,
568 cpu->kvm_fd,
569 PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
570 if (cpu->kvm_dirty_gfns == MAP_FAILED) {
571 ret = -errno;
572 goto err;
573 }
574 }
575
576 ret = kvm_arch_init_vcpu(cpu);
577 if (ret < 0) {
578 error_setg_errno(errp, -ret,
579 "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
580 kvm_arch_vcpu_id(cpu));
581 }
582 cpu->kvm_vcpu_stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
583
584 err:
585 return ret;
586 }
587
588 /*
589 * dirty pages logging control
590 */
591
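/* Derive the KVM memslot flags from a MemoryRegion's properties. */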
592 static int kvm_mem_flags(MemoryRegion *mr)
593 {
594 bool readonly = mr->readonly || memory_region_is_romd(mr);
595 int flags = 0;
596
597 if (memory_region_get_dirty_log_mask(mr) != 0) {
598 flags |= KVM_MEM_LOG_DIRTY_PAGES;
599 }
600 if (readonly && kvm_readonly_mem_allowed) {
601 flags |= KVM_MEM_READONLY;
602 }
603 if (memory_region_has_guest_memfd(mr)) {
604 assert(kvm_guest_memfd_supported);
605 flags |= KVM_MEM_GUEST_MEMFD;
606 }
607 return flags;
608 }
609
610 /* Called with KVMMemoryListener.slots_lock held */
611 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
612 MemoryRegion *mr)
613 {
614 mem->flags = kvm_mem_flags(mr);
615
616 /* If nothing changed effectively, no need to issue ioctl */
617 if (mem->flags == mem->old_flags) {
618 return 0;
619 }
620
621 kvm_slot_init_dirty_bitmap(mem);
622 return kvm_set_user_memory_region(kml, mem, false);
623 }
624
625 static int kvm_section_update_flags(KVMMemoryListener *kml,
626 MemoryRegionSection *section)
627 {
628 hwaddr start_addr, size, slot_size;
629 KVMSlot *mem;
630 int ret = 0;
631
632 size = kvm_align_section(section, &start_addr);
633 if (!size) {
634 return 0;
635 }
636
637 kvm_slots_lock();
638
639 while (size && !ret) {
640 slot_size = MIN(kvm_max_slot_size, size);
641 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
642 if (!mem) {
643 /* We don't have a slot if we want to trap every access. */
644 goto out;
645 }
646
647 ret = kvm_slot_update_flags(kml, mem, section->mr);
648 start_addr += slot_size;
649 size -= slot_size;
650 }
651
652 out:
653 kvm_slots_unlock();
654 return ret;
655 }
656
657 static void kvm_log_start(MemoryListener *listener,
658 MemoryRegionSection *section,
659 int old, int new)
660 {
661 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
662 int r;
663
664 if (old != 0) {
665 return;
666 }
667
668 r = kvm_section_update_flags(kml, section);
669 if (r < 0) {
670 abort();
671 }
672 }
673
674 static void kvm_log_stop(MemoryListener *listener,
675 MemoryRegionSection *section,
676 int old, int new)
677 {
678 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
679 int r;
680
681 if (new != 0) {
682 return;
683 }
684
685 r = kvm_section_update_flags(kml, section);
686 if (r < 0) {
687 abort();
688 }
689 }
690
691 /* get kvm's dirty pages bitmap and update qemu's */
692 static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
693 {
694 ram_addr_t start = slot->ram_start_offset;
695 ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();
696
697 cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
698 }
699
700 static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
701 {
702 memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
703 }
704
705 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))
706
707 /* Allocate the dirty bitmap for a slot */
708 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
709 {
710 if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
711 return;
712 }
713
714 /*
715 * XXX bad kernel interface alert
716 * For dirty bitmap, kernel allocates array of size aligned to
717      * bits-per-long. But when the kernel is 64-bit and
718      * userspace is 32-bit, userspace can't align to the same
719      * bits-per-long, since sizeof(long) differs between kernel
720      * and user space. Userspace would then provide a buffer that
721      * may be 4 bytes smaller than the kernel expects, resulting in
722      * userspace memory corruption (which, in most cases, valgrind
723      * cannot even detect).
724      * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
725      * the hope that sizeof(long) won't become >8 any time soon.
726 *
727 * Note: the granule of kvm dirty log is qemu_real_host_page_size.
728 * And mem->memory_size is aligned to it (otherwise this mem can't
729 * be registered to KVM).
730 */
731 hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
732 /*HOST_LONG_BITS*/ 64) / 8;
733 mem->dirty_bmap = g_malloc0(bitmap_size);
734 mem->dirty_bmap_size = bitmap_size;
735 }
736
737 /*
738 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
739 * succeeded, false otherwise
740 */
741 static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
742 {
743 struct kvm_dirty_log d = {};
744 int ret;
745
746 d.dirty_bitmap = slot->dirty_bmap;
747 d.slot = slot->slot | (slot->as_id << 16);
748 ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
749
750 if (ret == -ENOENT) {
751 /* kernel does not have dirty bitmap in this slot */
752 ret = 0;
753 }
754 if (ret) {
755 error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
756 __func__, ret);
757 }
758 return ret == 0;
759 }
760
761 /* Should be called with all slots_lock held for the address spaces. */
762 static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
763 uint32_t slot_id, uint64_t offset)
764 {
765 KVMMemoryListener *kml;
766 KVMSlot *mem;
767
768 if (as_id >= s->nr_as) {
769 return;
770 }
771
772 kml = s->as[as_id].ml;
773 mem = &kml->slots[slot_id];
774
775 if (!mem->memory_size || offset >=
776 (mem->memory_size / qemu_real_host_page_size())) {
777 return;
778 }
779
780 set_bit(offset, mem->dirty_bmap);
781 }
782
783 static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
784 {
785 /*
786 * Read the flags before the value. Pairs with barrier in
787 * KVM's kvm_dirty_ring_push() function.
788 */
789 return qatomic_load_acquire(&gfn->flags) == KVM_DIRTY_GFN_F_DIRTY;
790 }
791
792 static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
793 {
794 /*
795 * Use a store-release so that the CPU that executes KVM_RESET_DIRTY_RINGS
796 * sees the full content of the ring:
797 *
798 * CPU0 CPU1 CPU2
799 * ------------------------------------------------------------------------------
800 * fill gfn0
801 * store-rel flags for gfn0
802 * load-acq flags for gfn0
803 * store-rel RESET for gfn0
804 * ioctl(RESET_RINGS)
805 * load-acq flags for gfn0
806 * check if flags have RESET
807 *
808 * The synchronization goes from CPU2 to CPU0 to CPU1.
809 */
810 qatomic_store_release(&gfn->flags, KVM_DIRTY_GFN_F_RESET);
811 }
812
813 /*
814  * Should be called with all slots_lock held for the address spaces. It
815  * returns the number of dirty pages collected from this dirty ring.
816 */
817 static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
818 {
819 struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
820 uint32_t ring_size = s->kvm_dirty_ring_size;
821 uint32_t count = 0, fetch = cpu->kvm_fetch_index;
822
823 /*
824      * It's possible that we race with the vcpu creation code, where the
825      * vcpu has been put onto the vcpus list but its dirty ring structures
826      * are not yet initialized. If so, skip it.
827 */
828 if (!cpu->created) {
829 return 0;
830 }
831
832 assert(dirty_gfns && ring_size);
833 trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
834
835 while (true) {
836 cur = &dirty_gfns[fetch % ring_size];
837 if (!dirty_gfn_is_dirtied(cur)) {
838 break;
839 }
840 kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
841 cur->offset);
842 dirty_gfn_set_collected(cur);
843 trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
844 fetch++;
845 count++;
846 }
847 cpu->kvm_fetch_index = fetch;
848 cpu->dirty_pages += count;
849
850 return count;
851 }
852
853 /* Must be with slots_lock held */
854 static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
855 {
856 int ret;
857 uint64_t total = 0;
858 int64_t stamp;
859
860 stamp = get_clock();
861
862 if (cpu) {
863 total = kvm_dirty_ring_reap_one(s, cpu);
864 } else {
865 CPU_FOREACH(cpu) {
866 total += kvm_dirty_ring_reap_one(s, cpu);
867 }
868 }
869
870 if (total) {
871 ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
872 assert(ret == total);
873 }
874
875 stamp = get_clock() - stamp;
876
877 if (total) {
878 trace_kvm_dirty_ring_reap(total, stamp / 1000);
879 }
880
881 return total;
882 }
883
884 /*
885  * Currently, for simplicity, the BQL must be held when calling this. We can
886  * consider dropping the BQL once we're confident about all the race conditions.
887 */
888 static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
889 {
890 uint64_t total;
891
892 /*
893 * We need to lock all kvm slots for all address spaces here,
894 * because:
895 *
896 * (1) We need to mark dirty for dirty bitmaps in multiple slots
897 * and for tons of pages, so it's better to take the lock here
898 * once rather than once per page. And more importantly,
899 *
900 * (2) We must _NOT_ publish dirty bits to the other threads
901 * (e.g., the migration thread) via the kvm memory slot dirty
902  * bitmaps before correctly re-protecting those dirtied pages.
903  * Otherwise we risk data corruption if the page data is read by
904  * another thread before we do the
905  * reset below.
906 */
907 kvm_slots_lock();
908 total = kvm_dirty_ring_reap_locked(s, cpu);
909 kvm_slots_unlock();
910
911 return total;
912 }
913
914 static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
915 {
916 /* No need to do anything */
917 }
918
919 /*
920  * Kick all vcpus out in a synchronized way. When this returns, we
921  * guarantee that every vcpu has been kicked and has returned to
922  * userspace at least once.
923 */
924 static void kvm_cpu_synchronize_kick_all(void)
925 {
926 CPUState *cpu;
927
928 CPU_FOREACH(cpu) {
929 run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
930 }
931 }
932
933 /*
934 * Flush all the existing dirty pages to the KVM slot buffers. When
935  * this call returns, we guarantee that every page dirtied before the
936  * call has been recorded in the per-kvmslot
937 * dirty bitmap.
938 *
939 * This function must be called with BQL held.
940 */
941 static void kvm_dirty_ring_flush(void)
942 {
943 trace_kvm_dirty_ring_flush(0);
944 /*
945 * The function needs to be serialized. Since this function
946 * should always be with BQL held, serialization is guaranteed.
947 * However, let's be sure of it.
948 */
949 assert(bql_locked());
950 /*
951 * First make sure to flush the hardware buffers by kicking all
952 * vcpus out in a synchronous way.
953 */
954 kvm_cpu_synchronize_kick_all();
955 kvm_dirty_ring_reap(kvm_state, NULL);
956 trace_kvm_dirty_ring_flush(1);
957 }
958
959 /**
960 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
961 *
962  * This function first tries to fetch the dirty bitmap from the kernel,
963  * and then updates QEMU's dirty bitmap.
964 *
965 * NOTE: caller must be with kml->slots_lock held.
966 *
967 * @kml: the KVM memory listener object
968 * @section: the memory section to sync the dirty bitmap with
969 */
970 static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
971 MemoryRegionSection *section)
972 {
973 KVMState *s = kvm_state;
974 KVMSlot *mem;
975 hwaddr start_addr, size;
976 hwaddr slot_size;
977
978 size = kvm_align_section(section, &start_addr);
979 while (size) {
980 slot_size = MIN(kvm_max_slot_size, size);
981 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
982 if (!mem) {
983 /* We don't have a slot if we want to trap every access. */
984 return;
985 }
986 if (kvm_slot_get_dirty_log(s, mem)) {
987 kvm_slot_sync_dirty_pages(mem);
988 }
989 start_addr += slot_size;
990 size -= slot_size;
991 }
992 }
993
994 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
995 #define KVM_CLEAR_LOG_SHIFT 6
996 #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
997 #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN)
998
999 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
1000 uint64_t size)
1001 {
1002 KVMState *s = kvm_state;
1003 uint64_t end, bmap_start, start_delta, bmap_npages;
1004 struct kvm_clear_dirty_log d;
1005 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
1006 int ret;
1007
1008 /*
1009 * We need to extend either the start or the size or both to
1010 * satisfy the KVM interface requirement. Firstly, do the start
1011 * page alignment on 64 host pages
1012 */
1013 bmap_start = start & KVM_CLEAR_LOG_MASK;
1014 start_delta = start - bmap_start;
1015 bmap_start /= psize;
1016
1017 /*
1018 * The kernel interface has restriction on the size too, that either:
1019 *
1020 * (1) the size is 64 host pages aligned (just like the start), or
1021 * (2) the size fills up until the end of the KVM memslot.
1022 */
1023 bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
1024 << KVM_CLEAR_LOG_SHIFT;
1025 end = mem->memory_size / psize;
1026 if (bmap_npages > end - bmap_start) {
1027 bmap_npages = end - bmap_start;
1028 }
1029 start_delta /= psize;
1030
1031 /*
1032 * Prepare the bitmap to clear dirty bits. Here we must guarantee
1033      * that we won't clear any unknown dirty bits; otherwise we might
1034      * accidentally clear some set bits which are not yet synced from
1035 * the kernel into QEMU's bitmap, then we'll lose track of the
1036 * guest modifications upon those pages (which can directly lead
1037 * to guest data loss or panic after migration).
1038 *
1039 * Layout of the KVMSlot.dirty_bmap:
1040 *
1041 * |<-------- bmap_npages -----------..>|
1042 * [1]
1043 * start_delta size
1044 * |----------------|-------------|------------------|------------|
1045 * ^ ^ ^ ^
1046 * | | | |
1047 * start bmap_start (start) end
1048 * of memslot of memslot
1049 *
1050 * [1] bmap_npages can be aligned to either 64 pages or the end of slot
1051 */
1052
1053 assert(bmap_start % BITS_PER_LONG == 0);
1054 /* We should never do log_clear before log_sync */
1055 assert(mem->dirty_bmap);
1056 if (start_delta || bmap_npages - size / psize) {
1057 /* Slow path - we need to manipulate a temp bitmap */
1058 bmap_clear = bitmap_new(bmap_npages);
1059 bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
1060 bmap_start, start_delta + size / psize);
1061 /*
1062 * We need to fill the holes at start because that was not
1063 * specified by the caller and we extended the bitmap only for
1064 * 64 pages alignment
1065 */
1066 bitmap_clear(bmap_clear, 0, start_delta);
1067 d.dirty_bitmap = bmap_clear;
1068 } else {
1069 /*
1070 * Fast path - both start and size align well with BITS_PER_LONG
1071 * (or the end of memory slot)
1072 */
1073 d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
1074 }
1075
1076 d.first_page = bmap_start;
1077 /* It should never overflow. If it happens, say something */
1078 assert(bmap_npages <= UINT32_MAX);
1079 d.num_pages = bmap_npages;
1080 d.slot = mem->slot | (as_id << 16);
1081
1082 ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
1083 if (ret < 0 && ret != -ENOENT) {
1084 error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
1085 "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
1086 __func__, d.slot, (uint64_t)d.first_page,
1087 (uint32_t)d.num_pages, ret);
1088 } else {
1089 ret = 0;
1090 trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
1091 }
1092
1093 /*
1094 * After we have updated the remote dirty bitmap, we update the
1095 * cached bitmap as well for the memslot, then if another user
1096      * clears the same region, we know we shouldn't clear it again on
1097      * the remote; otherwise it would be data loss as well.
1098 */
1099 bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
1100 size / psize);
1101 /* This handles the NULL case well */
1102 g_free(bmap_clear);
1103 return ret;
1104 }
1105
1106
1107 /**
1108 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
1109 *
1110 * NOTE: this will be a no-op if we haven't enabled manual dirty log
1111 * protection in the host kernel because in that case this operation
1112 * will be done within log_sync().
1113 *
1114 * @kml: the kvm memory listener
1115 * @section: the memory range to clear dirty bitmap
1116 */
1117 static int kvm_physical_log_clear(KVMMemoryListener *kml,
1118 MemoryRegionSection *section)
1119 {
1120 KVMState *s = kvm_state;
1121 uint64_t start, size, offset, count;
1122 KVMSlot *mem;
1123 int ret = 0, i;
1124
1125 if (!s->manual_dirty_log_protect) {
1126 /* No need to do explicit clear */
1127 return ret;
1128 }
1129
1130 start = section->offset_within_address_space;
1131 size = int128_get64(section->size);
1132
1133 if (!size) {
1134 /* Nothing more we can do... */
1135 return ret;
1136 }
1137
1138 kvm_slots_lock();
1139
1140 for (i = 0; i < kml->nr_slots_allocated; i++) {
1141 mem = &kml->slots[i];
1142 /* Discard slots that are empty or do not overlap the section */
1143 if (!mem->memory_size ||
1144 mem->start_addr > start + size - 1 ||
1145 start > mem->start_addr + mem->memory_size - 1) {
1146 continue;
1147 }
1148
1149 if (start >= mem->start_addr) {
1150 /* The slot starts before section or is aligned to it. */
1151 offset = start - mem->start_addr;
1152 count = MIN(mem->memory_size - offset, size);
1153 } else {
1154 /* The slot starts after section. */
1155 offset = 0;
1156 count = MIN(mem->memory_size, size - (mem->start_addr - start));
1157 }
1158 ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1159 if (ret < 0) {
1160 break;
1161 }
1162 }
1163
1164 kvm_slots_unlock();
1165
1166 return ret;
1167 }
1168
1169 static void kvm_coalesce_mmio_region(MemoryListener *listener,
1170 MemoryRegionSection *secion,
1171 hwaddr start, hwaddr size)
1172 {
1173 KVMState *s = kvm_state;
1174
1175 if (s->coalesced_mmio) {
1176 struct kvm_coalesced_mmio_zone zone;
1177
1178 zone.addr = start;
1179 zone.size = size;
1180 zone.pad = 0;
1181
1182 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1183 }
1184 }
1185
1186 static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1187 MemoryRegionSection *secion,
1188 hwaddr start, hwaddr size)
1189 {
1190 KVMState *s = kvm_state;
1191
1192 if (s->coalesced_mmio) {
1193 struct kvm_coalesced_mmio_zone zone;
1194
1195 zone.addr = start;
1196 zone.size = size;
1197 zone.pad = 0;
1198
1199 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1200 }
1201 }
1202
1203 static void kvm_coalesce_pio_add(MemoryListener *listener,
1204 MemoryRegionSection *section,
1205 hwaddr start, hwaddr size)
1206 {
1207 KVMState *s = kvm_state;
1208
1209 if (s->coalesced_pio) {
1210 struct kvm_coalesced_mmio_zone zone;
1211
1212 zone.addr = start;
1213 zone.size = size;
1214 zone.pio = 1;
1215
1216 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1217 }
1218 }
1219
1220 static void kvm_coalesce_pio_del(MemoryListener *listener,
1221 MemoryRegionSection *section,
1222 hwaddr start, hwaddr size)
1223 {
1224 KVMState *s = kvm_state;
1225
1226 if (s->coalesced_pio) {
1227 struct kvm_coalesced_mmio_zone zone;
1228
1229 zone.addr = start;
1230 zone.size = size;
1231 zone.pio = 1;
1232
1233 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1234 }
1235 }
1236
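/* Query a KVM capability; returns its (non-negative) value, or 0 if unsupported. */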
1237 int kvm_check_extension(KVMState *s, unsigned int extension)
1238 {
1239 int ret;
1240
1241 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1242 if (ret < 0) {
1243 ret = 0;
1244 }
1245
1246 return ret;
1247 }
1248
1249 int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1250 {
1251 int ret;
1252
1253 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1254 if (ret < 0) {
1255 /* VM wide version not implemented, use global one instead */
1256 ret = kvm_check_extension(s, extension);
1257 }
1258
1259 return ret;
1260 }
1261
1262 /*
1263 * We track the poisoned pages to be able to:
1264 * - replace them on VM reset
1265 * - block a migration for a VM with a poisoned page
1266 */
1267 typedef struct HWPoisonPage {
1268 ram_addr_t ram_addr;
1269 QLIST_ENTRY(HWPoisonPage) list;
1270 } HWPoisonPage;
1271
1272 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1273 QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1274
1275 static void kvm_unpoison_all(void *param)
1276 {
1277 HWPoisonPage *page, *next_page;
1278
1279 QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1280 QLIST_REMOVE(page, list);
1281 qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
1282 g_free(page);
1283 }
1284 }
1285
1286 void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1287 {
1288 HWPoisonPage *page;
1289
1290 QLIST_FOREACH(page, &hwpoison_page_list, list) {
1291 if (page->ram_addr == ram_addr) {
1292 return;
1293 }
1294 }
1295 page = g_new(HWPoisonPage, 1);
1296 page->ram_addr = ram_addr;
1297 QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1298 }
1299
1300 bool kvm_hwpoisoned_mem(void)
1301 {
1302 return !QLIST_EMPTY(&hwpoison_page_list);
1303 }
1304
1305 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
1306 {
1307 #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
1308 /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
1309 * endianness, but the memory core hands them in target endianness.
1310 * For example, PPC is always treated as big-endian even if running
1311 * on KVM and on PPC64LE. Correct here.
1312 */
1313 switch (size) {
1314 case 2:
1315 val = bswap16(val);
1316 break;
1317 case 4:
1318 val = bswap32(val);
1319 break;
1320 }
1321 #endif
1322 return val;
1323 }
1324
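/*
 * (De)assign an eventfd that the kernel signals on guest writes to the MMIO
 * range [addr, addr + size), optionally matching a specific data value.
 */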
1325 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1326 bool assign, uint32_t size, bool datamatch)
1327 {
1328 int ret;
1329 struct kvm_ioeventfd iofd = {
1330 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1331 .addr = addr,
1332 .len = size,
1333 .flags = 0,
1334 .fd = fd,
1335 };
1336
1337 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1338 datamatch);
1339 if (!kvm_enabled()) {
1340 return -ENOSYS;
1341 }
1342
1343 if (datamatch) {
1344 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1345 }
1346 if (!assign) {
1347 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1348 }
1349
1350 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1351
1352 if (ret < 0) {
1353 return -errno;
1354 }
1355
1356 return 0;
1357 }
1358
1359 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1360 bool assign, uint32_t size, bool datamatch)
1361 {
1362 struct kvm_ioeventfd kick = {
1363 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1364 .addr = addr,
1365 .flags = KVM_IOEVENTFD_FLAG_PIO,
1366 .len = size,
1367 .fd = fd,
1368 };
1369 int r;
1370 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1371 if (!kvm_enabled()) {
1372 return -ENOSYS;
1373 }
1374 if (datamatch) {
1375 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1376 }
1377 if (!assign) {
1378 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1379 }
1380 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1381 if (r < 0) {
1382 return r;
1383 }
1384 return 0;
1385 }
1386
1387
1388 static const KVMCapabilityInfo *
1389 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1390 {
1391 while (list->name) {
1392 if (!kvm_check_extension(s, list->value)) {
1393 return list;
1394 }
1395 list++;
1396 }
1397 return NULL;
1398 }
1399
1400 void kvm_set_max_memslot_size(hwaddr max_slot_size)
1401 {
1402 g_assert(
1403 ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
1404 );
1405 kvm_max_slot_size = max_slot_size;
1406 }
1407
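/* Apply KVM memory attributes (e.g. KVM_MEMORY_ATTRIBUTE_PRIVATE) to a guest physical range. */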
1408 static int kvm_set_memory_attributes(hwaddr start, uint64_t size, uint64_t attr)
1409 {
1410 struct kvm_memory_attributes attrs;
1411 int r;
1412
1413 assert((attr & kvm_supported_memory_attributes) == attr);
1414 attrs.attributes = attr;
1415 attrs.address = start;
1416 attrs.size = size;
1417 attrs.flags = 0;
1418
1419 r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
1420 if (r) {
1421 error_report("failed to set memory (0x%" HWADDR_PRIx "+0x%" PRIx64 ") "
1422 "with attr 0x%" PRIx64 " error '%s'",
1423 start, size, attr, strerror(errno));
1424 }
1425 return r;
1426 }
1427
1428 int kvm_set_memory_attributes_private(hwaddr start, uint64_t size)
1429 {
1430 return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
1431 }
1432
1433 int kvm_set_memory_attributes_shared(hwaddr start, uint64_t size)
1434 {
1435 return kvm_set_memory_attributes(start, size, 0);
1436 }
1437
1438 /* Called with KVMMemoryListener.slots_lock held */
1439 static void kvm_set_phys_mem(KVMMemoryListener *kml,
1440 MemoryRegionSection *section, bool add)
1441 {
1442 KVMSlot *mem;
1443 int err;
1444 MemoryRegion *mr = section->mr;
1445 bool writable = !mr->readonly && !mr->rom_device;
1446 hwaddr start_addr, size, slot_size, mr_offset;
1447 ram_addr_t ram_start_offset;
1448 void *ram;
1449
1450 if (!memory_region_is_ram(mr)) {
1451 if (writable || !kvm_readonly_mem_allowed) {
1452 return;
1453 } else if (!mr->romd_mode) {
1454 /* If the memory device is not in romd_mode, then we actually want
1455 * to remove the kvm memory slot so all accesses will trap. */
1456 add = false;
1457 }
1458 }
1459
1460 size = kvm_align_section(section, &start_addr);
1461 if (!size) {
1462 return;
1463 }
1464
1465 /* The offset of the kvmslot within the memory region */
1466 mr_offset = section->offset_within_region + start_addr -
1467 section->offset_within_address_space;
1468
1469 /* use aligned delta to align the ram address and offset */
1470 ram = memory_region_get_ram_ptr(mr) + mr_offset;
1471 ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1472
1473 if (!add) {
1474 do {
1475 slot_size = MIN(kvm_max_slot_size, size);
1476 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1477 if (!mem) {
1478 return;
1479 }
1480 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1481 /*
1482              * NOTE: we're only making a best effort to sync dirty bits
1483              * here. Whether we're using the dirty log or the dirty ring,
1484              * we ignore two facts:
1485 *
1486 * (1) dirty bits can reside in hardware buffers (PML)
1487 *
1488 * (2) after we collected dirty bits here, pages can be dirtied
1489 * again before we do the final KVM_SET_USER_MEMORY_REGION to
1490 * remove the slot.
1491 *
1492              * Not easy. Let's cross our fingers until it's fixed.
1493 */
1494 if (kvm_state->kvm_dirty_ring_size) {
1495 kvm_dirty_ring_reap_locked(kvm_state, NULL);
1496 if (kvm_state->kvm_dirty_ring_with_bitmap) {
1497 kvm_slot_sync_dirty_pages(mem);
1498 kvm_slot_get_dirty_log(kvm_state, mem);
1499 }
1500 } else {
1501 kvm_slot_get_dirty_log(kvm_state, mem);
1502 }
1503 kvm_slot_sync_dirty_pages(mem);
1504 }
1505
1506 /* unregister the slot */
1507 g_free(mem->dirty_bmap);
1508 mem->dirty_bmap = NULL;
1509 mem->memory_size = 0;
1510 mem->flags = 0;
1511 err = kvm_set_user_memory_region(kml, mem, false);
1512 if (err) {
1513 fprintf(stderr, "%s: error unregistering slot: %s\n",
1514 __func__, strerror(-err));
1515 abort();
1516 }
1517 start_addr += slot_size;
1518 size -= slot_size;
1519 kml->nr_slots_used--;
1520 } while (size);
1521 return;
1522 }
1523
1524 /* register the new slot */
1525 do {
1526 slot_size = MIN(kvm_max_slot_size, size);
1527 mem = kvm_alloc_slot(kml);
1528 mem->as_id = kml->as_id;
1529 mem->memory_size = slot_size;
1530 mem->start_addr = start_addr;
1531 mem->ram_start_offset = ram_start_offset;
1532 mem->ram = ram;
1533 mem->flags = kvm_mem_flags(mr);
1534 mem->guest_memfd = mr->ram_block->guest_memfd;
1535 mem->guest_memfd_offset = (uint8_t*)ram - mr->ram_block->host;
1536
1537 kvm_slot_init_dirty_bitmap(mem);
1538 err = kvm_set_user_memory_region(kml, mem, true);
1539 if (err) {
1540 fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1541 strerror(-err));
1542 abort();
1543 }
1544
1545 if (memory_region_has_guest_memfd(mr)) {
1546 err = kvm_set_memory_attributes_private(start_addr, slot_size);
1547 if (err) {
1548 error_report("%s: failed to set memory attribute private: %s",
1549 __func__, strerror(-err));
1550 exit(1);
1551 }
1552 }
1553
1554 start_addr += slot_size;
1555 ram_start_offset += slot_size;
1556 ram += slot_size;
1557 size -= slot_size;
1558 kml->nr_slots_used++;
1559 } while (size);
1560 }
1561
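/*
 * Background thread that periodically (roughly once per second) reaps the
 * per-vCPU dirty rings under the BQL.
 */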
1562 static void *kvm_dirty_ring_reaper_thread(void *data)
1563 {
1564 KVMState *s = data;
1565 struct KVMDirtyRingReaper *r = &s->reaper;
1566
1567 rcu_register_thread();
1568
1569 trace_kvm_dirty_ring_reaper("init");
1570
1571 while (true) {
1572 r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1573 trace_kvm_dirty_ring_reaper("wait");
1574 /*
1575 * TODO: provide a smarter timeout rather than a constant?
1576 */
1577 sleep(1);
1578
1579         /* keep sleeping so that the dirty limit is not disturbed by the reaper */
1580 if (dirtylimit_in_service()) {
1581 continue;
1582 }
1583
1584 trace_kvm_dirty_ring_reaper("wakeup");
1585 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1586
1587 bql_lock();
1588 kvm_dirty_ring_reap(s, NULL);
1589 bql_unlock();
1590
1591 r->reaper_iteration++;
1592 }
1593
1594 g_assert_not_reached();
1595 }
1596
1597 static void kvm_dirty_ring_reaper_init(KVMState *s)
1598 {
1599 struct KVMDirtyRingReaper *r = &s->reaper;
1600
1601 qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1602 kvm_dirty_ring_reaper_thread,
1603 s, QEMU_THREAD_JOINABLE);
1604 }
1605
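/*
 * Probe for KVM dirty ring support and enable it when a non-zero ring size
 * was requested; fall back to the dirty bitmap method if the ring is
 * unavailable.
 */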
1606 static int kvm_dirty_ring_init(KVMState *s)
1607 {
1608 uint32_t ring_size = s->kvm_dirty_ring_size;
1609 uint64_t ring_bytes = ring_size * sizeof(struct kvm_dirty_gfn);
1610 unsigned int capability = KVM_CAP_DIRTY_LOG_RING;
1611 int ret;
1612
1613 s->kvm_dirty_ring_size = 0;
1614 s->kvm_dirty_ring_bytes = 0;
1615
1616 /* Bail if the dirty ring size isn't specified */
1617 if (!ring_size) {
1618 return 0;
1619 }
1620
1621 /*
1622 * Read the max supported pages. Fall back to dirty logging mode
1623 * if the dirty ring isn't supported.
1624 */
1625 ret = kvm_vm_check_extension(s, capability);
1626 if (ret <= 0) {
1627 capability = KVM_CAP_DIRTY_LOG_RING_ACQ_REL;
1628 ret = kvm_vm_check_extension(s, capability);
1629 }
1630
1631 if (ret <= 0) {
1632 warn_report("KVM dirty ring not available, using bitmap method");
1633 return 0;
1634 }
1635
1636 if (ring_bytes > ret) {
1637 error_report("KVM dirty ring size %" PRIu32 " too big "
1638 "(maximum is %ld). Please use a smaller value.",
1639 ring_size, (long)ret / sizeof(struct kvm_dirty_gfn));
1640 return -EINVAL;
1641 }
1642
1643 ret = kvm_vm_enable_cap(s, capability, 0, ring_bytes);
1644 if (ret) {
1645 error_report("Enabling of KVM dirty ring failed: %s. "
1646 "Suggested minimum value is 1024.", strerror(-ret));
1647 return -EIO;
1648 }
1649
1650 /* Enable the backup bitmap if it is supported */
1651 ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP);
1652 if (ret > 0) {
1653 ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP, 0);
1654 if (ret) {
1655 error_report("Enabling of KVM dirty ring's backup bitmap failed: "
1656 "%s. ", strerror(-ret));
1657 return -EIO;
1658 }
1659
1660 s->kvm_dirty_ring_with_bitmap = true;
1661 }
1662
1663 s->kvm_dirty_ring_size = ring_size;
1664 s->kvm_dirty_ring_bytes = ring_bytes;
1665
1666 return 0;
1667 }
1668
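/* region_add/region_del only queue section updates; they are applied in kvm_region_commit(). */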
1669 static void kvm_region_add(MemoryListener *listener,
1670 MemoryRegionSection *section)
1671 {
1672 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1673 KVMMemoryUpdate *update;
1674
1675 update = g_new0(KVMMemoryUpdate, 1);
1676 update->section = *section;
1677
1678 QSIMPLEQ_INSERT_TAIL(&kml->transaction_add, update, next);
1679 }
1680
1681 static void kvm_region_del(MemoryListener *listener,
1682 MemoryRegionSection *section)
1683 {
1684 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1685 KVMMemoryUpdate *update;
1686
1687 update = g_new0(KVMMemoryUpdate, 1);
1688 update->section = *section;
1689
1690 QSIMPLEQ_INSERT_TAIL(&kml->transaction_del, update, next);
1691 }
1692
1693 static void kvm_region_commit(MemoryListener *listener)
1694 {
1695 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener,
1696 listener);
1697 KVMMemoryUpdate *u1, *u2;
1698 bool need_inhibit = false;
1699
1700 if (QSIMPLEQ_EMPTY(&kml->transaction_add) &&
1701 QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1702 return;
1703 }
1704
1705 /*
1706 * We have to be careful when regions to add overlap with ranges to remove.
1707 * We have to simulate atomic KVM memslot updates by making sure no ioctl()
1708 * is currently active.
1709 *
1710      * The lists are ordered by address, so it's easy to find overlaps.
1711 */
1712 u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1713 u2 = QSIMPLEQ_FIRST(&kml->transaction_add);
1714 while (u1 && u2) {
1715 Range r1, r2;
1716
1717 range_init_nofail(&r1, u1->section.offset_within_address_space,
1718 int128_get64(u1->section.size));
1719 range_init_nofail(&r2, u2->section.offset_within_address_space,
1720 int128_get64(u2->section.size));
1721
1722 if (range_overlaps_range(&r1, &r2)) {
1723 need_inhibit = true;
1724 break;
1725 }
1726 if (range_lob(&r1) < range_lob(&r2)) {
1727 u1 = QSIMPLEQ_NEXT(u1, next);
1728 } else {
1729 u2 = QSIMPLEQ_NEXT(u2, next);
1730 }
1731 }
1732
1733 kvm_slots_lock();
1734 if (need_inhibit) {
1735 accel_ioctl_inhibit_begin();
1736 }
1737
1738 /* Remove all memslots before adding the new ones. */
1739 while (!QSIMPLEQ_EMPTY(&kml->transaction_del)) {
1740 u1 = QSIMPLEQ_FIRST(&kml->transaction_del);
1741 QSIMPLEQ_REMOVE_HEAD(&kml->transaction_del, next);
1742
1743 kvm_set_phys_mem(kml, &u1->section, false);
1744 memory_region_unref(u1->section.mr);
1745
1746 g_free(u1);
1747 }
1748 while (!QSIMPLEQ_EMPTY(&kml->transaction_add)) {
1749 u1 = QSIMPLEQ_FIRST(&kml->transaction_add);
1750 QSIMPLEQ_REMOVE_HEAD(&kml->transaction_add, next);
1751
1752 memory_region_ref(u1->section.mr);
1753 kvm_set_phys_mem(kml, &u1->section, true);
1754
1755 g_free(u1);
1756 }
1757
1758 if (need_inhibit) {
1759 accel_ioctl_inhibit_end();
1760 }
1761 kvm_slots_unlock();
1762 }
1763
1764 static void kvm_log_sync(MemoryListener *listener,
1765 MemoryRegionSection *section)
1766 {
1767 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1768
1769 kvm_slots_lock();
1770 kvm_physical_sync_dirty_bitmap(kml, section);
1771 kvm_slots_unlock();
1772 }
1773
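/*
 * log_sync_global handler used when the dirty ring is enabled: flush all
 * per-vCPU rings, then sync (and reset) every slot's dirty bitmap.
 */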
1774 static void kvm_log_sync_global(MemoryListener *l, bool last_stage)
1775 {
1776 KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1777 KVMState *s = kvm_state;
1778 KVMSlot *mem;
1779 int i;
1780
1781 /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1782 kvm_dirty_ring_flush();
1783
1784 kvm_slots_lock();
1785 for (i = 0; i < kml->nr_slots_allocated; i++) {
1786 mem = &kml->slots[i];
1787 if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1788 kvm_slot_sync_dirty_pages(mem);
1789
1790 if (s->kvm_dirty_ring_with_bitmap && last_stage &&
1791 kvm_slot_get_dirty_log(s, mem)) {
1792 kvm_slot_sync_dirty_pages(mem);
1793 }
1794
1795 /*
1796 * This is not needed by KVM_GET_DIRTY_LOG because the
1797 * ioctl will unconditionally overwrite the whole region.
1798 * However kvm dirty ring has no such side effect.
1799 */
1800 kvm_slot_reset_dirty_pages(mem);
1801 }
1802 }
1803 kvm_slots_unlock();
1804 }
1805
1806 static void kvm_log_clear(MemoryListener *listener,
1807 MemoryRegionSection *section)
1808 {
1809 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1810 int r;
1811
1812 r = kvm_physical_log_clear(kml, section);
1813 if (r < 0) {
1814 error_report_once("%s: kvm log clear failed: mr=%s "
1815 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1816 section->mr->name, section->offset_within_region,
1817 int128_get64(section->size));
1818 abort();
1819 }
1820 }
1821
1822 static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1823 MemoryRegionSection *section,
1824 bool match_data, uint64_t data,
1825 EventNotifier *e)
1826 {
1827 int fd = event_notifier_get_fd(e);
1828 int r;
1829
1830 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1831 data, true, int128_get64(section->size),
1832 match_data);
1833 if (r < 0) {
1834 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1835 __func__, strerror(-r), -r);
1836 abort();
1837 }
1838 }
1839
1840 static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1841 MemoryRegionSection *section,
1842 bool match_data, uint64_t data,
1843 EventNotifier *e)
1844 {
1845 int fd = event_notifier_get_fd(e);
1846 int r;
1847
1848 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1849 data, false, int128_get64(section->size),
1850 match_data);
1851 if (r < 0) {
1852 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1853 __func__, strerror(-r), -r);
1854 abort();
1855 }
1856 }
1857
1858 static void kvm_io_ioeventfd_add(MemoryListener *listener,
1859 MemoryRegionSection *section,
1860 bool match_data, uint64_t data,
1861 EventNotifier *e)
1862 {
1863 int fd = event_notifier_get_fd(e);
1864 int r;
1865
1866 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1867 data, true, int128_get64(section->size),
1868 match_data);
1869 if (r < 0) {
1870 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1871 __func__, strerror(-r), -r);
1872 abort();
1873 }
1874 }
1875
1876 static void kvm_io_ioeventfd_del(MemoryListener *listener,
1877 MemoryRegionSection *section,
1878 bool match_data, uint64_t data,
1879 EventNotifier *e)
1880
1881 {
1882 int fd = event_notifier_get_fd(e);
1883 int r;
1884
1885 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1886 data, false, int128_get64(section->size),
1887 match_data);
1888 if (r < 0) {
1889 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1890 __func__, strerror(-r), -r);
1891 abort();
1892 }
1893 }
1894
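/*
 * Register a KVMMemoryListener for the given address space: pre-allocate the
 * initial memslot array, hook up the region add/del/commit and dirty logging
 * callbacks (log_sync_global when the dirty ring is in use, log_sync/log_clear
 * otherwise), and record the address space in s->as[].
 */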
1895 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1896 AddressSpace *as, int as_id, const char *name)
1897 {
1898 int i;
1899
1900 kml->as_id = as_id;
1901
1902 kvm_slots_grow(kml, KVM_MEMSLOTS_NR_ALLOC_DEFAULT);
1903
1904 QSIMPLEQ_INIT(&kml->transaction_add);
1905 QSIMPLEQ_INIT(&kml->transaction_del);
1906
1907 kml->listener.region_add = kvm_region_add;
1908 kml->listener.region_del = kvm_region_del;
1909 kml->listener.commit = kvm_region_commit;
1910 kml->listener.log_start = kvm_log_start;
1911 kml->listener.log_stop = kvm_log_stop;
1912 kml->listener.priority = MEMORY_LISTENER_PRIORITY_ACCEL;
1913 kml->listener.name = name;
1914
1915 if (s->kvm_dirty_ring_size) {
1916 kml->listener.log_sync_global = kvm_log_sync_global;
1917 } else {
1918 kml->listener.log_sync = kvm_log_sync;
1919 kml->listener.log_clear = kvm_log_clear;
1920 }
1921
1922 memory_listener_register(&kml->listener, as);
1923
1924 for (i = 0; i < s->nr_as; ++i) {
1925 if (!s->as[i].as) {
1926 s->as[i].as = as;
1927 s->as[i].ml = kml;
1928 break;
1929 }
1930 }
1931 }
1932
1933 static MemoryListener kvm_io_listener = {
1934 .name = "kvm-io",
1935 .coalesced_io_add = kvm_coalesce_pio_add,
1936 .coalesced_io_del = kvm_coalesce_pio_del,
1937 .eventfd_add = kvm_io_ioeventfd_add,
1938 .eventfd_del = kvm_io_ioeventfd_del,
1939 .priority = MEMORY_LISTENER_PRIORITY_DEV_BACKEND,
1940 };
1941
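/*
 * Set the level of an in-kernel irqchip line. Returns 1 when the legacy
 * KVM_IRQ_LINE ioctl is in use, otherwise the status reported by
 * KVM_IRQ_LINE_STATUS.
 */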
1942 int kvm_set_irq(KVMState *s, int irq, int level)
1943 {
1944 struct kvm_irq_level event;
1945 int ret;
1946
1947 assert(kvm_async_interrupts_enabled());
1948
1949 event.level = level;
1950 event.irq = irq;
1951 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1952 if (ret < 0) {
1953 perror("kvm_set_irq");
1954 abort();
1955 }
1956
1957 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1958 }
1959
1960 #ifdef KVM_CAP_IRQ_ROUTING
1961 typedef struct KVMMSIRoute {
1962 struct kvm_irq_routing_entry kroute;
1963 QTAILQ_ENTRY(KVMMSIRoute) entry;
1964 } KVMMSIRoute;
1965
1966 static void set_gsi(KVMState *s, unsigned int gsi)
1967 {
1968 set_bit(gsi, s->used_gsi_bitmap);
1969 }
1970
1971 static void clear_gsi(KVMState *s, unsigned int gsi)
1972 {
1973 clear_bit(gsi, s->used_gsi_bitmap);
1974 }
1975
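/* Allocate the used-GSI bitmap and the (initially empty) routing table. */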
1976 void kvm_init_irq_routing(KVMState *s)
1977 {
1978 int gsi_count;
1979
1980 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1981 if (gsi_count > 0) {
1982 /* Round up so we can search ints using ffs */
1983 s->used_gsi_bitmap = bitmap_new(gsi_count);
1984 s->gsi_count = gsi_count;
1985 }
1986
1987 s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1988 s->nr_allocated_irq_routes = 0;
1989
1990 kvm_arch_init_irq_routing(s);
1991 }
1992
1993 void kvm_irqchip_commit_routes(KVMState *s)
1994 {
1995 int ret;
1996
1997 if (kvm_gsi_direct_mapping()) {
1998 return;
1999 }
2000
2001 if (!kvm_gsi_routing_enabled()) {
2002 return;
2003 }
2004
2005 s->irq_routes->flags = 0;
2006 trace_kvm_irqchip_commit_routes();
2007 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
2008 assert(ret == 0);
2009 }
2010
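/*
 * Append a routing entry, growing the entries array geometrically (minimum
 * 64 slots) and marking the entry's GSI as used.
 */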
2011 void kvm_add_routing_entry(KVMState *s,
2012 struct kvm_irq_routing_entry *entry)
2013 {
2014 struct kvm_irq_routing_entry *new;
2015 int n, size;
2016
2017 if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
2018 n = s->nr_allocated_irq_routes * 2;
2019 if (n < 64) {
2020 n = 64;
2021 }
2022 size = sizeof(struct kvm_irq_routing);
2023 size += n * sizeof(*new);
2024 s->irq_routes = g_realloc(s->irq_routes, size);
2025 s->nr_allocated_irq_routes = n;
2026 }
2027 n = s->irq_routes->nr++;
2028 new = &s->irq_routes->entries[n];
2029
2030 *new = *entry;
2031
2032 set_gsi(s, entry->gsi);
2033 }
2034
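/*
 * Replace the existing routing entry that has the same GSI; returns -ESRCH
 * if no entry for that GSI is currently installed.
 */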
2035 static int kvm_update_routing_entry(KVMState *s,
2036 struct kvm_irq_routing_entry *new_entry)
2037 {
2038 struct kvm_irq_routing_entry *entry;
2039 int n;
2040
2041 for (n = 0; n < s->irq_routes->nr; n++) {
2042 entry = &s->irq_routes->entries[n];
2043 if (entry->gsi != new_entry->gsi) {
2044 continue;
2045 }
2046
2047 if (!memcmp(entry, new_entry, sizeof *entry)) {
2048 return 0;
2049 }
2050
2051 *entry = *new_entry;
2052
2053 return 0;
2054 }
2055
2056 return -ESRCH;
2057 }
2058
2059 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
2060 {
2061 struct kvm_irq_routing_entry e = {};
2062
2063 assert(pin < s->gsi_count);
2064
2065 e.gsi = irq;
2066 e.type = KVM_IRQ_ROUTING_IRQCHIP;
2067 e.flags = 0;
2068 e.u.irqchip.irqchip = irqchip;
2069 e.u.irqchip.pin = pin;
2070 kvm_add_routing_entry(s, &e);
2071 }
2072
2073 void kvm_irqchip_release_virq(KVMState *s, int virq)
2074 {
2075 struct kvm_irq_routing_entry *e;
2076 int i;
2077
2078 if (kvm_gsi_direct_mapping()) {
2079 return;
2080 }
2081
2082 for (i = 0; i < s->irq_routes->nr; i++) {
2083 e = &s->irq_routes->entries[i];
2084 if (e->gsi == virq) {
2085 s->irq_routes->nr--;
2086 *e = s->irq_routes->entries[s->irq_routes->nr];
2087 }
2088 }
2089 clear_gsi(s, virq);
2090 kvm_arch_release_virq_post(virq);
2091 trace_kvm_irqchip_release_virq(virq);
2092 }
2093
2094 void kvm_irqchip_add_change_notifier(Notifier *n)
2095 {
2096 notifier_list_add(&kvm_irqchip_change_notifiers, n);
2097 }
2098
2099 void kvm_irqchip_remove_change_notifier(Notifier *n)
2100 {
2101 notifier_remove(n);
2102 }
2103
2104 void kvm_irqchip_change_notify(void)
2105 {
2106 notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
2107 }
2108
2109 int kvm_irqchip_get_virq(KVMState *s)
2110 {
2111 int next_virq;
2112
2113 /* Return the lowest unused GSI in the bitmap */
2114 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
2115 if (next_virq >= s->gsi_count) {
2116 return -ENOSPC;
2117 } else {
2118 return next_virq;
2119 }
2120 }
2121
2122 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2123 {
2124 struct kvm_msi msi;
2125
2126 msi.address_lo = (uint32_t)msg.address;
2127 msi.address_hi = msg.address >> 32;
2128 msi.data = le32_to_cpu(msg.data);
2129 msi.flags = 0;
2130 memset(msi.pad, 0, sizeof(msi.pad));
2131
2132 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
2133 }
2134
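/*
 * Allocate a virq and install an MSI route for the given device/vector.
 * On success the route is only queued in s->irq_routes and c->changes is
 * bumped; the routing table still has to be committed afterwards (see
 * kvm_irqchip_commit_routes()). Returns the virq or a negative errno.
 */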
2135 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2136 {
2137 struct kvm_irq_routing_entry kroute = {};
2138 int virq;
2139 KVMState *s = c->s;
2140 MSIMessage msg = {0, 0};
2141
2142 if (pci_available && dev) {
2143 msg = pci_get_msi_message(dev, vector);
2144 }
2145
2146 if (kvm_gsi_direct_mapping()) {
2147 return kvm_arch_msi_data_to_gsi(msg.data);
2148 }
2149
2150 if (!kvm_gsi_routing_enabled()) {
2151 return -ENOSYS;
2152 }
2153
2154 virq = kvm_irqchip_get_virq(s);
2155 if (virq < 0) {
2156 return virq;
2157 }
2158
2159 kroute.gsi = virq;
2160 kroute.type = KVM_IRQ_ROUTING_MSI;
2161 kroute.flags = 0;
2162 kroute.u.msi.address_lo = (uint32_t)msg.address;
2163 kroute.u.msi.address_hi = msg.address >> 32;
2164 kroute.u.msi.data = le32_to_cpu(msg.data);
2165 if (pci_available && kvm_msi_devid_required()) {
2166 kroute.flags = KVM_MSI_VALID_DEVID;
2167 kroute.u.msi.devid = pci_requester_id(dev);
2168 }
2169 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2170 kvm_irqchip_release_virq(s, virq);
2171 return -EINVAL;
2172 }
2173
2174 if (s->irq_routes->nr < s->gsi_count) {
2175 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
2176 vector, virq);
2177
2178 kvm_add_routing_entry(s, &kroute);
2179 kvm_arch_add_msi_route_post(&kroute, vector, dev);
2180 c->changes++;
2181 } else {
2182 kvm_irqchip_release_virq(s, virq);
2183 return -ENOSPC;
2184 }
2185
2186 return virq;
2187 }
2188
2189 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2190 PCIDevice *dev)
2191 {
2192 struct kvm_irq_routing_entry kroute = {};
2193
2194 if (kvm_gsi_direct_mapping()) {
2195 return 0;
2196 }
2197
2198 if (!kvm_irqchip_in_kernel()) {
2199 return -ENOSYS;
2200 }
2201
2202 kroute.gsi = virq;
2203 kroute.type = KVM_IRQ_ROUTING_MSI;
2204 kroute.flags = 0;
2205 kroute.u.msi.address_lo = (uint32_t)msg.address;
2206 kroute.u.msi.address_hi = msg.address >> 32;
2207 kroute.u.msi.data = le32_to_cpu(msg.data);
2208 if (pci_available && kvm_msi_devid_required()) {
2209 kroute.flags = KVM_MSI_VALID_DEVID;
2210 kroute.u.msi.devid = pci_requester_id(dev);
2211 }
2212 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2213 return -EINVAL;
2214 }
2215
2216 trace_kvm_irqchip_update_msi_route(virq);
2217
2218 return kvm_update_routing_entry(s, &kroute);
2219 }
2220
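/*
 * Assign or deassign an irqfd for @virq. When a resample notifier is given,
 * either let the kernel handle the resample (full in-kernel irqchip) or track
 * the fd in userspace for split irqchip (see the comment inside).
 */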
2221 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2222 EventNotifier *resample, int virq,
2223 bool assign)
2224 {
2225 int fd = event_notifier_get_fd(event);
2226 int rfd = resample ? event_notifier_get_fd(resample) : -1;
2227
2228 struct kvm_irqfd irqfd = {
2229 .fd = fd,
2230 .gsi = virq,
2231 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2232 };
2233
2234 if (rfd != -1) {
2235 assert(assign);
2236 if (kvm_irqchip_is_split()) {
2237 /*
2238 * When the slow irqchip (e.g. IOAPIC) is in the
2239 * userspace, KVM kernel resamplefd will not work because
2240 * the EOI of the interrupt will be delivered to userspace
2241 * instead, so the KVM kernel resamplefd kick will be
2242 * skipped. Userspace here mimics what the kernel resamplefd
2243 * provides: remember the resamplefd and kick it when we
2244 * receive the EOI of this IRQ.
2245 *
2246 * This is hackery because IOAPIC is mostly bypassed
2247 * (except EOI broadcasts) when irqfd is used. However
2248 * this can bring much performance back for split irqchip
2249 * with INTx IRQs (for VFIO, this gives 93% perf of the
2250 * full fast path, which is a 46% perf boost compared to
2251 * the INTx slow path).
2252 */
2253 kvm_resample_fd_insert(virq, resample);
2254 } else {
2255 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2256 irqfd.resamplefd = rfd;
2257 }
2258 } else if (!assign) {
2259 if (kvm_irqchip_is_split()) {
2260 kvm_resample_fd_remove(virq);
2261 }
2262 }
2263
2264 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2265 }
2266
2267 #else /* !KVM_CAP_IRQ_ROUTING */
2268
2269 void kvm_init_irq_routing(KVMState *s)
2270 {
2271 }
2272
2273 void kvm_irqchip_release_virq(KVMState *s, int virq)
2274 {
2275 }
2276
2277 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2278 {
2279 abort();
2280 }
2281
2282 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
2283 {
2284 return -ENOSYS;
2285 }
2286
2287 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2288 {
2289 return -ENOSYS;
2290 }
2291
2292 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2293 {
2294 return -ENOSYS;
2295 }
2296
2297 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2298 EventNotifier *resample, int virq,
2299 bool assign)
2300 {
2301 abort();
2302 }
2303
2304 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
2305 {
2306 return -ENOSYS;
2307 }
2308 #endif /* !KVM_CAP_IRQ_ROUTING */
2309
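/*
 * Minimal usage sketch for the irqfd helpers below (illustrative only; the
 * declarations and the KVMRouteChange setup are assumptions, not code taken
 * from this file):
 *
 *   EventNotifier n;
 *   KVMRouteChange c = { .s = s, .changes = 0 };   // assumed setup
 *   event_notifier_init(&n, 0);
 *   int virq = kvm_irqchip_add_msi_route(&c, 0, NULL);
 *   kvm_irqchip_commit_routes(s);
 *   kvm_irqchip_add_irqfd_notifier_gsi(s, &n, NULL, virq);
 *   event_notifier_set(&n);   // injects the MSI via the in-kernel irqchip
 */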
2310 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2311 EventNotifier *rn, int virq)
2312 {
2313 return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2314 }
2315
2316 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2317 int virq)
2318 {
2319 return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2320 }
2321
2322 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2323 EventNotifier *rn, qemu_irq irq)
2324 {
2325 gpointer key, gsi;
2326 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2327
2328 if (!found) {
2329 return -ENXIO;
2330 }
2331 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2332 }
2333
2334 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2335 qemu_irq irq)
2336 {
2337 gpointer key, gsi;
2338 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2339
2340 if (!found) {
2341 return -ENXIO;
2342 }
2343 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2344 }
2345
2346 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2347 {
2348 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2349 }
2350
2351 static void kvm_irqchip_create(KVMState *s)
2352 {
2353 int ret;
2354
2355 assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2356 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2357 ;
2358 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2359 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2360 if (ret < 0) {
2361 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2362 exit(1);
2363 }
2364 } else {
2365 return;
2366 }
2367
2368 if (kvm_check_extension(s, KVM_CAP_IRQFD) <= 0) {
2369 fprintf(stderr, "kvm: irqfd not implemented\n");
2370 exit(1);
2371 }
2372
2373 /* First probe and see if there's an arch-specific hook to create the
2374 * in-kernel irqchip for us */
2375 ret = kvm_arch_irqchip_create(s);
2376 if (ret == 0) {
2377 if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2378 error_report("Split IRQ chip mode not supported.");
2379 exit(1);
2380 } else {
2381 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2382 }
2383 }
2384 if (ret < 0) {
2385 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2386 exit(1);
2387 }
2388
2389 kvm_kernel_irqchip = true;
2390 /* If we have an in-kernel IRQ chip then we must have asynchronous
2391 * interrupt delivery (though the reverse is not necessarily true)
2392 */
2393 kvm_async_interrupts_allowed = true;
2394 kvm_halt_in_kernel_allowed = true;
2395
2396 kvm_init_irq_routing(s);
2397
2398 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2399 }
2400
2401 /* Find number of supported CPUs using the recommended
2402 * procedure from the kernel API documentation to cope with
2403 * older kernels that may be missing capabilities.
2404 */
2405 static int kvm_recommended_vcpus(KVMState *s)
2406 {
2407 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2408 return (ret) ? ret : 4;
2409 }
2410
2411 static int kvm_max_vcpus(KVMState *s)
2412 {
2413 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
2414 return (ret) ? ret : kvm_recommended_vcpus(s);
2415 }
2416
2417 static int kvm_max_vcpu_id(KVMState *s)
2418 {
2419 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2420 return (ret) ? ret : kvm_max_vcpus(s);
2421 }
2422
2423 bool kvm_vcpu_id_is_valid(int vcpu_id)
2424 {
2425 KVMState *s = KVM_STATE(current_accel());
2426 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2427 }
2428
2429 bool kvm_dirty_ring_enabled(void)
2430 {
2431 return kvm_state && kvm_state->kvm_dirty_ring_size;
2432 }
2433
2434 static void query_stats_cb(StatsResultList **result, StatsTarget target,
2435 strList *names, strList *targets, Error **errp);
2436 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
2437
2438 uint32_t kvm_dirty_ring_size(void)
2439 {
2440 return kvm_state->kvm_dirty_ring_size;
2441 }
2442
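/*
 * Issue KVM_CREATE_VM, retrying on EINTR, and print arch-specific hints
 * when the ioctl fails with EINVAL. Returns the VM fd or a negative errno.
 */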
2443 static int do_kvm_create_vm(MachineState *ms, int type)
2444 {
2445 KVMState *s;
2446 int ret;
2447
2448 s = KVM_STATE(ms->accelerator);
2449
2450 do {
2451 ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2452 } while (ret == -EINTR);
2453
2454 if (ret < 0) {
2455 error_report("ioctl(KVM_CREATE_VM) failed: %s", strerror(-ret));
2456
2457 #ifdef TARGET_S390X
2458 if (ret == -EINVAL) {
2459 error_printf("Host kernel setup problem detected."
2460 " Please verify:\n");
2461 error_printf("- for kernels supporting the"
2462 " switch_amode or user_mode parameters, whether");
2463 error_printf(" user space is running in primary address space\n");
2464 error_printf("- for kernels supporting the vm.allocate_pgste"
2465 " sysctl, whether it is enabled\n");
2466 }
2467 #elif defined(TARGET_PPC)
2468 if (ret == -EINVAL) {
2469 error_printf("PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2470 (type == 2) ? "pr" : "hv");
2471 }
2472 #endif
2473 }
2474
2475 return ret;
2476 }
2477
2478 static int find_kvm_machine_type(MachineState *ms)
2479 {
2480 MachineClass *mc = MACHINE_GET_CLASS(ms);
2481 int type;
2482
2483 if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2484 g_autofree char *kvm_type;
2485 kvm_type = object_property_get_str(OBJECT(current_machine),
2486 "kvm-type",
2487 &error_abort);
2488 type = mc->kvm_type(ms, kvm_type);
2489 } else if (mc->kvm_type) {
2490 type = mc->kvm_type(ms, NULL);
2491 } else {
2492 type = kvm_arch_get_default_type(ms);
2493 }
2494 return type;
2495 }
2496
2497 static int kvm_setup_dirty_ring(KVMState *s)
2498 {
2499 uint64_t dirty_log_manual_caps;
2500 int ret;
2501
2502 /*
2503 * Enable KVM dirty ring if supported, otherwise fall back to
2504 * dirty logging mode
2505 */
2506 ret = kvm_dirty_ring_init(s);
2507 if (ret < 0) {
2508 return ret;
2509 }
2510
2511 /*
2512 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2513 * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2514 * page is wr-protected initially, which conflicts with how the kvm dirty
2515 * ring is used - the dirty ring requires all pages to be wr-protected at
2516 * the very beginning. Enabling this feature for the dirty ring causes data corruption.
2517 *
2518 * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2519 * we may expect a higher stall time when starting the migration. In the
2520 * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2521 * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2522 * guest pages.
2523 */
2524 if (!s->kvm_dirty_ring_size) {
2525 dirty_log_manual_caps =
2526 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2527 dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2528 KVM_DIRTY_LOG_INITIALLY_SET);
2529 s->manual_dirty_log_protect = dirty_log_manual_caps;
2530 if (dirty_log_manual_caps) {
2531 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2532 dirty_log_manual_caps);
2533 if (ret) {
2534 warn_report("Trying to enable capability %"PRIu64" of "
2535 "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
2536 "Falling back to the legacy mode. ",
2537 dirty_log_manual_caps);
2538 s->manual_dirty_log_protect = 0;
2539 }
2540 }
2541 }
2542
2543 return 0;
2544 }
2545
2546 static int kvm_init(MachineState *ms)
2547 {
2548 MachineClass *mc = MACHINE_GET_CLASS(ms);
2549 static const char upgrade_note[] =
2550 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2551 "(see http://sourceforge.net/projects/kvm).\n";
2552 const struct {
2553 const char *name;
2554 int num;
2555 } num_cpus[] = {
2556 { "SMP", ms->smp.cpus },
2557 { "hotpluggable", ms->smp.max_cpus },
2558 { /* end of list */ }
2559 }, *nc = num_cpus;
2560 int soft_vcpus_limit, hard_vcpus_limit;
2561 KVMState *s;
2562 const KVMCapabilityInfo *missing_cap;
2563 int ret;
2564 int type;
2565
2566 qemu_mutex_init(&kml_slots_lock);
2567
2568 s = KVM_STATE(ms->accelerator);
2569
2570 /*
2571 * On systems where the kernel can support different base page
2572 * sizes, host page size may be different from TARGET_PAGE_SIZE,
2573 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum
2574 * page size for the system though.
2575 */
2576 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size());
2577
2578 s->sigmask_len = 8;
2579 accel_blocker_init();
2580
2581 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2582 QTAILQ_INIT(&s->kvm_sw_breakpoints);
2583 #endif
2584 QLIST_INIT(&s->kvm_parked_vcpus);
2585 s->fd = qemu_open_old(s->device ?: "/dev/kvm", O_RDWR);
2586 if (s->fd == -1) {
2587 error_report("Could not access KVM kernel module: %m");
2588 ret = -errno;
2589 goto err;
2590 }
2591
2592 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2593 if (ret < KVM_API_VERSION) {
2594 if (ret >= 0) {
2595 ret = -EINVAL;
2596 }
2597 error_report("kvm version too old");
2598 goto err;
2599 }
2600
2601 if (ret > KVM_API_VERSION) {
2602 ret = -EINVAL;
2603 error_report("kvm version not supported");
2604 goto err;
2605 }
2606
2607 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2608 s->nr_slots_max = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2609
2610 /* If unspecified, use the default value */
2611 if (!s->nr_slots_max) {
2612 s->nr_slots_max = KVM_MEMSLOTS_NR_MAX_DEFAULT;
2613 }
2614
2615 type = find_kvm_machine_type(ms);
2616 if (type < 0) {
2617 ret = -EINVAL;
2618 goto err;
2619 }
2620
2621 ret = do_kvm_create_vm(ms, type);
2622 if (ret < 0) {
2623 goto err;
2624 }
2625
2626 s->vmfd = ret;
2627
2628 s->nr_as = kvm_vm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2629 if (s->nr_as <= 1) {
2630 s->nr_as = 1;
2631 }
2632 s->as = g_new0(struct KVMAs, s->nr_as);
2633
2634 /* check the vcpu limits */
2635 soft_vcpus_limit = kvm_recommended_vcpus(s);
2636 hard_vcpus_limit = kvm_max_vcpus(s);
2637
2638 while (nc->name) {
2639 if (nc->num > soft_vcpus_limit) {
2640 warn_report("Number of %s cpus requested (%d) exceeds "
2641 "the recommended cpus supported by KVM (%d)",
2642 nc->name, nc->num, soft_vcpus_limit);
2643
2644 if (nc->num > hard_vcpus_limit) {
2645 error_report("Number of %s cpus requested (%d) exceeds "
2646 "the maximum cpus supported by KVM (%d)",
2647 nc->name, nc->num, hard_vcpus_limit);
2648 exit(1);
2649 }
2650 }
2651 nc++;
2652 }
2653
2654 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2655 if (!missing_cap) {
2656 missing_cap =
2657 kvm_check_extension_list(s, kvm_arch_required_capabilities);
2658 }
2659 if (missing_cap) {
2660 ret = -EINVAL;
2661 error_report("kvm does not support %s", missing_cap->name);
2662 error_printf("%s", upgrade_note);
2663 goto err;
2664 }
2665
2666 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2667 s->coalesced_pio = s->coalesced_mmio &&
2668 kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2669
2670 ret = kvm_setup_dirty_ring(s);
2671 if (ret < 0) {
2672 goto err;
2673 }
2674
2675 #ifdef KVM_CAP_VCPU_EVENTS
2676 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2677 #endif
2678 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2679
2680 s->irq_set_ioctl = KVM_IRQ_LINE;
2681 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2682 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2683 }
2684
2685 kvm_readonly_mem_allowed =
2686 (kvm_vm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2687
2688 kvm_resamplefds_allowed =
2689 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2690
2691 kvm_vm_attributes_allowed =
2692 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2693
2694 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
2695 kvm_has_guest_debug =
2696 (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0);
2697 #endif
2698
2699 kvm_sstep_flags = 0;
2700 if (kvm_has_guest_debug) {
2701 kvm_sstep_flags = SSTEP_ENABLE;
2702
2703 #if defined TARGET_KVM_HAVE_GUEST_DEBUG
2704 int guest_debug_flags =
2705 kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2);
2706
2707 if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) {
2708 kvm_sstep_flags |= SSTEP_NOIRQ;
2709 }
2710 #endif
2711 }
2712
2713 kvm_state = s;
2714
2715 ret = kvm_arch_init(ms, s);
2716 if (ret < 0) {
2717 goto err;
2718 }
2719
2720 kvm_supported_memory_attributes = kvm_vm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
2721 kvm_guest_memfd_supported =
2722 kvm_check_extension(s, KVM_CAP_GUEST_MEMFD) &&
2723 kvm_check_extension(s, KVM_CAP_USER_MEMORY2) &&
2724 (kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE);
2725
2726 if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2727 s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2728 }
2729
2730 qemu_register_reset(kvm_unpoison_all, NULL);
2731
2732 if (s->kernel_irqchip_allowed) {
2733 kvm_irqchip_create(s);
2734 }
2735
2736 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2737 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2738 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2739 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2740
2741 kvm_memory_listener_register(s, &s->memory_listener,
2742 &address_space_memory, 0, "kvm-memory");
2743 memory_listener_register(&kvm_io_listener,
2744 &address_space_io);
2745
2746 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2747 if (!s->sync_mmu) {
2748 ret = ram_block_discard_disable(true);
2749 assert(!ret);
2750 }
2751
2752 if (s->kvm_dirty_ring_size) {
2753 kvm_dirty_ring_reaper_init(s);
2754 }
2755
2756 if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) {
2757 add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb,
2758 query_stats_schemas_cb);
2759 }
2760
2761 return 0;
2762
2763 err:
2764 assert(ret < 0);
2765 if (s->vmfd >= 0) {
2766 close(s->vmfd);
2767 }
2768 if (s->fd != -1) {
2769 close(s->fd);
2770 }
2771 g_free(s->as);
2772 g_free(s->memory_listener.slots);
2773
2774 return ret;
2775 }
2776
2777 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2778 {
2779 s->sigmask_len = sigmask_len;
2780 }
2781
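/*
 * Complete a KVM_EXIT_IO exit: perform @count port accesses of @size bytes
 * each, reading from or writing to the data buffer shared with the kernel.
 */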
2782 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2783 int size, uint32_t count)
2784 {
2785 int i;
2786 uint8_t *ptr = data;
2787
2788 for (i = 0; i < count; i++) {
2789 address_space_rw(&address_space_io, port, attrs,
2790 ptr, size,
2791 direction == KVM_EXIT_IO_OUT);
2792 ptr += size;
2793 }
2794 }
2795
2796 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2797 {
2798 int i;
2799
2800 fprintf(stderr, "KVM internal error. Suberror: %d\n",
2801 run->internal.suberror);
2802
2803 for (i = 0; i < run->internal.ndata; ++i) {
2804 fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2805 i, (uint64_t)run->internal.data[i]);
2806 }
2807 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2808 fprintf(stderr, "emulation failure\n");
2809 if (!kvm_arch_stop_on_emulation_error(cpu)) {
2810 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2811 return EXCP_INTERRUPT;
2812 }
2813 }
2814 /* FIXME: Should trigger a qmp message to let management know
2815 * something went wrong.
2816 */
2817 return -1;
2818 }
2819
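/*
 * Drain the coalesced MMIO/PIO ring shared with the kernel, replaying each
 * buffered write into the corresponding address space. The write barrier
 * makes sure an entry is fully consumed before ring->first is advanced.
 */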
2820 void kvm_flush_coalesced_mmio_buffer(void)
2821 {
2822 KVMState *s = kvm_state;
2823
2824 if (!s || s->coalesced_flush_in_progress) {
2825 return;
2826 }
2827
2828 s->coalesced_flush_in_progress = true;
2829
2830 if (s->coalesced_mmio_ring) {
2831 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2832 while (ring->first != ring->last) {
2833 struct kvm_coalesced_mmio *ent;
2834
2835 ent = &ring->coalesced_mmio[ring->first];
2836
2837 if (ent->pio == 1) {
2838 address_space_write(&address_space_io, ent->phys_addr,
2839 MEMTXATTRS_UNSPECIFIED, ent->data,
2840 ent->len);
2841 } else {
2842 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2843 }
2844 smp_wmb();
2845 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2846 }
2847 }
2848
2849 s->coalesced_flush_in_progress = false;
2850 }
2851
2852 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2853 {
2854 if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2855 Error *err = NULL;
2856 int ret = kvm_arch_get_registers(cpu, &err);
2857 if (ret) {
2858 if (err) {
2859 error_reportf_err(err, "Failed to synchronize CPU state: ");
2860 } else {
2861 error_report("Failed to get registers: %s", strerror(-ret));
2862 }
2863
2864 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2865 vm_stop(RUN_STATE_INTERNAL_ERROR);
2866 }
2867
2868 cpu->vcpu_dirty = true;
2869 }
2870 }
2871
2872 void kvm_cpu_synchronize_state(CPUState *cpu)
2873 {
2874 if (!cpu->vcpu_dirty && !kvm_state->guest_state_protected) {
2875 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2876 }
2877 }
2878
2879 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2880 {
2881 Error *err = NULL;
2882 int ret = kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE, &err);
2883 if (ret) {
2884 if (err) {
2885 error_reportf_err(err, "Restoring registers after reset: ");
2886 } else {
2887 error_report("Failed to put registers after reset: %s",
2888 strerror(-ret));
2889 }
2890 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2891 vm_stop(RUN_STATE_INTERNAL_ERROR);
2892 }
2893
2894 cpu->vcpu_dirty = false;
2895 }
2896
2897 void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2898 {
2899 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2900 }
2901
2902 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2903 {
2904 Error *err = NULL;
2905 int ret = kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE, &err);
2906 if (ret) {
2907 if (err) {
2908 error_reportf_err(err, "Putting registers after init: ");
2909 } else {
2910 error_report("Failed to put registers after init: %s",
2911 strerror(-ret));
2912 }
2913 exit(1);
2914 }
2915
2916 cpu->vcpu_dirty = false;
2917 }
2918
2919 void kvm_cpu_synchronize_post_init(CPUState *cpu)
2920 {
2921 if (!kvm_state->guest_state_protected) {
2922 /*
2923 * This runs before the machine_init_done notifiers, and is the last
2924 * opportunity to synchronize the state of confidential guests.
2925 */
2926 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2927 }
2928 }
2929
2930 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2931 {
2932 cpu->vcpu_dirty = true;
2933 }
2934
2935 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2936 {
2937 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2938 }
2939
2940 #ifdef KVM_HAVE_MCE_INJECTION
2941 static __thread void *pending_sigbus_addr;
2942 static __thread int pending_sigbus_code;
2943 static __thread bool have_sigbus_pending;
2944 #endif
2945
2946 static void kvm_cpu_kick(CPUState *cpu)
2947 {
2948 qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2949 }
2950
2951 static void kvm_cpu_kick_self(void)
2952 {
2953 if (kvm_immediate_exit) {
2954 kvm_cpu_kick(current_cpu);
2955 } else {
2956 qemu_cpu_kick_self();
2957 }
2958 }
2959
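/*
 * With KVM_CAP_IMMEDIATE_EXIT, just clear kvm_run->immediate_exit; without
 * it, drain any pending SIG_IPI so a stale signal cannot interrupt the next
 * KVM_RUN.
 */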
2960 static void kvm_eat_signals(CPUState *cpu)
2961 {
2962 struct timespec ts = { 0, 0 };
2963 siginfo_t siginfo;
2964 sigset_t waitset;
2965 sigset_t chkset;
2966 int r;
2967
2968 if (kvm_immediate_exit) {
2969 qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2970 /* Write kvm_run->immediate_exit before the cpu->exit_request
2971 * write in kvm_cpu_exec.
2972 */
2973 smp_wmb();
2974 return;
2975 }
2976
2977 sigemptyset(&waitset);
2978 sigaddset(&waitset, SIG_IPI);
2979
2980 do {
2981 r = sigtimedwait(&waitset, &siginfo, &ts);
2982 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
2983 perror("sigtimedwait");
2984 exit(1);
2985 }
2986
2987 r = sigpending(&chkset);
2988 if (r == -1) {
2989 perror("sigpending");
2990 exit(1);
2991 }
2992 } while (sigismember(&chkset, SIG_IPI));
2993 }
2994
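/*
 * Convert a page-aligned guest range between private and shared for
 * guest_memfd-backed regions: update the KVM memory attributes and discard
 * the backing pages that are no longer in use.
 */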
2995 int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
2996 {
2997 MemoryRegionSection section;
2998 ram_addr_t offset;
2999 MemoryRegion *mr;
3000 RAMBlock *rb;
3001 void *addr;
3002 int ret = -1;
3003
3004 trace_kvm_convert_memory(start, size, to_private ? "shared_to_private" : "private_to_shared");
3005
3006 if (!QEMU_PTR_IS_ALIGNED(start, qemu_real_host_page_size()) ||
3007 !QEMU_PTR_IS_ALIGNED(size, qemu_real_host_page_size())) {
3008 return -1;
3009 }
3010
3011 if (!size) {
3012 return -1;
3013 }
3014
3015 section = memory_region_find(get_system_memory(), start, size);
3016 mr = section.mr;
3017 if (!mr) {
3018 /*
3019 * Ignore converting non-assigned region to shared.
3020 *
3021 * TDX requires vMMIO regions to be shared so that #VE can be injected into the guest.
3022 * OVMF conservatively issues MapGPA(shared) on the 32bit PCI MMIO region,
3023 * and vIO-APIC 0xFEC00000 4K page.
3024 * OVMF assigns 32bit PCI MMIO region to
3025 * [top of low memory: typically 2GB=0xC000000, 0xFC00000)
3026 */
3027 if (!to_private) {
3028 return 0;
3029 }
3030 return -1;
3031 }
3032
3033 if (!memory_region_has_guest_memfd(mr)) {
3034 /*
3035 * Because vMMIO regions must be shared, the guest TD may explicitly
3036 * convert a vMMIO region to shared. Don't complain in that case. See
3037 * memory_region_type() for checking whether the region is an MMIO region.
3038 */
3039 if (!to_private &&
3040 !memory_region_is_ram(mr) &&
3041 !memory_region_is_ram_device(mr) &&
3042 !memory_region_is_rom(mr) &&
3043 !memory_region_is_romd(mr)) {
3044 ret = 0;
3045 } else {
3046 error_report("Convert non guest_memfd backed memory region "
3047 "(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",
3048 start, size, to_private ? "private" : "shared");
3049 }
3050 goto out_unref;
3051 }
3052
3053 if (to_private) {
3054 ret = kvm_set_memory_attributes_private(start, size);
3055 } else {
3056 ret = kvm_set_memory_attributes_shared(start, size);
3057 }
3058 if (ret) {
3059 goto out_unref;
3060 }
3061
3062 addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
3063 rb = qemu_ram_block_from_host(addr, false, &offset);
3064
3065 if (to_private) {
3066 if (rb->page_size != qemu_real_host_page_size()) {
3067 /*
3068 * shared memory is backed by hugetlb, which is supposed to be
3069 * pre-allocated and doesn't need to be discarded
3070 */
3071 goto out_unref;
3072 }
3073 ret = ram_block_discard_range(rb, offset, size);
3074 } else {
3075 ret = ram_block_discard_guest_memfd_range(rb, offset, size);
3076 }
3077
3078 out_unref:
3079 memory_region_unref(mr);
3080 return ret;
3081 }
3082
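/*
 * Main vCPU run loop: flush dirty register state, enter KVM_RUN outside the
 * BQL, and dispatch each exit reason until an error or an event that needs
 * the main loop (EXCP_INTERRUPT/EXCP_HLT) is returned.
 */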
3083 int kvm_cpu_exec(CPUState *cpu)
3084 {
3085 struct kvm_run *run = cpu->kvm_run;
3086 int ret, run_ret;
3087
3088 trace_kvm_cpu_exec();
3089
3090 if (kvm_arch_process_async_events(cpu)) {
3091 qatomic_set(&cpu->exit_request, 0);
3092 return EXCP_HLT;
3093 }
3094
3095 bql_unlock();
3096 cpu_exec_start(cpu);
3097
3098 do {
3099 MemTxAttrs attrs;
3100
3101 if (cpu->vcpu_dirty) {
3102 Error *err = NULL;
3103 ret = kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE, &err);
3104 if (ret) {
3105 if (err) {
3106 error_reportf_err(err, "Putting runtime registers: ");
3107 } else {
3108 error_report("Failed to put runtime registers: %s",
3109 strerror(-ret));
3110 }
3111 ret = -1;
3112 break;
3113 }
3114
3115 cpu->vcpu_dirty = false;
3116 }
3117
3118 kvm_arch_pre_run(cpu, run);
3119 if (qatomic_read(&cpu->exit_request)) {
3120 trace_kvm_interrupt_exit_request();
3121 /*
3122 * KVM requires us to reenter the kernel after IO exits to complete
3123 * instruction emulation. This self-signal will ensure that we
3124 * leave ASAP again.
3125 */
3126 kvm_cpu_kick_self();
3127 }
3128
3129 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
3130 * Matching barrier in kvm_eat_signals.
3131 */
3132 smp_rmb();
3133
3134 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
3135
3136 attrs = kvm_arch_post_run(cpu, run);
3137
3138 #ifdef KVM_HAVE_MCE_INJECTION
3139 if (unlikely(have_sigbus_pending)) {
3140 bql_lock();
3141 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
3142 pending_sigbus_addr);
3143 have_sigbus_pending = false;
3144 bql_unlock();
3145 }
3146 #endif
3147
3148 if (run_ret < 0) {
3149 if (run_ret == -EINTR || run_ret == -EAGAIN) {
3150 trace_kvm_io_window_exit();
3151 kvm_eat_signals(cpu);
3152 ret = EXCP_INTERRUPT;
3153 break;
3154 }
3155 if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
3156 fprintf(stderr, "error: kvm run failed %s\n",
3157 strerror(-run_ret));
3158 #ifdef TARGET_PPC
3159 if (run_ret == -EBUSY) {
3160 fprintf(stderr,
3161 "This is probably because your SMT is enabled.\n"
3162 "VCPU can only run on primary threads with all "
3163 "secondary threads offline.\n");
3164 }
3165 #endif
3166 ret = -1;
3167 break;
3168 }
3169 }
3170
3171 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
3172 switch (run->exit_reason) {
3173 case KVM_EXIT_IO:
3174 /* Called outside BQL */
3175 kvm_handle_io(run->io.port, attrs,
3176 (uint8_t *)run + run->io.data_offset,
3177 run->io.direction,
3178 run->io.size,
3179 run->io.count);
3180 ret = 0;
3181 break;
3182 case KVM_EXIT_MMIO:
3183 /* Called outside BQL */
3184 address_space_rw(&address_space_memory,
3185 run->mmio.phys_addr, attrs,
3186 run->mmio.data,
3187 run->mmio.len,
3188 run->mmio.is_write);
3189 ret = 0;
3190 break;
3191 case KVM_EXIT_IRQ_WINDOW_OPEN:
3192 ret = EXCP_INTERRUPT;
3193 break;
3194 case KVM_EXIT_SHUTDOWN:
3195 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3196 ret = EXCP_INTERRUPT;
3197 break;
3198 case KVM_EXIT_UNKNOWN:
3199 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
3200 (uint64_t)run->hw.hardware_exit_reason);
3201 ret = -1;
3202 break;
3203 case KVM_EXIT_INTERNAL_ERROR:
3204 ret = kvm_handle_internal_error(cpu, run);
3205 break;
3206 case KVM_EXIT_DIRTY_RING_FULL:
3207 /*
3208 * We shouldn't continue if the dirty ring of this vcpu is
3209 * still full. Got kicked by KVM_RESET_DIRTY_RINGS.
3210 */
3211 trace_kvm_dirty_ring_full(cpu->cpu_index);
3212 bql_lock();
3213 /*
3214 * We throttle the vCPU by making it sleep once it exits from the kernel
3215 * due to a full dirty ring. In the dirtylimit scenario, reaping
3216 * all vCPUs after a single vCPU's dirty ring fills up would skip
3217 * that sleep, so only reap the vCPU whose ring is full.
3218 */
3219 if (dirtylimit_in_service()) {
3220 kvm_dirty_ring_reap(kvm_state, cpu);
3221 } else {
3222 kvm_dirty_ring_reap(kvm_state, NULL);
3223 }
3224 bql_unlock();
3225 dirtylimit_vcpu_execute(cpu);
3226 ret = 0;
3227 break;
3228 case KVM_EXIT_SYSTEM_EVENT:
3229 trace_kvm_run_exit_system_event(cpu->cpu_index, run->system_event.type);
3230 switch (run->system_event.type) {
3231 case KVM_SYSTEM_EVENT_SHUTDOWN:
3232 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
3233 ret = EXCP_INTERRUPT;
3234 break;
3235 case KVM_SYSTEM_EVENT_RESET:
3236 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
3237 ret = EXCP_INTERRUPT;
3238 break;
3239 case KVM_SYSTEM_EVENT_CRASH:
3240 kvm_cpu_synchronize_state(cpu);
3241 bql_lock();
3242 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
3243 bql_unlock();
3244 ret = 0;
3245 break;
3246 default:
3247 ret = kvm_arch_handle_exit(cpu, run);
3248 break;
3249 }
3250 break;
3251 case KVM_EXIT_MEMORY_FAULT:
3252 trace_kvm_memory_fault(run->memory_fault.gpa,
3253 run->memory_fault.size,
3254 run->memory_fault.flags);
3255 if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
3256 error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
3257 (uint64_t)run->memory_fault.flags);
3258 ret = -1;
3259 break;
3260 }
3261 ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
3262 run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
3263 break;
3264 default:
3265 ret = kvm_arch_handle_exit(cpu, run);
3266 break;
3267 }
3268 } while (ret == 0);
3269
3270 cpu_exec_end(cpu);
3271 bql_lock();
3272
3273 if (ret < 0) {
3274 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
3275 vm_stop(RUN_STATE_INTERNAL_ERROR);
3276 }
3277
3278 qatomic_set(&cpu->exit_request, 0);
3279 return ret;
3280 }
3281
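/*
 * Thin wrappers around ioctl() on the /dev/kvm, VM, vCPU and device fds.
 * They return the ioctl result, converted to a negative errno on failure.
 */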
3282 int kvm_ioctl(KVMState *s, unsigned long type, ...)
3283 {
3284 int ret;
3285 void *arg;
3286 va_list ap;
3287
3288 va_start(ap, type);
3289 arg = va_arg(ap, void *);
3290 va_end(ap);
3291
3292 trace_kvm_ioctl(type, arg);
3293 ret = ioctl(s->fd, type, arg);
3294 if (ret == -1) {
3295 ret = -errno;
3296 }
3297 return ret;
3298 }
3299
3300 int kvm_vm_ioctl(KVMState *s, unsigned long type, ...)
3301 {
3302 int ret;
3303 void *arg;
3304 va_list ap;
3305
3306 va_start(ap, type);
3307 arg = va_arg(ap, void *);
3308 va_end(ap);
3309
3310 trace_kvm_vm_ioctl(type, arg);
3311 accel_ioctl_begin();
3312 ret = ioctl(s->vmfd, type, arg);
3313 accel_ioctl_end();
3314 if (ret == -1) {
3315 ret = -errno;
3316 }
3317 return ret;
3318 }
3319
3320 int kvm_vcpu_ioctl(CPUState *cpu, unsigned long type, ...)
3321 {
3322 int ret;
3323 void *arg;
3324 va_list ap;
3325
3326 va_start(ap, type);
3327 arg = va_arg(ap, void *);
3328 va_end(ap);
3329
3330 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3331 accel_cpu_ioctl_begin(cpu);
3332 ret = ioctl(cpu->kvm_fd, type, arg);
3333 accel_cpu_ioctl_end(cpu);
3334 if (ret == -1) {
3335 ret = -errno;
3336 }
3337 return ret;
3338 }
3339
3340 int kvm_device_ioctl(int fd, unsigned long type, ...)
3341 {
3342 int ret;
3343 void *arg;
3344 va_list ap;
3345
3346 va_start(ap, type);
3347 arg = va_arg(ap, void *);
3348 va_end(ap);
3349
3350 trace_kvm_device_ioctl(fd, type, arg);
3351 accel_ioctl_begin();
3352 ret = ioctl(fd, type, arg);
3353 accel_ioctl_end();
3354 if (ret == -1) {
3355 ret = -errno;
3356 }
3357 return ret;
3358 }
3359
3360 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3361 {
3362 int ret;
3363 struct kvm_device_attr attribute = {
3364 .group = group,
3365 .attr = attr,
3366 };
3367
3368 if (!kvm_vm_attributes_allowed) {
3369 return 0;
3370 }
3371
3372 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3373 /* kvm returns 0 on success for HAS_DEVICE_ATTR */
3374 return ret ? 0 : 1;
3375 }
3376
3377 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3378 {
3379 struct kvm_device_attr attribute = {
3380 .group = group,
3381 .attr = attr,
3382 .flags = 0,
3383 };
3384
3385 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3386 }
3387
3388 int kvm_device_access(int fd, int group, uint64_t attr,
3389 void *val, bool write, Error **errp)
3390 {
3391 struct kvm_device_attr kvmattr;
3392 int err;
3393
3394 kvmattr.flags = 0;
3395 kvmattr.group = group;
3396 kvmattr.attr = attr;
3397 kvmattr.addr = (uintptr_t)val;
3398
3399 err = kvm_device_ioctl(fd,
3400 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3401 &kvmattr);
3402 if (err < 0) {
3403 error_setg_errno(errp, -err,
3404 "KVM_%s_DEVICE_ATTR failed: Group %d "
3405 "attr 0x%016" PRIx64,
3406 write ? "SET" : "GET", group, attr);
3407 }
3408 return err;
3409 }
3410
3411 bool kvm_has_sync_mmu(void)
3412 {
3413 return kvm_state->sync_mmu;
3414 }
3415
3416 int kvm_has_vcpu_events(void)
3417 {
3418 return kvm_state->vcpu_events;
3419 }
3420
3421 int kvm_max_nested_state_length(void)
3422 {
3423 return kvm_state->max_nested_state_len;
3424 }
3425
3426 int kvm_has_gsi_routing(void)
3427 {
3428 #ifdef KVM_CAP_IRQ_ROUTING
3429 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
3430 #else
3431 return false;
3432 #endif
3433 }
3434
3435 bool kvm_arm_supports_user_irq(void)
3436 {
3437 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3438 }
3439
3440 #ifdef TARGET_KVM_HAVE_GUEST_DEBUG
3441 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, vaddr pc)
3442 {
3443 struct kvm_sw_breakpoint *bp;
3444
3445 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3446 if (bp->pc == pc) {
3447 return bp;
3448 }
3449 }
3450 return NULL;
3451 }
3452
3453 int kvm_sw_breakpoints_active(CPUState *cpu)
3454 {
3455 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3456 }
3457
3458 struct kvm_set_guest_debug_data {
3459 struct kvm_guest_debug dbg;
3460 int err;
3461 };
3462
3463 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3464 {
3465 struct kvm_set_guest_debug_data *dbg_data =
3466 (struct kvm_set_guest_debug_data *) data.host_ptr;
3467
3468 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3469 &dbg_data->dbg);
3470 }
3471
3472 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3473 {
3474 struct kvm_set_guest_debug_data data;
3475
3476 data.dbg.control = reinject_trap;
3477
3478 if (cpu->singlestep_enabled) {
3479 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3480
3481 if (cpu->singlestep_enabled & SSTEP_NOIRQ) {
3482 data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ;
3483 }
3484 }
3485 kvm_arch_update_guest_debug(cpu, &data.dbg);
3486
3487 run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3488 RUN_ON_CPU_HOST_PTR(&data));
3489 return data.err;
3490 }
3491
3492 bool kvm_supports_guest_debug(void)
3493 {
3494 /* probed during kvm_init() */
3495 return kvm_has_guest_debug;
3496 }
3497
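/*
 * Insert a software or hardware breakpoint. Software breakpoints are
 * refcounted per address; after any change the guest debug state is
 * re-synced on every vCPU via kvm_update_guest_debug().
 */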
3498 int kvm_insert_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3499 {
3500 struct kvm_sw_breakpoint *bp;
3501 int err;
3502
3503 if (type == GDB_BREAKPOINT_SW) {
3504 bp = kvm_find_sw_breakpoint(cpu, addr);
3505 if (bp) {
3506 bp->use_count++;
3507 return 0;
3508 }
3509
3510 bp = g_new(struct kvm_sw_breakpoint, 1);
3511 bp->pc = addr;
3512 bp->use_count = 1;
3513 err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3514 if (err) {
3515 g_free(bp);
3516 return err;
3517 }
3518
3519 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3520 } else {
3521 err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3522 if (err) {
3523 return err;
3524 }
3525 }
3526
3527 CPU_FOREACH(cpu) {
3528 err = kvm_update_guest_debug(cpu, 0);
3529 if (err) {
3530 return err;
3531 }
3532 }
3533 return 0;
3534 }
3535
3536 int kvm_remove_breakpoint(CPUState *cpu, int type, vaddr addr, vaddr len)
3537 {
3538 struct kvm_sw_breakpoint *bp;
3539 int err;
3540
3541 if (type == GDB_BREAKPOINT_SW) {
3542 bp = kvm_find_sw_breakpoint(cpu, addr);
3543 if (!bp) {
3544 return -ENOENT;
3545 }
3546
3547 if (bp->use_count > 1) {
3548 bp->use_count--;
3549 return 0;
3550 }
3551
3552 err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3553 if (err) {
3554 return err;
3555 }
3556
3557 QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3558 g_free(bp);
3559 } else {
3560 err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3561 if (err) {
3562 return err;
3563 }
3564 }
3565
3566 CPU_FOREACH(cpu) {
3567 err = kvm_update_guest_debug(cpu, 0);
3568 if (err) {
3569 return err;
3570 }
3571 }
3572 return 0;
3573 }
3574
3575 void kvm_remove_all_breakpoints(CPUState *cpu)
3576 {
3577 struct kvm_sw_breakpoint *bp, *next;
3578 KVMState *s = cpu->kvm_state;
3579 CPUState *tmpcpu;
3580
3581 QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3582 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3583 /* Try harder to find a CPU that currently sees the breakpoint. */
3584 CPU_FOREACH(tmpcpu) {
3585 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3586 break;
3587 }
3588 }
3589 }
3590 QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3591 g_free(bp);
3592 }
3593 kvm_arch_remove_all_hw_breakpoints();
3594
3595 CPU_FOREACH(cpu) {
3596 kvm_update_guest_debug(cpu, 0);
3597 }
3598 }
3599
3600 #endif /* !TARGET_KVM_HAVE_GUEST_DEBUG */
3601
3602 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3603 {
3604 KVMState *s = kvm_state;
3605 struct kvm_signal_mask *sigmask;
3606 int r;
3607
3608 sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3609
3610 sigmask->len = s->sigmask_len;
3611 memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3612 r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3613 g_free(sigmask);
3614
3615 return r;
3616 }
3617
3618 static void kvm_ipi_signal(int sig)
3619 {
3620 if (current_cpu) {
3621 assert(kvm_immediate_exit);
3622 kvm_cpu_kick(current_cpu);
3623 }
3624 }
3625
3626 void kvm_init_cpu_signals(CPUState *cpu)
3627 {
3628 int r;
3629 sigset_t set;
3630 struct sigaction sigact;
3631
3632 memset(&sigact, 0, sizeof(sigact));
3633 sigact.sa_handler = kvm_ipi_signal;
3634 sigaction(SIG_IPI, &sigact, NULL);
3635
3636 pthread_sigmask(SIG_BLOCK, NULL, &set);
3637 #if defined KVM_HAVE_MCE_INJECTION
3638 sigdelset(&set, SIGBUS);
3639 pthread_sigmask(SIG_SETMASK, &set, NULL);
3640 #endif
3641 sigdelset(&set, SIG_IPI);
3642 if (kvm_immediate_exit) {
3643 r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3644 } else {
3645 r = kvm_set_signal_mask(cpu, &set);
3646 }
3647 if (r) {
3648 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3649 exit(1);
3650 }
3651 }
3652
3653 /* Called asynchronously in VCPU thread. */
3654 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3655 {
3656 #ifdef KVM_HAVE_MCE_INJECTION
3657 if (have_sigbus_pending) {
3658 return 1;
3659 }
3660 have_sigbus_pending = true;
3661 pending_sigbus_addr = addr;
3662 pending_sigbus_code = code;
3663 qatomic_set(&cpu->exit_request, 1);
3664 return 0;
3665 #else
3666 return 1;
3667 #endif
3668 }
3669
3670 /* Called synchronously (via signalfd) in main thread. */
3671 int kvm_on_sigbus(int code, void *addr)
3672 {
3673 #ifdef KVM_HAVE_MCE_INJECTION
3674 /* An action-required MCE kills the process if SIGBUS is blocked, and
3675 * SIGBUS is blocked in the I/O thread where we handle MCEs via signalfd,
3676 * so we can only get action-optional MCEs here.
3677 */
3678 assert(code != BUS_MCEERR_AR);
3679 kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
3680 return 0;
3681 #else
3682 return 1;
3683 #endif
3684 }
3685
3686 int kvm_create_device(KVMState *s, uint64_t type, bool test)
3687 {
3688 int ret;
3689 struct kvm_create_device create_dev;
3690
3691 create_dev.type = type;
3692 create_dev.fd = -1;
3693 create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3694
3695 if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3696 return -ENOTSUP;
3697 }
3698
3699 ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3700 if (ret) {
3701 return ret;
3702 }
3703
3704 return test ? 0 : create_dev.fd;
3705 }
3706
3707 bool kvm_device_supported(int vmfd, uint64_t type)
3708 {
3709 struct kvm_create_device create_dev = {
3710 .type = type,
3711 .fd = -1,
3712 .flags = KVM_CREATE_DEVICE_TEST,
3713 };
3714
3715 if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3716 return false;
3717 }
3718
3719 return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3720 }
3721
3722 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3723 {
3724 struct kvm_one_reg reg;
3725 int r;
3726
3727 reg.id = id;
3728 reg.addr = (uintptr_t) source;
3729 r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3730 if (r) {
3731 trace_kvm_failed_reg_set(id, strerror(-r));
3732 }
3733 return r;
3734 }
3735
3736 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3737 {
3738 struct kvm_one_reg reg;
3739 int r;
3740
3741 reg.id = id;
3742 reg.addr = (uintptr_t) target;
3743 r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3744 if (r) {
3745 trace_kvm_failed_reg_get(id, strerror(-r));
3746 }
3747 return r;
3748 }
3749
3750 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3751 hwaddr start_addr, hwaddr size)
3752 {
3753 KVMState *kvm = KVM_STATE(ms->accelerator);
3754 int i;
3755
3756 for (i = 0; i < kvm->nr_as; ++i) {
3757 if (kvm->as[i].as == as && kvm->as[i].ml) {
3758 size = MIN(kvm_max_slot_size, size);
3759 return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3760 start_addr, size);
3761 }
3762 }
3763
3764 return false;
3765 }
3766
3767 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3768 const char *name, void *opaque,
3769 Error **errp)
3770 {
3771 KVMState *s = KVM_STATE(obj);
3772 int64_t value = s->kvm_shadow_mem;
3773
3774 visit_type_int(v, name, &value, errp);
3775 }
3776
3777 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3778 const char *name, void *opaque,
3779 Error **errp)
3780 {
3781 KVMState *s = KVM_STATE(obj);
3782 int64_t value;
3783
3784 if (s->fd != -1) {
3785 error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3786 return;
3787 }
3788
3789 if (!visit_type_int(v, name, &value, errp)) {
3790 return;
3791 }
3792
3793 s->kvm_shadow_mem = value;
3794 }
3795
3796 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3797 const char *name, void *opaque,
3798 Error **errp)
3799 {
3800 KVMState *s = KVM_STATE(obj);
3801 OnOffSplit mode;
3802
3803 if (s->fd != -1) {
3804 error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3805 return;
3806 }
3807
3808 if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
3809 return;
3810 }
3811 switch (mode) {
3812 case ON_OFF_SPLIT_ON:
3813 s->kernel_irqchip_allowed = true;
3814 s->kernel_irqchip_required = true;
3815 s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3816 break;
3817 case ON_OFF_SPLIT_OFF:
3818 s->kernel_irqchip_allowed = false;
3819 s->kernel_irqchip_required = false;
3820 s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3821 break;
3822 case ON_OFF_SPLIT_SPLIT:
3823 s->kernel_irqchip_allowed = true;
3824 s->kernel_irqchip_required = true;
3825 s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3826 break;
3827 default:
3828 /* The value was checked in visit_type_OnOffSplit() above. If
3829 * we get here, then something is wrong in QEMU.
3830 */
3831 abort();
3832 }
3833 }
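
/*
 * Example (illustrative): the property registered as "kernel-irqchip" in
 * kvm_accel_class_init() below is normally set from the command line, e.g.
 *
 *     -accel kvm,kernel-irqchip=split
 *
 * which lands in kvm_set_kernel_irqchip() and selects the split irqchip mode.
 */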

bool kvm_kernel_irqchip_allowed(void)
{
    return kvm_state->kernel_irqchip_allowed;
}

bool kvm_kernel_irqchip_required(void)
{
    return kvm_state->kernel_irqchip_required;
}

bool kvm_kernel_irqchip_split(void)
{
    return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
}

static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value = s->kvm_dirty_ring_size;

    visit_type_uint32(v, name, &value, errp);
}

static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
                                    const char *name, void *opaque,
                                    Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    uint32_t value;

    if (s->fd != -1) {
        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
        return;
    }

    if (!visit_type_uint32(v, name, &value, errp)) {
        return;
    }
    if (value & (value - 1)) {
        error_setg(errp, "dirty-ring-size must be a power of two.");
        return;
    }

    s->kvm_dirty_ring_size = value;
}
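
/*
 * Example (illustrative): enabling the dirty ring from the command line,
 * assuming the host kernel advertises dirty ring support:
 *
 *     -accel kvm,dirty-ring-size=4096
 *
 * The value is the number of ring entries and must be a power of two; 0
 * (the default) keeps dirty tracking on the bitmap interface.
 */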

static char *kvm_get_device(Object *obj,
                            Error **errp G_GNUC_UNUSED)
{
    KVMState *s = KVM_STATE(obj);

    return g_strdup(s->device);
}

static void kvm_set_device(Object *obj,
                           const char *value,
                           Error **errp G_GNUC_UNUSED)
{
    KVMState *s = KVM_STATE(obj);

    g_free(s->device);
    s->device = g_strdup(value);
}

static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    s->msr_energy.enable = value;
}

static void kvm_set_kvm_rapl_socket_path(Object *obj,
                                         const char *str,
                                         Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    g_free(s->msr_energy.socket_path);
    s->msr_energy.socket_path = g_strdup(str);
}
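
/*
 * Example (illustrative): the two RAPL-related properties are normally set
 * together on the command line.  The socket path below is a placeholder for
 * wherever the privileged MSR helper daemon is listening.
 *
 *     -accel kvm,rapl=on,rapl-helper-socket=/run/qemu-vmsr-helper.sock
 */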

static void kvm_accel_instance_init(Object *obj)
{
    KVMState *s = KVM_STATE(obj);

    s->fd = -1;
    s->vmfd = -1;
    s->kvm_shadow_mem = -1;
    s->kernel_irqchip_allowed = true;
    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
    /* KVM dirty ring is by default off */
    s->kvm_dirty_ring_size = 0;
    s->kvm_dirty_ring_with_bitmap = false;
    s->kvm_eager_split_size = 0;
    s->notify_vmexit = NOTIFY_VMEXIT_OPTION_RUN;
    s->notify_window = 0;
    s->xen_version = 0;
    s->xen_gnttab_max_frames = 64;
    s->xen_evtchn_max_pirq = 256;
    s->device = NULL;
    s->msr_energy.enable = false;
}

/**
 * kvm_gdbstub_sstep_flags():
 *
 * Returns: SSTEP_* flags that KVM supports for guest debug. The
 * support is probed during kvm_init().
 */
static int kvm_gdbstub_sstep_flags(void)
{
    return kvm_sstep_flags;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;
    ac->gdbstub_supported_sstep_flags = kvm_gdbstub_sstep_flags;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
                              NULL, kvm_set_kernel_irqchip,
                              NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure KVM in-kernel irqchip");

    object_class_property_add(oc, "kvm-shadow-mem", "int",
                              kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
                              NULL, NULL);
    object_class_property_set_description(oc, "kvm-shadow-mem",
        "KVM shadow MMU size");

    object_class_property_add(oc, "dirty-ring-size", "uint32",
                              kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
                              NULL, NULL);
    object_class_property_set_description(oc, "dirty-ring-size",
        "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");

    object_class_property_add_str(oc, "device", kvm_get_device, kvm_set_device);
    object_class_property_set_description(oc, "device",
        "Path to the device node to use (default: /dev/kvm)");

    object_class_property_add_bool(oc, "rapl",
                                   NULL,
                                   kvm_set_kvm_rapl);
    object_class_property_set_description(oc, "rapl",
        "Allow energy-related MSRs for the RAPL interface in the guest");

    object_class_property_add_str(oc, "rapl-helper-socket", NULL,
                                  kvm_set_kvm_rapl_socket_path);
    object_class_property_set_description(oc, "rapl-helper-socket",
        "Socket path for communicating with the Virtual MSR helper daemon");

    kvm_arch_accel_class_init(oc);
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .instance_init = kvm_accel_instance_init,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);

typedef struct StatsArgs {
    union StatsResultsType {
        StatsResultList **stats;
        StatsSchemaList **schema;
    } result;
    strList *names;
    Error **errp;
} StatsArgs;

static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc,
                                    uint64_t *stats_data,
                                    StatsList *stats_list,
                                    Error **errp)
{
    Stats *stats;
    uint64List *val_list = NULL;

    /* Only add stats that we understand. */
    switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
    case KVM_STATS_TYPE_CUMULATIVE:
    case KVM_STATS_TYPE_INSTANT:
    case KVM_STATS_TYPE_PEAK:
    case KVM_STATS_TYPE_LINEAR_HIST:
    case KVM_STATS_TYPE_LOG_HIST:
        break;
    default:
        return stats_list;
    }

    switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
    case KVM_STATS_UNIT_NONE:
    case KVM_STATS_UNIT_BYTES:
    case KVM_STATS_UNIT_CYCLES:
    case KVM_STATS_UNIT_SECONDS:
    case KVM_STATS_UNIT_BOOLEAN:
        break;
    default:
        return stats_list;
    }

    switch (pdesc->flags & KVM_STATS_BASE_MASK) {
    case KVM_STATS_BASE_POW10:
    case KVM_STATS_BASE_POW2:
        break;
    default:
        return stats_list;
    }

    /* Alloc and populate data list */
    stats = g_new0(Stats, 1);
    stats->name = g_strdup(pdesc->name);
    stats->value = g_new0(StatsValue, 1);

    if ((pdesc->flags & KVM_STATS_UNIT_MASK) == KVM_STATS_UNIT_BOOLEAN) {
        stats->value->u.boolean = *stats_data;
        stats->value->type = QTYPE_QBOOL;
    } else if (pdesc->size == 1) {
        stats->value->u.scalar = *stats_data;
        stats->value->type = QTYPE_QNUM;
    } else {
        int i;
        for (i = 0; i < pdesc->size; i++) {
            QAPI_LIST_PREPEND(val_list, stats_data[i]);
        }
        stats->value->u.list = val_list;
        stats->value->type = QTYPE_QLIST;
    }

    QAPI_LIST_PREPEND(stats_list, stats);
    return stats_list;
}

static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc,
                                                 StatsSchemaValueList *list,
                                                 Error **errp)
{
    StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1);
    schema_entry->value = g_new0(StatsSchemaValue, 1);

    switch (pdesc->flags & KVM_STATS_TYPE_MASK) {
    case KVM_STATS_TYPE_CUMULATIVE:
        schema_entry->value->type = STATS_TYPE_CUMULATIVE;
        break;
    case KVM_STATS_TYPE_INSTANT:
        schema_entry->value->type = STATS_TYPE_INSTANT;
        break;
    case KVM_STATS_TYPE_PEAK:
        schema_entry->value->type = STATS_TYPE_PEAK;
        break;
    case KVM_STATS_TYPE_LINEAR_HIST:
        schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM;
        schema_entry->value->bucket_size = pdesc->bucket_size;
        schema_entry->value->has_bucket_size = true;
        break;
    case KVM_STATS_TYPE_LOG_HIST:
        schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM;
        break;
    default:
        goto exit;
    }

    switch (pdesc->flags & KVM_STATS_UNIT_MASK) {
    case KVM_STATS_UNIT_NONE:
        break;
    case KVM_STATS_UNIT_BOOLEAN:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_BOOLEAN;
        break;
    case KVM_STATS_UNIT_BYTES:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_BYTES;
        break;
    case KVM_STATS_UNIT_CYCLES:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_CYCLES;
        break;
    case KVM_STATS_UNIT_SECONDS:
        schema_entry->value->has_unit = true;
        schema_entry->value->unit = STATS_UNIT_SECONDS;
        break;
    default:
        goto exit;
    }

    schema_entry->value->exponent = pdesc->exponent;
    if (pdesc->exponent) {
        switch (pdesc->flags & KVM_STATS_BASE_MASK) {
        case KVM_STATS_BASE_POW10:
            schema_entry->value->has_base = true;
            schema_entry->value->base = 10;
            break;
        case KVM_STATS_BASE_POW2:
            schema_entry->value->has_base = true;
            schema_entry->value->base = 2;
            break;
        default:
            goto exit;
        }
    }

    schema_entry->value->name = g_strdup(pdesc->name);
    schema_entry->next = list;
    return schema_entry;
exit:
    g_free(schema_entry->value);
    g_free(schema_entry);
    return list;
}

/* Cached stats descriptors */
typedef struct StatsDescriptors {
    const char *ident; /* cache key, currently the StatsTarget */
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header kvm_stats_header;
    QTAILQ_ENTRY(StatsDescriptors) next;
} StatsDescriptors;

static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors =
    QTAILQ_HEAD_INITIALIZER(stats_descriptors);

/*
 * Return the descriptors for 'target', which either have already been read
 * or are retrieved from 'stats_fd'.
 */
static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd,
                                                Error **errp)
{
    StatsDescriptors *descriptors;
    const char *ident;
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    size_t size_desc;
    ssize_t ret;

    ident = StatsTarget_str(target);
    QTAILQ_FOREACH(descriptors, &stats_descriptors, next) {
        if (g_str_equal(descriptors->ident, ident)) {
            return descriptors;
        }
    }

    descriptors = g_new0(StatsDescriptors, 1);

    /* Read stats header */
    kvm_stats_header = &descriptors->kvm_stats_header;
    ret = pread(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header), 0);
    if (ret != sizeof(*kvm_stats_header)) {
        error_setg(errp, "KVM stats: failed to read stats header: "
                   "expected %zu actual %zd",
                   sizeof(*kvm_stats_header), ret);
        g_free(descriptors);
        return NULL;
    }
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Read stats descriptors */
    kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc);
    ret = pread(stats_fd, kvm_stats_desc,
                size_desc * kvm_stats_header->num_desc,
                kvm_stats_header->desc_offset);

    if (ret != size_desc * kvm_stats_header->num_desc) {
        error_setg(errp, "KVM stats: failed to read stats descriptors: "
                   "expected %zu actual %zd",
                   size_desc * kvm_stats_header->num_desc, ret);
        g_free(descriptors);
        g_free(kvm_stats_desc);
        return NULL;
    }
    descriptors->kvm_stats_desc = kvm_stats_desc;
    descriptors->ident = ident;
    QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next);
    return descriptors;
}
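
/*
 * Layout of the binary stats file descriptor, as consumed above (see the
 * kernel's KVM_GET_STATS_FD documentation for the authoritative description;
 * this sketch only restates what the code here relies on):
 *
 *     offset 0:                      struct kvm_stats_header
 *     header.desc_offset:            header.num_desc descriptors, each
 *                                    sizeof(struct kvm_stats_desc) +
 *                                    header.name_size bytes long
 *     header.data_offset + d.offset: d.size u64 values for descriptor d
 */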

static void query_stats(StatsResultList **result, StatsTarget target,
                        strList *names, int stats_fd, CPUState *cpu,
                        Error **errp)
{
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    StatsDescriptors *descriptors;
    g_autofree uint64_t *stats_data = NULL;
    struct kvm_stats_desc *pdesc;
    StatsList *stats_list = NULL;
    size_t size_desc, size_data = 0;
    ssize_t ret;
    int i;

    descriptors = find_stats_descriptors(target, stats_fd, errp);
    if (!descriptors) {
        return;
    }

    kvm_stats_header = &descriptors->kvm_stats_header;
    kvm_stats_desc = descriptors->kvm_stats_desc;
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Tally the total data size; read schema data */
    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        pdesc = (void *)kvm_stats_desc + i * size_desc;
        size_data += pdesc->size * sizeof(*stats_data);
    }

    stats_data = g_malloc0(size_data);
    ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset);

    if (ret != size_data) {
        error_setg(errp, "KVM stats: failed to read data: "
                   "expected %zu actual %zd", size_data, ret);
        return;
    }

    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        uint64_t *stats;
        pdesc = (void *)kvm_stats_desc + i * size_desc;

        /* Add entry to the list */
        stats = (void *)stats_data + pdesc->offset;
        if (!apply_str_list_filter(pdesc->name, names)) {
            continue;
        }
        stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp);
    }

    if (!stats_list) {
        return;
    }

    switch (target) {
    case STATS_TARGET_VM:
        add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list);
        break;
    case STATS_TARGET_VCPU:
        add_stats_entry(result, STATS_PROVIDER_KVM,
                        cpu->parent_obj.canonical_path,
                        stats_list);
        break;
    default:
        g_assert_not_reached();
    }
}

static void query_stats_schema(StatsSchemaList **result, StatsTarget target,
                               int stats_fd, Error **errp)
{
    struct kvm_stats_desc *kvm_stats_desc;
    struct kvm_stats_header *kvm_stats_header;
    StatsDescriptors *descriptors;
    struct kvm_stats_desc *pdesc;
    StatsSchemaValueList *stats_list = NULL;
    size_t size_desc;
    int i;

    descriptors = find_stats_descriptors(target, stats_fd, errp);
    if (!descriptors) {
        return;
    }

    kvm_stats_header = &descriptors->kvm_stats_header;
    kvm_stats_desc = descriptors->kvm_stats_desc;
    size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size;

    /* Convert each descriptor into a schema entry */
    for (i = 0; i < kvm_stats_header->num_desc; ++i) {
        pdesc = (void *)kvm_stats_desc + i * size_desc;
        stats_list = add_kvmschema_entry(pdesc, stats_list, errp);
    }

    add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list);
}

static void query_stats_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    int stats_fd = cpu->kvm_vcpu_stats_fd;
    Error *local_err = NULL;

    if (stats_fd == -1) {
        error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, local_err);
        return;
    }
    query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU,
                kvm_stats_args->names, stats_fd, cpu,
                kvm_stats_args->errp);
}

static void query_stats_schema_vcpu(CPUState *cpu, StatsArgs *kvm_stats_args)
{
    int stats_fd = cpu->kvm_vcpu_stats_fd;
    Error *local_err = NULL;

    if (stats_fd == -1) {
        error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, local_err);
        return;
    }
    query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
                       kvm_stats_args->errp);
}

static void query_stats_cb(StatsResultList **result, StatsTarget target,
                           strList *names, strList *targets, Error **errp)
{
    KVMState *s = kvm_state;
    CPUState *cpu;
    int stats_fd;

    switch (target) {
    case STATS_TARGET_VM:
    {
        stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
        if (stats_fd == -1) {
            error_setg_errno(errp, errno, "KVM stats: ioctl failed");
            return;
        }
        query_stats(result, target, names, stats_fd, NULL, errp);
        close(stats_fd);
        break;
    }
    case STATS_TARGET_VCPU:
    {
        StatsArgs stats_args;
        stats_args.result.stats = result;
        stats_args.names = names;
        stats_args.errp = errp;
        CPU_FOREACH(cpu) {
            if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
                continue;
            }
            query_stats_vcpu(cpu, &stats_args);
        }
        break;
    }
    default:
        break;
    }
}
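
/*
 * Example (illustrative): query_stats_cb() is reached through the generic
 * stats layer, i.e. from a QMP command roughly of the form
 *
 *     { "execute": "query-stats", "arguments": { "target": "vm" } }
 *
 * or via "info stats" in the HMP monitor; the exact QMP arguments are
 * defined in qapi/stats.json, not here.
 */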

void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
{
    StatsArgs stats_args;
    KVMState *s = kvm_state;
    int stats_fd;

    stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
    if (stats_fd == -1) {
        error_setg_errno(errp, errno, "KVM stats: ioctl failed");
        return;
    }
    query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
    close(stats_fd);

    if (first_cpu) {
        stats_args.result.schema = result;
        stats_args.errp = errp;
        query_stats_schema_vcpu(first_cpu, &stats_args);
    }
}

void kvm_mark_guest_state_protected(void)
{
    kvm_state->guest_state_protected = true;
}

int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
{
    int fd;
    struct kvm_create_guest_memfd guest_memfd = {
        .size = size,
        .flags = flags,
    };

    if (!kvm_guest_memfd_supported) {
        error_setg(errp, "KVM does not support guest_memfd");
        return -1;
    }

    fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
    if (fd < 0) {
        error_setg_errno(errp, errno, "Error creating KVM guest_memfd");
        return -1;
    }

    return fd;
}
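
/*
 * Example (illustrative sketch): creating a guest_memfd for private guest
 * memory.  A flags value of 0 is assumed, and mem_size is a placeholder;
 * real callers derive both from the backing RAM block.
 *
 *     Error *local_err = NULL;
 *     int fd = kvm_create_guest_memfd(mem_size, 0, &local_err);
 *     if (fd < 0) {
 *         error_report_err(local_err);
 *     }
 */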