/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <poll.h>

#include <linux/kvm.h>

#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
#include "sysemu/runstate.h"
#include "sysemu/cpus.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/event_notifier.h"
#include "qemu/main-loop.h"
#include "trace.h"
#include "hw/irq.h"
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "sysemu/reset.h"
#include "qemu/guest-random.h"
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"

#include "hw/boards.h"
#include "monitor/stats.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#ifdef PAGE_SIZE
#undef PAGE_SIZE
#endif
#define PAGE_SIZE qemu_real_host_page_size()

#ifndef KVM_GUESTDBG_BLOCKIRQ
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define KVM_MSI_HASHTAB_SIZE    256

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

enum KVMDirtyRingReaperState {
    KVM_DIRTY_RING_REAPER_NONE = 0,
    /* The reaper is sleeping */
    KVM_DIRTY_RING_REAPER_WAIT,
    /* The reaper is reaping for dirty pages */
    KVM_DIRTY_RING_REAPER_REAPING,
};

/*
 * KVM reaper instance, responsible for collecting the KVM dirty bits
 * via the dirty ring.
 */
struct KVMDirtyRingReaper {
    /* The reaper thread */
    QemuThread reaper_thr;
    volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
    volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
};

struct KVMState
{
    AccelState parent_obj;

    int nr_slots;
    int fd;
    int vmfd;
    int coalesced_mmio;
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
    int max_nested_state_len;
    int many_ioeventfds;
    int intx_set_mask;
    int kvm_shadow_mem;
    bool kernel_irqchip_allowed;
    bool kernel_irqchip_required;
    OnOffAuto kernel_irqchip_split;
    bool sync_mmu;
    uint64_t manual_dirty_log_protect;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
    unsigned irq_set_ioctl;
    unsigned int sigmask_len;
    GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    unsigned long *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
    KVMMemoryListener memory_listener;
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* For "info mtree -f" to tell if an MR is registered in KVM */
    int nr_as;
    struct KVMAs {
        KVMMemoryListener *ml;
        AddressSpace *as;
    } *as;
    uint64_t kvm_dirty_ring_bytes;  /* Size of the per-vcpu dirty ring */
    uint32_t kvm_dirty_ring_size;   /* Number of dirty GFNs per ring */
    struct KVMDirtyRingReaper reaper;
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_eventfds_allowed;
bool kvm_irqfds_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
bool kvm_has_guest_debug;
int kvm_sstep_flags;
static bool kvm_immediate_exit;
static hwaddr kvm_max_slot_size = ~0;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_LAST_INFO
};

static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);

struct KVMResampleFd {
    int gsi;
    EventNotifier *resample_event;
    QLIST_ENTRY(KVMResampleFd) node;
};
typedef struct KVMResampleFd KVMResampleFd;

/*
 * Only used with split irqchip where we need to do the resample fd
 * kick for the kernel from userspace.
 */
static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
    QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);

static QemuMutex kml_slots_lock;

#define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
#define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)

static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);

static inline void kvm_resample_fd_remove(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            QLIST_REMOVE(rfd, node);
            g_free(rfd);
            break;
        }
    }
}

static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
{
    KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);

    rfd->gsi = gsi;
    rfd->resample_event = event;

    QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
}

void kvm_resample_fd_notify(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            event_notifier_set(rfd->resample_event);
            trace_kvm_resample_fd_notify(gsi);
            return;
        }
    }
}

int kvm_get_max_memslots(void)
{
    KVMState *s = KVM_STATE(current_accel());

    return s->nr_slots;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    return NULL;
}

bool kvm_has_free_slot(MachineState *ms)
{
    KVMState *s = KVM_STATE(ms->accelerator);
    bool result;
    KVMMemoryListener *kml = &s->memory_listener;

    kvm_slots_lock();
    result = !!kvm_get_free_slot(kml);
    kvm_slots_unlock();

    return result;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                         hwaddr start_addr,
                                         hwaddr size)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (start_addr == mem->start_addr && size == mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary. */
    aligned = ROUND_UP(section->offset_within_address_space,
                       qemu_real_host_page_size());
    delta = aligned - section->offset_within_address_space;
    *start = aligned;
    if (delta > size) {
        return 0;
    }

    return (size - delta) & qemu_real_host_page_mask();
}

int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int i, ret = 0;

    kvm_slots_lock();
    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            ret = 1;
            break;
        }
    }
    kvm_slots_unlock();

    return ret;
}

static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;

    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;
        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
        if (ret < 0) {
            goto err;
        }
    }
    mem.memory_size = slot->memory_size;
    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    slot->old_flags = mem.flags;
err:
    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
                              mem.memory_size, mem.userspace_addr, ret);
    if (ret < 0) {
        error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
                     " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
                     __func__, mem.slot, slot->start_addr,
                     (uint64_t)mem.memory_size, strerror(errno));
    }
    return ret;
}

static int do_kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    struct KVMParkedVcpu *vcpu = NULL;
    int ret = 0;

    DPRINTF("kvm_destroy_vcpu\n");

    ret = kvm_arch_destroy_vcpu(cpu);
    if (ret < 0) {
        goto err;
    }

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    if (cpu->kvm_dirty_gfns) {
        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
        if (ret < 0) {
            goto err;
        }
    }

    vcpu = g_malloc0(sizeof(*vcpu));
    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
    vcpu->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
err:
    return ret;
}

void kvm_destroy_vcpu(CPUState *cpu)
{
    if (do_kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *cpu;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        if (cpu->vcpu_id == vcpu_id) {
            int kvm_fd;

            QLIST_REMOVE(cpu, node);
            kvm_fd = cpu->kvm_fd;
            g_free(cpu);
            return kvm_fd;
        }
    }

    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}

int kvm_init_vcpu(CPUState *cpu, Error **errp)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));

    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
    if (ret < 0) {
        error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    cpu->kvm_fd = ret;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;
    cpu->dirty_pages = 0;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        error_setg_errno(errp, -mmap_size,
                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
        goto err;
    }

    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        ret = -errno;
        error_setg_errno(errp, ret,
                         "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    if (s->kvm_dirty_ring_size) {
        /* Use MAP_SHARED to share pages with the kernel */
        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
                                   PROT_READ | PROT_WRITE, MAP_SHARED,
                                   cpu->kvm_fd,
                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
            ret = -errno;
            DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
            goto err;
        }
    }

    ret = kvm_arch_init_vcpu(cpu);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
                         kvm_arch_vcpu_id(cpu));
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */

static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    return flags;
}

/* Called with KVMMemoryListener.slots_lock held */
static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    kvm_slot_init_dirty_bitmap(mem);
    return kvm_set_user_memory_region(kml, mem, false);
}

static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size, slot_size;
    KVMSlot *mem;
    int ret = 0;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    kvm_slots_lock();

    while (size && !ret) {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            goto out;
        }

        ret = kvm_slot_update_flags(kml, mem, section->mr);
        start_addr += slot_size;
        size -= slot_size;
    }

out:
    kvm_slots_unlock();
    return ret;
}

static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

/* get kvm's dirty pages bitmap and update qemu's */
static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
{
    ram_addr_t start = slot->ram_start_offset;
    ram_addr_t pages = slot->memory_size / qemu_real_host_page_size();

    cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
}

static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
{
    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/* Allocate the dirty bitmap for a slot */
static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
{
    if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
        return;
    }

    /*
     * XXX bad kernel interface alert
     * For dirty bitmap, kernel allocates array of size aligned to
     * bits-per-long.  But for case when the kernel is 64bits and
     * the userspace is 32bits, userspace can't align to the same
     * bits-per-long, since sizeof(long) is different between kernel
     * and user space.  This way, userspace will provide buffer which
     * may be 4 bytes less than the kernel will use, resulting in
     * userspace memory corruption (which is not detectable by valgrind
     * too, in most cases).
     * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
     * a hope that sizeof(long) won't become >8 any time soon.
     *
     * Note: the granule of kvm dirty log is qemu_real_host_page_size.
     * And mem->memory_size is aligned to it (otherwise this mem can't
     * be registered to KVM).
     */
    hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(),
                               /*HOST_LONG_BITS*/ 64) / 8;
    mem->dirty_bmap = g_malloc0(bitmap_size);
    mem->dirty_bmap_size = bitmap_size;
}

/*
 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
 * succeeded, false otherwise
 */
static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
{
    struct kvm_dirty_log d = {};
    int ret;

    d.dirty_bitmap = slot->dirty_bmap;
    d.slot = slot->slot | (slot->as_id << 16);
    ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);

    if (ret == -ENOENT) {
        /* kernel does not have dirty bitmap in this slot */
        ret = 0;
    }
    if (ret) {
        error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
                          __func__, ret);
    }
    return ret == 0;
}

/* Should be with all slots_lock held for the address spaces. */
static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
                                     uint32_t slot_id, uint64_t offset)
{
    KVMMemoryListener *kml;
    KVMSlot *mem;

    if (as_id >= s->nr_as) {
        return;
    }

    kml = s->as[as_id].ml;
    mem = &kml->slots[slot_id];

    if (!mem->memory_size || offset >=
        (mem->memory_size / qemu_real_host_page_size())) {
        return;
    }

    set_bit(offset, mem->dirty_bmap);
}

static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
{
    return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
}

static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
{
    gfn->flags = KVM_DIRTY_GFN_F_RESET;
}

/*
 * Should be with all slots_lock held for the address spaces.  It returns
 * the number of dirty pages we've collected from this dirty ring.
 */
static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
{
    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
    uint32_t ring_size = s->kvm_dirty_ring_size;
    uint32_t count = 0, fetch = cpu->kvm_fetch_index;

    assert(dirty_gfns && ring_size);
    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);

    while (true) {
        cur = &dirty_gfns[fetch % ring_size];
        if (!dirty_gfn_is_dirtied(cur)) {
            break;
        }
        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
                                 cur->offset);
        dirty_gfn_set_collected(cur);
        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
        fetch++;
        count++;
    }
    cpu->kvm_fetch_index = fetch;
    cpu->dirty_pages += count;

    return count;
}

/* Must be with slots_lock held */
static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState* cpu)
{
    int ret;
    uint64_t total = 0;
    int64_t stamp;

    stamp = get_clock();

    if (cpu) {
        total = kvm_dirty_ring_reap_one(s, cpu);
    } else {
        CPU_FOREACH(cpu) {
            total += kvm_dirty_ring_reap_one(s, cpu);
        }
    }

    if (total) {
        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
        assert(ret == total);
    }

    stamp = get_clock() - stamp;

    if (total) {
        trace_kvm_dirty_ring_reap(total, stamp / 1000);
    }

    return total;
}

/*
 * Currently for simplicity, we must hold BQL before calling this.  We can
 * consider dropping the BQL once we're clear about all the race conditions.
 */
static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
{
    uint64_t total;

    /*
     * We need to lock all kvm slots for all address spaces here,
     * because:
     *
     * (1) We need to mark dirty for dirty bitmaps in multiple slots
     *     and for tons of pages, so it's better to take the lock here
     *     once rather than once per page.  And more importantly,
     *
     * (2) We must _NOT_ publish dirty bits to the other threads
     *     (e.g., the migration thread) via the kvm memory slot dirty
     *     bitmaps before correctly re-protect those dirtied pages.
     *     Otherwise we can have potential risk of data corruption if
     *     the page data is read in the other thread before we do
     *     reset below.
     */
    kvm_slots_lock();
    total = kvm_dirty_ring_reap_locked(s, cpu);
    kvm_slots_unlock();

    return total;
}

static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
{
    /* No need to do anything */
}

/*
 * Kick all vcpus out in a synchronized way.  When returned, we
 * guarantee that every vcpu has been kicked and at least returned to
 * userspace once.
 */
static void kvm_cpu_synchronize_kick_all(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
    }
}

/*
 * Flush all the existing dirty pages to the KVM slot buffers.  When
 * this call returns, we guarantee that all the touched dirty pages
 * before calling this function have been put into the per-kvmslot
 * dirty bitmap.
 *
 * This function must be called with BQL held.
 */
static void kvm_dirty_ring_flush(void)
{
    trace_kvm_dirty_ring_flush(0);
    /*
     * The function needs to be serialized.  Since this function
     * should always be with BQL held, serialization is guaranteed.
     * However, let's be sure of it.
     */
    assert(qemu_mutex_iothread_locked());
    /*
     * First make sure to flush the hardware buffers by kicking all
     * vcpus out in a synchronous way.
     */
    kvm_cpu_synchronize_kick_all();
    kvm_dirty_ring_reap(kvm_state, NULL);
    trace_kvm_dirty_ring_flush(1);
}

/**
 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
 *
 * This function will first try to fetch the dirty bitmap from the kernel,
 * and then update qemu's dirty bitmap.
 *
 * NOTE: caller must be with kml->slots_lock held.
 *
 * @kml: the KVM memory listener object
 * @section: the memory section to sync the dirty bitmap with
 */
static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                           MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    hwaddr start_addr, size;
    hwaddr slot_size;

    size = kvm_align_section(section, &start_addr);
    while (size) {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            return;
        }
        if (kvm_slot_get_dirty_log(s, mem)) {
            kvm_slot_sync_dirty_pages(mem);
        }
        start_addr += slot_size;
        size -= slot_size;
    }
}

/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
#define KVM_CLEAR_LOG_SHIFT  6
#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT)
#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)

static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
                                  uint64_t size)
{
    KVMState *s = kvm_state;
    uint64_t end, bmap_start, start_delta, bmap_npages;
    struct kvm_clear_dirty_log d;
    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size();
    int ret;

    /*
     * We need to extend either the start or the size or both to
     * satisfy the KVM interface requirement.  Firstly, do the start
     * page alignment on 64 host pages
     */
    bmap_start = start & KVM_CLEAR_LOG_MASK;
    start_delta = start - bmap_start;
    bmap_start /= psize;

    /*
     * The kernel interface has restriction on the size too, that either:
     *
     * (1) the size is 64 host pages aligned (just like the start), or
     * (2) the size fills up until the end of the KVM memslot.
     */
    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
        << KVM_CLEAR_LOG_SHIFT;
    end = mem->memory_size / psize;
    if (bmap_npages > end - bmap_start) {
        bmap_npages = end - bmap_start;
    }
    start_delta /= psize;

    /*
     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
     * that we won't clear any unknown dirty bits otherwise we might
     * accidentally clear some set bits which are not yet synced from
     * the kernel into QEMU's bitmap, then we'll lose track of the
     * guest modifications upon those pages (which can directly lead
     * to guest data loss or panic after migration).
     *
     * Layout of the KVMSlot.dirty_bmap:
     *
     *                   |<-------- bmap_npages -----------..>|
     *                                                     [1]
     *                     start_delta         size
     *  |----------------|-------------|------------------|------------|
     *  ^                ^             ^                               ^
     *  |                |             |                               |
     * start          bmap_start     (start)                         end
     * of memslot                                                  of memslot
     *
     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
     */

    assert(bmap_start % BITS_PER_LONG == 0);
    /* We should never do log_clear before log_sync */
    assert(mem->dirty_bmap);
    if (start_delta || bmap_npages - size / psize) {
        /* Slow path - we need to manipulate a temp bitmap */
        bmap_clear = bitmap_new(bmap_npages);
        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
                                    bmap_start, start_delta + size / psize);
        /*
         * We need to fill the holes at start because that was not
         * specified by the caller and we extended the bitmap only for
         * 64 pages alignment
         */
        bitmap_clear(bmap_clear, 0, start_delta);
        d.dirty_bitmap = bmap_clear;
    } else {
        /*
         * Fast path - both start and size align well with BITS_PER_LONG
         * (or the end of memory slot)
         */
        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
    }

    d.first_page = bmap_start;
    /* It should never overflow.  If it happens, say something */
    assert(bmap_npages <= UINT32_MAX);
    d.num_pages = bmap_npages;
    d.slot = mem->slot | (as_id << 16);

    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
    if (ret < 0 && ret != -ENOENT) {
        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
                     __func__, d.slot, (uint64_t)d.first_page,
                     (uint32_t)d.num_pages, ret);
    } else {
        ret = 0;
        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
    }

    /*
     * After we have updated the remote dirty bitmap, we update the
     * cached bitmap as well for the memslot, then if another user
     * clears the same region we know we shouldn't clear it again on
     * the remote otherwise it's data loss as well.
     */
    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
                 size / psize);
    /* This handles the NULL case well */
    g_free(bmap_clear);
    return ret;
}


/**
 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
 *
 * NOTE: this will be a no-op if we haven't enabled manual dirty log
 * protection in the host kernel because in that case this operation
 * will be done within log_sync().
 *
 * @kml: the kvm memory listener
 * @section: the memory range to clear dirty bitmap
 */
static int kvm_physical_log_clear(KVMMemoryListener *kml,
                                  MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    uint64_t start, size, offset, count;
    KVMSlot *mem;
    int ret = 0, i;

    if (!s->manual_dirty_log_protect) {
        /* No need to do explicit clear */
        return ret;
    }

    start = section->offset_within_address_space;
    size = int128_get64(section->size);

    if (!size) {
        /* Nothing more we can do... */
        return ret;
    }

    kvm_slots_lock();

    for (i = 0; i < s->nr_slots; i++) {
        mem = &kml->slots[i];
        /* Discard slots that are empty or do not overlap the section */
        if (!mem->memory_size ||
            mem->start_addr > start + size - 1 ||
            start > mem->start_addr + mem->memory_size - 1) {
            continue;
        }

        if (start >= mem->start_addr) {
            /* The slot starts before section or is aligned to it. */
            offset = start - mem->start_addr;
            count = MIN(mem->memory_size - offset, size);
        } else {
            /* The slot starts after section. */
            offset = 0;
            count = MIN(mem->memory_size, size - (mem->start_addr - start));
        }
        ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
        if (ret < 0) {
            break;
        }
    }

    kvm_slots_unlock();

    return ret;
}

static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *section,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *section,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static MemoryListener kvm_coalesced_pio_listener = {
    .name = "kvm-coalesced-pio",
    .coalesced_io_add = kvm_coalesce_pio_add,
    .coalesced_io_del = kvm_coalesce_pio_del,
};

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}

typedef struct HWPoisonPage {
    ram_addr_t ram_addr;
    QLIST_ENTRY(HWPoisonPage) list;
} HWPoisonPage;

static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
    QLIST_HEAD_INITIALIZER(hwpoison_page_list);

static void kvm_unpoison_all(void *param)
{
    HWPoisonPage *page, *next_page;

    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
        QLIST_REMOVE(page, list);
        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
        g_free(page);
    }
}

void kvm_hwpoison_page_add(ram_addr_t ram_addr)
{
    HWPoisonPage *page;

    QLIST_FOREACH(page, &hwpoison_page_list, list) {
        if (page->ram_addr == ram_addr) {
            return;
        }
    }
    page = g_new(HWPoisonPage, 1);
    page->ram_addr = ram_addr;
    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
}

static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN
    /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    switch (size) {
    case 2:
        val = bswap16(val);
        break;
    case 4:
        val = bswap32(val);
        break;
    }
#endif
    return val;
}

static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    int ret;
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = 0,
        .fd = fd,
    };

    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
                                 datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (datamatch) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
}

static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;
    trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}


static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD)
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

void kvm_set_max_memslot_size(hwaddr max_slot_size)
{
    g_assert(
        ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size
    );
    kvm_max_slot_size = max_slot_size;
}

static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    int err;
    MemoryRegion *mr = section->mr;
    bool writable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size, slot_size, mr_offset;
    ram_addr_t ram_start_offset;
    void *ram;

    if (!memory_region_is_ram(mr)) {
        if (writable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
    }

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return;
    }

    /* The offset of the kvmslot within the memory region */
    mr_offset = section->offset_within_region + start_addr -
        section->offset_within_address_space;

    /* use aligned delta to align the ram address and offset */
    ram = memory_region_get_ram_ptr(mr) + mr_offset;
    ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;

    kvm_slots_lock();

    if (!add) {
        do {
            slot_size = MIN(kvm_max_slot_size, size);
            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
            if (!mem) {
                goto out;
            }
            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                /*
                 * NOTE: We should be aware of the fact that here we're only
                 * doing a best effort to sync dirty bits.  No matter whether
                 * we're using dirty log or dirty ring, we ignored two facts:
                 *
                 * (1) dirty bits can reside in hardware buffers (PML)
                 *
                 * (2) after we collected dirty bits here, pages can be dirtied
                 * again before we do the final KVM_SET_USER_MEMORY_REGION to
                 * remove the slot.
                 *
                 * Not easy.  Let's cross the fingers until it's fixed.
                 */
                if (kvm_state->kvm_dirty_ring_size) {
                    kvm_dirty_ring_reap_locked(kvm_state, NULL);
                } else {
                    kvm_slot_get_dirty_log(kvm_state, mem);
                }
                kvm_slot_sync_dirty_pages(mem);
            }

            /* unregister the slot */
            g_free(mem->dirty_bmap);
            mem->dirty_bmap = NULL;
            mem->memory_size = 0;
            mem->flags = 0;
            err = kvm_set_user_memory_region(kml, mem, false);
            if (err) {
                fprintf(stderr, "%s: error unregistering slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
            start_addr += slot_size;
            size -= slot_size;
        } while (size);
        goto out;
    }

    /* register the new slot */
    do {
        slot_size = MIN(kvm_max_slot_size, size);
        mem = kvm_alloc_slot(kml);
        mem->as_id = kml->as_id;
        mem->memory_size = slot_size;
        mem->start_addr = start_addr;
        mem->ram_start_offset = ram_start_offset;
        mem->ram = ram;
        mem->flags = kvm_mem_flags(mr);
        kvm_slot_init_dirty_bitmap(mem);
        err = kvm_set_user_memory_region(kml, mem, true);
        if (err) {
            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                    strerror(-err));
            abort();
        }
        start_addr += slot_size;
        ram_start_offset += slot_size;
        ram += slot_size;
        size -= slot_size;
    } while (size);

out:
    kvm_slots_unlock();
}

static void *kvm_dirty_ring_reaper_thread(void *data)
{
    KVMState *s = data;
    struct KVMDirtyRingReaper *r = &s->reaper;

    rcu_register_thread();

    trace_kvm_dirty_ring_reaper("init");

    while (true) {
        r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
        trace_kvm_dirty_ring_reaper("wait");
        /*
         * TODO: provide a smarter timeout rather than a constant?
         */
        sleep(1);

        trace_kvm_dirty_ring_reaper("wakeup");
        r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;

        qemu_mutex_lock_iothread();
        kvm_dirty_ring_reap(s, NULL);
        qemu_mutex_unlock_iothread();

        r->reaper_iteration++;
    }

    trace_kvm_dirty_ring_reaper("exit");

    rcu_unregister_thread();

    return NULL;
}

static int kvm_dirty_ring_reaper_init(KVMState *s)
{
    struct KVMDirtyRingReaper *r = &s->reaper;

    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
                       kvm_dirty_ring_reaper_thread,
                       s, QEMU_THREAD_JOINABLE);

    return 0;
}

static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    memory_region_ref(section->mr);
    kvm_set_phys_mem(kml, section, true);
}

static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    kvm_set_phys_mem(kml, section, false);
    memory_region_unref(section->mr);
}

static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    kvm_slots_lock();
    kvm_physical_sync_dirty_bitmap(kml, section);
    kvm_slots_unlock();
}

static void kvm_log_sync_global(MemoryListener *l)
{
    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i;

    /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
    kvm_dirty_ring_flush();

    /*
     * TODO: make this faster when nr_slots is big while there are
     * only a few used slots (small VMs).
     */
    kvm_slots_lock();
    for (i = 0; i < s->nr_slots; i++) {
        mem = &kml->slots[i];
        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
            kvm_slot_sync_dirty_pages(mem);
            /*
             * This is not needed by KVM_GET_DIRTY_LOG because the
             * ioctl will unconditionally overwrite the whole region.
             * However kvm dirty ring has no such side effect.
             */
            kvm_slot_reset_dirty_pages(mem);
        }
    }
    kvm_slots_unlock();
}

static void kvm_log_clear(MemoryListener *listener,
                          MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    r = kvm_physical_log_clear(kml, section);
    if (r < 0) {
        error_report_once("%s: kvm log clear failed: mr=%s "
                          "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
                          section->mr->name, section->offset_within_region,
                          int128_get64(section->size));
        abort();
    }
}

static void kvm_mem_ioeventfd_add(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, true, int128_get64(section->size),
                               match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
        abort();
    }
}

static void kvm_mem_ioeventfd_del(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, false, int128_get64(section->size),
                               match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
        abort();
    }
}

static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, true, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
        abort();
    }
}

static void kvm_io_ioeventfd_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)

{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, false, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
                __func__, strerror(-r), -r);
        abort();
    }
}

void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id, const char *name)
{
    int i;

    kml->slots = g_new0(KVMSlot, s->nr_slots);
    kml->as_id = as_id;

    for (i = 0; i < s->nr_slots; i++) {
        kml->slots[i].slot = i;
    }

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.priority = 10;
    kml->listener.name = name;

    if (s->kvm_dirty_ring_size) {
        kml->listener.log_sync_global = kvm_log_sync_global;
    } else {
        kml->listener.log_sync = kvm_log_sync;
        kml->listener.log_clear = kvm_log_clear;
    }

    memory_listener_register(&kml->listener, as);

    for (i = 0; i < s->nr_as; ++i) {
        if (!s->as[i].as) {
            s->as[i].as = as;
            s->as[i].ml = kml;
            break;
        }
    }
}

static MemoryListener kvm_io_listener = {
    .name = "kvm-io",
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};

int kvm_set_irq(KVMState *s, int irq, int level)
{
    struct kvm_irq_level event;
    int ret;

    assert(kvm_async_interrupts_enabled());

    event.level = level;
    event.irq = irq;
    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
    if (ret < 0) {
        perror("kvm_set_irq");
        abort();
    }

    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
}

#ifdef KVM_CAP_IRQ_ROUTING
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;
    QTAILQ_ENTRY(KVMMSIRoute) entry;
} KVMMSIRoute;

static void set_gsi(KVMState *s, unsigned int gsi)
{
    set_bit(gsi, s->used_gsi_bitmap);
}

static void clear_gsi(KVMState *s, unsigned int gsi)
{
    clear_bit(gsi, s->used_gsi_bitmap);
}

void kvm_init_irq_routing(KVMState *s)
{
    int gsi_count, i;

    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
    if (gsi_count > 0) {
        /* Round up so we can search ints using ffs */
        s->used_gsi_bitmap = bitmap_new(gsi_count);
        s->gsi_count = gsi_count;
    }

    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
    s->nr_allocated_irq_routes = 0;

    if (!kvm_direct_msi_allowed) {
        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
            QTAILQ_INIT(&s->msi_hashtab[i]);
        }
    }

    kvm_arch_init_irq_routing(s);
}

void kvm_irqchip_commit_routes(KVMState *s)
{
    int ret;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    if (!kvm_gsi_routing_enabled()) {
        return;
    }

    s->irq_routes->flags = 0;
    trace_kvm_irqchip_commit_routes();
    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
    assert(ret == 0);
}

static void kvm_add_routing_entry(KVMState *s,
                                  struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        n = s->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        s->irq_routes = g_realloc(s->irq_routes, size);
        s->nr_allocated_irq_routes = n;
    }
    n = s->irq_routes->nr++;
    new = &s->irq_routes->entries[n];

    *new = *entry;

    set_gsi(s, entry->gsi);
}

static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    struct kvm_irq_routing_entry *entry;
    int n;

    for (n = 0; n < s->irq_routes->nr; n++) {
        entry = &s->irq_routes->entries[n];
        if (entry->gsi != new_entry->gsi) {
            continue;
        }

        if (!memcmp(entry, new_entry, sizeof *entry)) {
            return 0;
        }

        *entry = *new_entry;

        return 0;
    }

    return -ESRCH;
}

void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
{
    struct kvm_irq_routing_entry e = {};

    assert(pin < s->gsi_count);

    e.gsi = irq;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    kvm_add_routing_entry(s, &e);
}

void kvm_irqchip_release_virq(KVMState *s, int virq)
{
    struct kvm_irq_routing_entry *e;
    int i;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    for (i = 0; i < s->irq_routes->nr; i++) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
        }
    }
    clear_gsi(s, virq);
    kvm_arch_release_virq_post(virq);
    trace_kvm_irqchip_release_virq(virq);
}

void kvm_irqchip_add_change_notifier(Notifier *n)
{
    notifier_list_add(&kvm_irqchip_change_notifiers, n);
}

void kvm_irqchip_remove_change_notifier(Notifier *n)
{
    notifier_remove(n);
}

void kvm_irqchip_change_notify(void)
{
    notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
}

static unsigned int kvm_hash_msi(uint32_t data)
{
    /* This is optimized for IA32 MSI layout. However, no other arch shall
     * repeat the mistake of not providing a direct MSI injection API. */
    return data & 0xff;
}

static void kvm_flush_dynamic_msi_routes(KVMState *s)
{
    KVMMSIRoute *route, *next;
    unsigned int hash;

    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
            kvm_irqchip_release_virq(s, route->kroute.gsi);
            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
            g_free(route);
        }
    }
}

static int kvm_irqchip_get_virq(KVMState *s)
{
    int next_virq;

    /*
     * PIC and IOAPIC share the first 16 GSI numbers, thus the available
     * GSI numbers are more than the number of IRQ route. Allocating a GSI
     * number can succeed even though a new route entry cannot be added.
     * When this happens, flush dynamic MSI entries to free IRQ route entries.
     */
    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
        kvm_flush_dynamic_msi_routes(s);
    }

    /* Return the lowest unused GSI in the bitmap */
    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
    if (next_virq >= s->gsi_count) {
        return -ENOSPC;
    } else {
        return next_virq;
    }
}

static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
{
    unsigned int hash = kvm_hash_msi(msg.data);
    KVMMSIRoute *route;

    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
            return route;
        }
    }
    return NULL;
}

int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    struct kvm_msi msi;
    KVMMSIRoute *route;

    if (kvm_direct_msi_allowed) {
        msi.address_lo = (uint32_t)msg.address;
        msi.address_hi = msg.address >> 32;
        msi.data = le32_to_cpu(msg.data);
        msi.flags = 0;
        memset(msi.pad, 0, sizeof(msi.pad));

        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
    }

    route = kvm_lookup_msi_route(s, msg);
    if (!route) {
        int virq;

        virq = kvm_irqchip_get_virq(s);
        if (virq < 0) {
            return virq;
        }

        route = g_new0(KVMMSIRoute, 1);
        route->kroute.gsi = virq;
        route->kroute.type = KVM_IRQ_ROUTING_MSI;
        route->kroute.flags = 0;
        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
        route->kroute.u.msi.address_hi = msg.address >> 32;
        route->kroute.u.msi.data = le32_to_cpu(msg.data);

        kvm_add_routing_entry(s, &route->kroute);
        kvm_irqchip_commit_routes(s);

        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
                           entry);
    }

    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);

    return kvm_set_irq(s, route->kroute.gsi, 1);
}

int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;
    KVMState *s = c->s;
    MSIMessage msg = {0, 0};

    if (pci_available && dev) {
        msg = pci_get_msi_message(dev, vector);
    }

    if (kvm_gsi_direct_mapping()) {
        return kvm_arch_msi_data_to_gsi(msg.data);
    }

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        kvm_irqchip_release_virq(s, virq);
        return -EINVAL;
    }

    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
                                    vector, virq);

    kvm_add_routing_entry(s, &kroute);
    kvm_arch_add_msi_route_post(&kroute, vector, dev);
    c->changes++;

    return virq;
}

int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};

    if (kvm_gsi_direct_mapping()) {
        return 0;
    }

    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        return -EINVAL;
    }

    trace_kvm_irqchip_update_msi_route(virq);

    return kvm_update_routing_entry(s, &kroute);
}

static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
                                    EventNotifier *resample, int virq,
                                    bool assign)
{
    int fd = event_notifier_get_fd(event);
    int rfd = resample ? event_notifier_get_fd(resample) : -1;

    struct kvm_irqfd irqfd = {
        .fd = fd,
        .gsi = virq,
        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (rfd != -1) {
        assert(assign);
        if (kvm_irqchip_is_split()) {
            /*
             * When the slow irqchip (e.g. IOAPIC) is in the
             * userspace, KVM kernel resamplefd will not work because
             * the EOI of the interrupt will be delivered to userspace
             * instead, so the KVM kernel resamplefd kick will be
             * skipped.  The userspace here mimics what the kernel
             * provides with resamplefd, remember the resamplefd and
             * kick it when we receive EOI of this IRQ.
             *
             * This is hackery because IOAPIC is mostly bypassed
             * (except EOI broadcasts) when irqfd is used.  However
             * this can bring much performance back for split irqchip
             * with INTx IRQs (for VFIO, this gives 93% perf of the
             * full fast path, which is 46% perf boost comparing to
             * the INTx slow path).
2080 */ 2081 kvm_resample_fd_insert(virq, resample); 2082 } else { 2083 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 2084 irqfd.resamplefd = rfd; 2085 } 2086 } else if (!assign) { 2087 if (kvm_irqchip_is_split()) { 2088 kvm_resample_fd_remove(virq); 2089 } 2090 } 2091 2092 if (!kvm_irqfds_enabled()) { 2093 return -ENOSYS; 2094 } 2095 2096 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 2097 } 2098 2099 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 2100 { 2101 struct kvm_irq_routing_entry kroute = {}; 2102 int virq; 2103 2104 if (!kvm_gsi_routing_enabled()) { 2105 return -ENOSYS; 2106 } 2107 2108 virq = kvm_irqchip_get_virq(s); 2109 if (virq < 0) { 2110 return virq; 2111 } 2112 2113 kroute.gsi = virq; 2114 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 2115 kroute.flags = 0; 2116 kroute.u.adapter.summary_addr = adapter->summary_addr; 2117 kroute.u.adapter.ind_addr = adapter->ind_addr; 2118 kroute.u.adapter.summary_offset = adapter->summary_offset; 2119 kroute.u.adapter.ind_offset = adapter->ind_offset; 2120 kroute.u.adapter.adapter_id = adapter->adapter_id; 2121 2122 kvm_add_routing_entry(s, &kroute); 2123 2124 return virq; 2125 } 2126 2127 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 2128 { 2129 struct kvm_irq_routing_entry kroute = {}; 2130 int virq; 2131 2132 if (!kvm_gsi_routing_enabled()) { 2133 return -ENOSYS; 2134 } 2135 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 2136 return -ENOSYS; 2137 } 2138 virq = kvm_irqchip_get_virq(s); 2139 if (virq < 0) { 2140 return virq; 2141 } 2142 2143 kroute.gsi = virq; 2144 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 2145 kroute.flags = 0; 2146 kroute.u.hv_sint.vcpu = vcpu; 2147 kroute.u.hv_sint.sint = sint; 2148 2149 kvm_add_routing_entry(s, &kroute); 2150 kvm_irqchip_commit_routes(s); 2151 2152 return virq; 2153 } 2154 2155 #else /* !KVM_CAP_IRQ_ROUTING */ 2156 2157 void kvm_init_irq_routing(KVMState *s) 2158 { 2159 } 2160 2161 void kvm_irqchip_release_virq(KVMState *s, int virq) 2162 { 2163 } 2164 2165 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 2166 { 2167 abort(); 2168 } 2169 2170 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) 2171 { 2172 return -ENOSYS; 2173 } 2174 2175 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 2176 { 2177 return -ENOSYS; 2178 } 2179 2180 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 2181 { 2182 return -ENOSYS; 2183 } 2184 2185 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, 2186 EventNotifier *resample, int virq, 2187 bool assign) 2188 { 2189 abort(); 2190 } 2191 2192 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 2193 { 2194 return -ENOSYS; 2195 } 2196 #endif /* !KVM_CAP_IRQ_ROUTING */ 2197 2198 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 2199 EventNotifier *rn, int virq) 2200 { 2201 return kvm_irqchip_assign_irqfd(s, n, rn, virq, true); 2202 } 2203 2204 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 2205 int virq) 2206 { 2207 return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false); 2208 } 2209 2210 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 2211 EventNotifier *rn, qemu_irq irq) 2212 { 2213 gpointer key, gsi; 2214 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 2215 2216 if (!found) { 2217 return -ENXIO; 2218 } 2219 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 2220 } 2221 2222 int 
kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 2223 qemu_irq irq) 2224 { 2225 gpointer key, gsi; 2226 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 2227 2228 if (!found) { 2229 return -ENXIO; 2230 } 2231 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 2232 } 2233 2234 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 2235 { 2236 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 2237 } 2238 2239 static void kvm_irqchip_create(KVMState *s) 2240 { 2241 int ret; 2242 2243 assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO); 2244 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 2245 ; 2246 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 2247 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 2248 if (ret < 0) { 2249 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 2250 exit(1); 2251 } 2252 } else { 2253 return; 2254 } 2255 2256 /* First probe and see if there's a arch-specific hook to create the 2257 * in-kernel irqchip for us */ 2258 ret = kvm_arch_irqchip_create(s); 2259 if (ret == 0) { 2260 if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) { 2261 perror("Split IRQ chip mode not supported."); 2262 exit(1); 2263 } else { 2264 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 2265 } 2266 } 2267 if (ret < 0) { 2268 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 2269 exit(1); 2270 } 2271 2272 kvm_kernel_irqchip = true; 2273 /* If we have an in-kernel IRQ chip then we must have asynchronous 2274 * interrupt delivery (though the reverse is not necessarily true) 2275 */ 2276 kvm_async_interrupts_allowed = true; 2277 kvm_halt_in_kernel_allowed = true; 2278 2279 kvm_init_irq_routing(s); 2280 2281 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 2282 } 2283 2284 /* Find number of supported CPUs using the recommended 2285 * procedure from the kernel API documentation to cope with 2286 * older kernels that may be missing capabilities. 2287 */ 2288 static int kvm_recommended_vcpus(KVMState *s) 2289 { 2290 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 2291 return (ret) ? ret : 4; 2292 } 2293 2294 static int kvm_max_vcpus(KVMState *s) 2295 { 2296 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 2297 return (ret) ? ret : kvm_recommended_vcpus(s); 2298 } 2299 2300 static int kvm_max_vcpu_id(KVMState *s) 2301 { 2302 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 2303 return (ret) ? ret : kvm_max_vcpus(s); 2304 } 2305 2306 bool kvm_vcpu_id_is_valid(int vcpu_id) 2307 { 2308 KVMState *s = KVM_STATE(current_accel()); 2309 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 2310 } 2311 2312 bool kvm_dirty_ring_enabled(void) 2313 { 2314 return kvm_state->kvm_dirty_ring_size ? 
true : false; 2315 } 2316 2317 static void query_stats_cb(StatsResultList **result, StatsTarget target, 2318 strList *names, strList *targets, Error **errp); 2319 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp); 2320 2321 uint32_t kvm_dirty_ring_size(void) 2322 { 2323 return kvm_state->kvm_dirty_ring_size; 2324 } 2325 2326 static int kvm_init(MachineState *ms) 2327 { 2328 MachineClass *mc = MACHINE_GET_CLASS(ms); 2329 static const char upgrade_note[] = 2330 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 2331 "(see http://sourceforge.net/projects/kvm).\n"; 2332 struct { 2333 const char *name; 2334 int num; 2335 } num_cpus[] = { 2336 { "SMP", ms->smp.cpus }, 2337 { "hotpluggable", ms->smp.max_cpus }, 2338 { NULL, } 2339 }, *nc = num_cpus; 2340 int soft_vcpus_limit, hard_vcpus_limit; 2341 KVMState *s; 2342 const KVMCapabilityInfo *missing_cap; 2343 int ret; 2344 int type = 0; 2345 uint64_t dirty_log_manual_caps; 2346 2347 qemu_mutex_init(&kml_slots_lock); 2348 2349 s = KVM_STATE(ms->accelerator); 2350 2351 /* 2352 * On systems where the kernel can support different base page 2353 * sizes, host page size may be different from TARGET_PAGE_SIZE, 2354 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 2355 * page size for the system though. 2356 */ 2357 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size()); 2358 2359 s->sigmask_len = 8; 2360 2361 #ifdef KVM_CAP_SET_GUEST_DEBUG 2362 QTAILQ_INIT(&s->kvm_sw_breakpoints); 2363 #endif 2364 QLIST_INIT(&s->kvm_parked_vcpus); 2365 s->fd = qemu_open_old("/dev/kvm", O_RDWR); 2366 if (s->fd == -1) { 2367 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 2368 ret = -errno; 2369 goto err; 2370 } 2371 2372 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 2373 if (ret < KVM_API_VERSION) { 2374 if (ret >= 0) { 2375 ret = -EINVAL; 2376 } 2377 fprintf(stderr, "kvm version too old\n"); 2378 goto err; 2379 } 2380 2381 if (ret > KVM_API_VERSION) { 2382 ret = -EINVAL; 2383 fprintf(stderr, "kvm version not supported\n"); 2384 goto err; 2385 } 2386 2387 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 2388 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 2389 2390 /* If unspecified, use the default value */ 2391 if (!s->nr_slots) { 2392 s->nr_slots = 32; 2393 } 2394 2395 s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); 2396 if (s->nr_as <= 1) { 2397 s->nr_as = 1; 2398 } 2399 s->as = g_new0(struct KVMAs, s->nr_as); 2400 2401 if (object_property_find(OBJECT(current_machine), "kvm-type")) { 2402 g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), 2403 "kvm-type", 2404 &error_abort); 2405 type = mc->kvm_type(ms, kvm_type); 2406 } else if (mc->kvm_type) { 2407 type = mc->kvm_type(ms, NULL); 2408 } 2409 2410 do { 2411 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 2412 } while (ret == -EINTR); 2413 2414 if (ret < 0) { 2415 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 2416 strerror(-ret)); 2417 2418 #ifdef TARGET_S390X 2419 if (ret == -EINVAL) { 2420 fprintf(stderr, 2421 "Host kernel setup problem detected. 
Please verify:\n"); 2422 fprintf(stderr, "- for kernels supporting the switch_amode or" 2423 " user_mode parameters, whether\n"); 2424 fprintf(stderr, 2425 " user space is running in primary address space\n"); 2426 fprintf(stderr, 2427 "- for kernels supporting the vm.allocate_pgste sysctl, " 2428 "whether it is enabled\n"); 2429 } 2430 #elif defined(TARGET_PPC) 2431 if (ret == -EINVAL) { 2432 fprintf(stderr, 2433 "PPC KVM module is not loaded. Try modprobe kvm_%s.\n", 2434 (type == 2) ? "pr" : "hv"); 2435 } 2436 #endif 2437 goto err; 2438 } 2439 2440 s->vmfd = ret; 2441 2442 /* check the vcpu limits */ 2443 soft_vcpus_limit = kvm_recommended_vcpus(s); 2444 hard_vcpus_limit = kvm_max_vcpus(s); 2445 2446 while (nc->name) { 2447 if (nc->num > soft_vcpus_limit) { 2448 warn_report("Number of %s cpus requested (%d) exceeds " 2449 "the recommended cpus supported by KVM (%d)", 2450 nc->name, nc->num, soft_vcpus_limit); 2451 2452 if (nc->num > hard_vcpus_limit) { 2453 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 2454 "the maximum cpus supported by KVM (%d)\n", 2455 nc->name, nc->num, hard_vcpus_limit); 2456 exit(1); 2457 } 2458 } 2459 nc++; 2460 } 2461 2462 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 2463 if (!missing_cap) { 2464 missing_cap = 2465 kvm_check_extension_list(s, kvm_arch_required_capabilities); 2466 } 2467 if (missing_cap) { 2468 ret = -EINVAL; 2469 fprintf(stderr, "kvm does not support %s\n%s", 2470 missing_cap->name, upgrade_note); 2471 goto err; 2472 } 2473 2474 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 2475 s->coalesced_pio = s->coalesced_mmio && 2476 kvm_check_extension(s, KVM_CAP_COALESCED_PIO); 2477 2478 /* 2479 * Enable KVM dirty ring if supported, otherwise fall back to 2480 * dirty logging mode 2481 */ 2482 if (s->kvm_dirty_ring_size > 0) { 2483 uint64_t ring_bytes; 2484 2485 ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn); 2486 2487 /* Read the max supported pages */ 2488 ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING); 2489 if (ret > 0) { 2490 if (ring_bytes > ret) { 2491 error_report("KVM dirty ring size %" PRIu32 " too big " 2492 "(maximum is %ld). Please use a smaller value.", 2493 s->kvm_dirty_ring_size, 2494 (long)ret / sizeof(struct kvm_dirty_gfn)); 2495 ret = -EINVAL; 2496 goto err; 2497 } 2498 2499 ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes); 2500 if (ret) { 2501 error_report("Enabling of KVM dirty ring failed: %s. " 2502 "Suggested minimum value is 1024.", strerror(-ret)); 2503 goto err; 2504 } 2505 2506 s->kvm_dirty_ring_bytes = ring_bytes; 2507 } else { 2508 warn_report("KVM dirty ring not available, using bitmap method"); 2509 s->kvm_dirty_ring_size = 0; 2510 } 2511 } 2512 2513 /* 2514 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is 2515 * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no 2516 * page is wr-protected initially, which is against how kvm dirty ring is 2517 * usage - kvm dirty ring requires all pages are wr-protected at the very 2518 * beginning. Enabling this feature for dirty ring causes data corruption. 2519 * 2520 * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, 2521 * we may expect a higher stall time when starting the migration. In the 2522 * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: 2523 * instead of clearing dirty bit, it can be a way to explicitly wr-protect 2524 * guest pages. 
2525 */ 2526 if (!s->kvm_dirty_ring_size) { 2527 dirty_log_manual_caps = 2528 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); 2529 dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | 2530 KVM_DIRTY_LOG_INITIALLY_SET); 2531 s->manual_dirty_log_protect = dirty_log_manual_caps; 2532 if (dirty_log_manual_caps) { 2533 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 2534 dirty_log_manual_caps); 2535 if (ret) { 2536 warn_report("Trying to enable capability %"PRIu64" of " 2537 "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " 2538 "Falling back to the legacy mode. ", 2539 dirty_log_manual_caps); 2540 s->manual_dirty_log_protect = 0; 2541 } 2542 } 2543 } 2544 2545 #ifdef KVM_CAP_VCPU_EVENTS 2546 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 2547 #endif 2548 2549 s->robust_singlestep = 2550 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 2551 2552 #ifdef KVM_CAP_DEBUGREGS 2553 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 2554 #endif 2555 2556 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); 2557 2558 #ifdef KVM_CAP_IRQ_ROUTING 2559 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 2560 #endif 2561 2562 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 2563 2564 s->irq_set_ioctl = KVM_IRQ_LINE; 2565 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 2566 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 2567 } 2568 2569 kvm_readonly_mem_allowed = 2570 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 2571 2572 kvm_eventfds_allowed = 2573 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 2574 2575 kvm_irqfds_allowed = 2576 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 2577 2578 kvm_resamplefds_allowed = 2579 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 2580 2581 kvm_vm_attributes_allowed = 2582 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 2583 2584 kvm_ioeventfd_any_length_allowed = 2585 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 2586 2587 #ifdef KVM_CAP_SET_GUEST_DEBUG 2588 kvm_has_guest_debug = 2589 (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0); 2590 #endif 2591 2592 kvm_sstep_flags = 0; 2593 if (kvm_has_guest_debug) { 2594 kvm_sstep_flags = SSTEP_ENABLE; 2595 2596 #if defined KVM_CAP_SET_GUEST_DEBUG2 2597 int guest_debug_flags = 2598 kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2); 2599 2600 if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) { 2601 kvm_sstep_flags |= SSTEP_NOIRQ; 2602 } 2603 #endif 2604 } 2605 2606 kvm_state = s; 2607 2608 ret = kvm_arch_init(ms, s); 2609 if (ret < 0) { 2610 goto err; 2611 } 2612 2613 if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) { 2614 s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? 
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 2615 } 2616 2617 qemu_register_reset(kvm_unpoison_all, NULL); 2618 2619 if (s->kernel_irqchip_allowed) { 2620 kvm_irqchip_create(s); 2621 } 2622 2623 if (kvm_eventfds_allowed) { 2624 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 2625 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 2626 } 2627 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; 2628 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; 2629 2630 kvm_memory_listener_register(s, &s->memory_listener, 2631 &address_space_memory, 0, "kvm-memory"); 2632 if (kvm_eventfds_allowed) { 2633 memory_listener_register(&kvm_io_listener, 2634 &address_space_io); 2635 } 2636 memory_listener_register(&kvm_coalesced_pio_listener, 2637 &address_space_io); 2638 2639 s->many_ioeventfds = kvm_check_many_ioeventfds(); 2640 2641 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 2642 if (!s->sync_mmu) { 2643 ret = ram_block_discard_disable(true); 2644 assert(!ret); 2645 } 2646 2647 if (s->kvm_dirty_ring_size) { 2648 ret = kvm_dirty_ring_reaper_init(s); 2649 if (ret) { 2650 goto err; 2651 } 2652 } 2653 2654 if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) { 2655 add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb, 2656 query_stats_schemas_cb); 2657 } 2658 2659 return 0; 2660 2661 err: 2662 assert(ret < 0); 2663 if (s->vmfd >= 0) { 2664 close(s->vmfd); 2665 } 2666 if (s->fd != -1) { 2667 close(s->fd); 2668 } 2669 g_free(s->memory_listener.slots); 2670 2671 return ret; 2672 } 2673 2674 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 2675 { 2676 s->sigmask_len = sigmask_len; 2677 } 2678 2679 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 2680 int size, uint32_t count) 2681 { 2682 int i; 2683 uint8_t *ptr = data; 2684 2685 for (i = 0; i < count; i++) { 2686 address_space_rw(&address_space_io, port, attrs, 2687 ptr, size, 2688 direction == KVM_EXIT_IO_OUT); 2689 ptr += size; 2690 } 2691 } 2692 2693 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 2694 { 2695 fprintf(stderr, "KVM internal error. Suberror: %d\n", 2696 run->internal.suberror); 2697 2698 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 2699 int i; 2700 2701 for (i = 0; i < run->internal.ndata; ++i) { 2702 fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n", 2703 i, (uint64_t)run->internal.data[i]); 2704 } 2705 } 2706 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 2707 fprintf(stderr, "emulation failure\n"); 2708 if (!kvm_arch_stop_on_emulation_error(cpu)) { 2709 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2710 return EXCP_INTERRUPT; 2711 } 2712 } 2713 /* FIXME: Should trigger a qmp message to let management know 2714 * something went wrong. 
2715 */ 2716 return -1; 2717 } 2718 2719 void kvm_flush_coalesced_mmio_buffer(void) 2720 { 2721 KVMState *s = kvm_state; 2722 2723 if (s->coalesced_flush_in_progress) { 2724 return; 2725 } 2726 2727 s->coalesced_flush_in_progress = true; 2728 2729 if (s->coalesced_mmio_ring) { 2730 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 2731 while (ring->first != ring->last) { 2732 struct kvm_coalesced_mmio *ent; 2733 2734 ent = &ring->coalesced_mmio[ring->first]; 2735 2736 if (ent->pio == 1) { 2737 address_space_write(&address_space_io, ent->phys_addr, 2738 MEMTXATTRS_UNSPECIFIED, ent->data, 2739 ent->len); 2740 } else { 2741 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 2742 } 2743 smp_wmb(); 2744 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 2745 } 2746 } 2747 2748 s->coalesced_flush_in_progress = false; 2749 } 2750 2751 bool kvm_cpu_check_are_resettable(void) 2752 { 2753 return kvm_arch_cpu_check_are_resettable(); 2754 } 2755 2756 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2757 { 2758 if (!cpu->vcpu_dirty) { 2759 kvm_arch_get_registers(cpu); 2760 cpu->vcpu_dirty = true; 2761 } 2762 } 2763 2764 void kvm_cpu_synchronize_state(CPUState *cpu) 2765 { 2766 if (!cpu->vcpu_dirty) { 2767 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 2768 } 2769 } 2770 2771 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 2772 { 2773 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 2774 cpu->vcpu_dirty = false; 2775 } 2776 2777 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 2778 { 2779 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2780 } 2781 2782 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 2783 { 2784 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 2785 cpu->vcpu_dirty = false; 2786 } 2787 2788 void kvm_cpu_synchronize_post_init(CPUState *cpu) 2789 { 2790 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2791 } 2792 2793 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 2794 { 2795 cpu->vcpu_dirty = true; 2796 } 2797 2798 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 2799 { 2800 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2801 } 2802 2803 #ifdef KVM_HAVE_MCE_INJECTION 2804 static __thread void *pending_sigbus_addr; 2805 static __thread int pending_sigbus_code; 2806 static __thread bool have_sigbus_pending; 2807 #endif 2808 2809 static void kvm_cpu_kick(CPUState *cpu) 2810 { 2811 qatomic_set(&cpu->kvm_run->immediate_exit, 1); 2812 } 2813 2814 static void kvm_cpu_kick_self(void) 2815 { 2816 if (kvm_immediate_exit) { 2817 kvm_cpu_kick(current_cpu); 2818 } else { 2819 qemu_cpu_kick_self(); 2820 } 2821 } 2822 2823 static void kvm_eat_signals(CPUState *cpu) 2824 { 2825 struct timespec ts = { 0, 0 }; 2826 siginfo_t siginfo; 2827 sigset_t waitset; 2828 sigset_t chkset; 2829 int r; 2830 2831 if (kvm_immediate_exit) { 2832 qatomic_set(&cpu->kvm_run->immediate_exit, 0); 2833 /* Write kvm_run->immediate_exit before the cpu->exit_request 2834 * write in kvm_cpu_exec. 
2835 */ 2836 smp_wmb(); 2837 return; 2838 } 2839 2840 sigemptyset(&waitset); 2841 sigaddset(&waitset, SIG_IPI); 2842 2843 do { 2844 r = sigtimedwait(&waitset, &siginfo, &ts); 2845 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 2846 perror("sigtimedwait"); 2847 exit(1); 2848 } 2849 2850 r = sigpending(&chkset); 2851 if (r == -1) { 2852 perror("sigpending"); 2853 exit(1); 2854 } 2855 } while (sigismember(&chkset, SIG_IPI)); 2856 } 2857 2858 int kvm_cpu_exec(CPUState *cpu) 2859 { 2860 struct kvm_run *run = cpu->kvm_run; 2861 int ret, run_ret; 2862 2863 DPRINTF("kvm_cpu_exec()\n"); 2864 2865 if (kvm_arch_process_async_events(cpu)) { 2866 qatomic_set(&cpu->exit_request, 0); 2867 return EXCP_HLT; 2868 } 2869 2870 qemu_mutex_unlock_iothread(); 2871 cpu_exec_start(cpu); 2872 2873 do { 2874 MemTxAttrs attrs; 2875 2876 if (cpu->vcpu_dirty) { 2877 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 2878 cpu->vcpu_dirty = false; 2879 } 2880 2881 kvm_arch_pre_run(cpu, run); 2882 if (qatomic_read(&cpu->exit_request)) { 2883 DPRINTF("interrupt exit requested\n"); 2884 /* 2885 * KVM requires us to reenter the kernel after IO exits to complete 2886 * instruction emulation. This self-signal will ensure that we 2887 * leave ASAP again. 2888 */ 2889 kvm_cpu_kick_self(); 2890 } 2891 2892 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 2893 * Matching barrier in kvm_eat_signals. 2894 */ 2895 smp_rmb(); 2896 2897 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 2898 2899 attrs = kvm_arch_post_run(cpu, run); 2900 2901 #ifdef KVM_HAVE_MCE_INJECTION 2902 if (unlikely(have_sigbus_pending)) { 2903 qemu_mutex_lock_iothread(); 2904 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 2905 pending_sigbus_addr); 2906 have_sigbus_pending = false; 2907 qemu_mutex_unlock_iothread(); 2908 } 2909 #endif 2910 2911 if (run_ret < 0) { 2912 if (run_ret == -EINTR || run_ret == -EAGAIN) { 2913 DPRINTF("io window exit\n"); 2914 kvm_eat_signals(cpu); 2915 ret = EXCP_INTERRUPT; 2916 break; 2917 } 2918 fprintf(stderr, "error: kvm run failed %s\n", 2919 strerror(-run_ret)); 2920 #ifdef TARGET_PPC 2921 if (run_ret == -EBUSY) { 2922 fprintf(stderr, 2923 "This is probably because your SMT is enabled.\n" 2924 "VCPU can only run on primary threads with all " 2925 "secondary threads offline.\n"); 2926 } 2927 #endif 2928 ret = -1; 2929 break; 2930 } 2931 2932 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 2933 switch (run->exit_reason) { 2934 case KVM_EXIT_IO: 2935 DPRINTF("handle_io\n"); 2936 /* Called outside BQL */ 2937 kvm_handle_io(run->io.port, attrs, 2938 (uint8_t *)run + run->io.data_offset, 2939 run->io.direction, 2940 run->io.size, 2941 run->io.count); 2942 ret = 0; 2943 break; 2944 case KVM_EXIT_MMIO: 2945 DPRINTF("handle_mmio\n"); 2946 /* Called outside BQL */ 2947 address_space_rw(&address_space_memory, 2948 run->mmio.phys_addr, attrs, 2949 run->mmio.data, 2950 run->mmio.len, 2951 run->mmio.is_write); 2952 ret = 0; 2953 break; 2954 case KVM_EXIT_IRQ_WINDOW_OPEN: 2955 DPRINTF("irq_window_open\n"); 2956 ret = EXCP_INTERRUPT; 2957 break; 2958 case KVM_EXIT_SHUTDOWN: 2959 DPRINTF("shutdown\n"); 2960 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2961 ret = EXCP_INTERRUPT; 2962 break; 2963 case KVM_EXIT_UNKNOWN: 2964 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 2965 (uint64_t)run->hw.hardware_exit_reason); 2966 ret = -1; 2967 break; 2968 case KVM_EXIT_INTERNAL_ERROR: 2969 ret = kvm_handle_internal_error(cpu, run); 2970 break; 2971 case KVM_EXIT_DIRTY_RING_FULL: 2972 /* 2973 * We 
shouldn't continue if the dirty ring of this vcpu is 2974 * still full. Got kicked by KVM_RESET_DIRTY_RINGS. 2975 */ 2976 trace_kvm_dirty_ring_full(cpu->cpu_index); 2977 qemu_mutex_lock_iothread(); 2978 kvm_dirty_ring_reap(kvm_state, NULL); 2979 qemu_mutex_unlock_iothread(); 2980 ret = 0; 2981 break; 2982 case KVM_EXIT_SYSTEM_EVENT: 2983 switch (run->system_event.type) { 2984 case KVM_SYSTEM_EVENT_SHUTDOWN: 2985 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 2986 ret = EXCP_INTERRUPT; 2987 break; 2988 case KVM_SYSTEM_EVENT_RESET: 2989 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2990 ret = EXCP_INTERRUPT; 2991 break; 2992 case KVM_SYSTEM_EVENT_CRASH: 2993 kvm_cpu_synchronize_state(cpu); 2994 qemu_mutex_lock_iothread(); 2995 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2996 qemu_mutex_unlock_iothread(); 2997 ret = 0; 2998 break; 2999 default: 3000 DPRINTF("kvm_arch_handle_exit\n"); 3001 ret = kvm_arch_handle_exit(cpu, run); 3002 break; 3003 } 3004 break; 3005 default: 3006 DPRINTF("kvm_arch_handle_exit\n"); 3007 ret = kvm_arch_handle_exit(cpu, run); 3008 break; 3009 } 3010 } while (ret == 0); 3011 3012 cpu_exec_end(cpu); 3013 qemu_mutex_lock_iothread(); 3014 3015 if (ret < 0) { 3016 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 3017 vm_stop(RUN_STATE_INTERNAL_ERROR); 3018 } 3019 3020 qatomic_set(&cpu->exit_request, 0); 3021 return ret; 3022 } 3023 3024 int kvm_ioctl(KVMState *s, int type, ...) 3025 { 3026 int ret; 3027 void *arg; 3028 va_list ap; 3029 3030 va_start(ap, type); 3031 arg = va_arg(ap, void *); 3032 va_end(ap); 3033 3034 trace_kvm_ioctl(type, arg); 3035 ret = ioctl(s->fd, type, arg); 3036 if (ret == -1) { 3037 ret = -errno; 3038 } 3039 return ret; 3040 } 3041 3042 int kvm_vm_ioctl(KVMState *s, int type, ...) 3043 { 3044 int ret; 3045 void *arg; 3046 va_list ap; 3047 3048 va_start(ap, type); 3049 arg = va_arg(ap, void *); 3050 va_end(ap); 3051 3052 trace_kvm_vm_ioctl(type, arg); 3053 ret = ioctl(s->vmfd, type, arg); 3054 if (ret == -1) { 3055 ret = -errno; 3056 } 3057 return ret; 3058 } 3059 3060 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 3061 { 3062 int ret; 3063 void *arg; 3064 va_list ap; 3065 3066 va_start(ap, type); 3067 arg = va_arg(ap, void *); 3068 va_end(ap); 3069 3070 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 3071 ret = ioctl(cpu->kvm_fd, type, arg); 3072 if (ret == -1) { 3073 ret = -errno; 3074 } 3075 return ret; 3076 } 3077 3078 int kvm_device_ioctl(int fd, int type, ...) 3079 { 3080 int ret; 3081 void *arg; 3082 va_list ap; 3083 3084 va_start(ap, type); 3085 arg = va_arg(ap, void *); 3086 va_end(ap); 3087 3088 trace_kvm_device_ioctl(fd, type, arg); 3089 ret = ioctl(fd, type, arg); 3090 if (ret == -1) { 3091 ret = -errno; 3092 } 3093 return ret; 3094 } 3095 3096 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 3097 { 3098 int ret; 3099 struct kvm_device_attr attribute = { 3100 .group = group, 3101 .attr = attr, 3102 }; 3103 3104 if (!kvm_vm_attributes_allowed) { 3105 return 0; 3106 } 3107 3108 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 3109 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 3110 return ret ? 0 : 1; 3111 } 3112 3113 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 3114 { 3115 struct kvm_device_attr attribute = { 3116 .group = group, 3117 .attr = attr, 3118 .flags = 0, 3119 }; 3120 3121 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 
0 : 1; 3122 } 3123 3124 int kvm_device_access(int fd, int group, uint64_t attr, 3125 void *val, bool write, Error **errp) 3126 { 3127 struct kvm_device_attr kvmattr; 3128 int err; 3129 3130 kvmattr.flags = 0; 3131 kvmattr.group = group; 3132 kvmattr.attr = attr; 3133 kvmattr.addr = (uintptr_t)val; 3134 3135 err = kvm_device_ioctl(fd, 3136 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, 3137 &kvmattr); 3138 if (err < 0) { 3139 error_setg_errno(errp, -err, 3140 "KVM_%s_DEVICE_ATTR failed: Group %d " 3141 "attr 0x%016" PRIx64, 3142 write ? "SET" : "GET", group, attr); 3143 } 3144 return err; 3145 } 3146 3147 bool kvm_has_sync_mmu(void) 3148 { 3149 return kvm_state->sync_mmu; 3150 } 3151 3152 int kvm_has_vcpu_events(void) 3153 { 3154 return kvm_state->vcpu_events; 3155 } 3156 3157 int kvm_has_robust_singlestep(void) 3158 { 3159 return kvm_state->robust_singlestep; 3160 } 3161 3162 int kvm_has_debugregs(void) 3163 { 3164 return kvm_state->debugregs; 3165 } 3166 3167 int kvm_max_nested_state_length(void) 3168 { 3169 return kvm_state->max_nested_state_len; 3170 } 3171 3172 int kvm_has_many_ioeventfds(void) 3173 { 3174 if (!kvm_enabled()) { 3175 return 0; 3176 } 3177 return kvm_state->many_ioeventfds; 3178 } 3179 3180 int kvm_has_gsi_routing(void) 3181 { 3182 #ifdef KVM_CAP_IRQ_ROUTING 3183 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING); 3184 #else 3185 return false; 3186 #endif 3187 } 3188 3189 int kvm_has_intx_set_mask(void) 3190 { 3191 return kvm_state->intx_set_mask; 3192 } 3193 3194 bool kvm_arm_supports_user_irq(void) 3195 { 3196 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); 3197 } 3198 3199 #ifdef KVM_CAP_SET_GUEST_DEBUG 3200 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, 3201 target_ulong pc) 3202 { 3203 struct kvm_sw_breakpoint *bp; 3204 3205 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) { 3206 if (bp->pc == pc) { 3207 return bp; 3208 } 3209 } 3210 return NULL; 3211 } 3212 3213 int kvm_sw_breakpoints_active(CPUState *cpu) 3214 { 3215 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints); 3216 } 3217 3218 struct kvm_set_guest_debug_data { 3219 struct kvm_guest_debug dbg; 3220 int err; 3221 }; 3222 3223 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data) 3224 { 3225 struct kvm_set_guest_debug_data *dbg_data = 3226 (struct kvm_set_guest_debug_data *) data.host_ptr; 3227 3228 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, 3229 &dbg_data->dbg); 3230 } 3231 3232 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 3233 { 3234 struct kvm_set_guest_debug_data data; 3235 3236 data.dbg.control = reinject_trap; 3237 3238 if (cpu->singlestep_enabled) { 3239 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; 3240 3241 if (cpu->singlestep_enabled & SSTEP_NOIRQ) { 3242 data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ; 3243 } 3244 } 3245 kvm_arch_update_guest_debug(cpu, &data.dbg); 3246 3247 run_on_cpu(cpu, kvm_invoke_set_guest_debug, 3248 RUN_ON_CPU_HOST_PTR(&data)); 3249 return data.err; 3250 } 3251 3252 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 3253 target_ulong len, int type) 3254 { 3255 struct kvm_sw_breakpoint *bp; 3256 int err; 3257 3258 if (type == GDB_BREAKPOINT_SW) { 3259 bp = kvm_find_sw_breakpoint(cpu, addr); 3260 if (bp) { 3261 bp->use_count++; 3262 return 0; 3263 } 3264 3265 bp = g_new(struct kvm_sw_breakpoint, 1); 3266 bp->pc = addr; 3267 bp->use_count = 1; 3268 err = kvm_arch_insert_sw_breakpoint(cpu, bp); 3269 if (err) { 3270 g_free(bp); 3271 
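            /*
             * kvm_arch_insert_sw_breakpoint() failed, so the guest was not
             * patched; free the still-unlinked bp and report the error to the
             * caller (typically the gdbstub) without touching the
             * kvm_sw_breakpoints list.
             */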
return err; 3272 } 3273 3274 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 3275 } else { 3276 err = kvm_arch_insert_hw_breakpoint(addr, len, type); 3277 if (err) { 3278 return err; 3279 } 3280 } 3281 3282 CPU_FOREACH(cpu) { 3283 err = kvm_update_guest_debug(cpu, 0); 3284 if (err) { 3285 return err; 3286 } 3287 } 3288 return 0; 3289 } 3290 3291 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, 3292 target_ulong len, int type) 3293 { 3294 struct kvm_sw_breakpoint *bp; 3295 int err; 3296 3297 if (type == GDB_BREAKPOINT_SW) { 3298 bp = kvm_find_sw_breakpoint(cpu, addr); 3299 if (!bp) { 3300 return -ENOENT; 3301 } 3302 3303 if (bp->use_count > 1) { 3304 bp->use_count--; 3305 return 0; 3306 } 3307 3308 err = kvm_arch_remove_sw_breakpoint(cpu, bp); 3309 if (err) { 3310 return err; 3311 } 3312 3313 QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 3314 g_free(bp); 3315 } else { 3316 err = kvm_arch_remove_hw_breakpoint(addr, len, type); 3317 if (err) { 3318 return err; 3319 } 3320 } 3321 3322 CPU_FOREACH(cpu) { 3323 err = kvm_update_guest_debug(cpu, 0); 3324 if (err) { 3325 return err; 3326 } 3327 } 3328 return 0; 3329 } 3330 3331 void kvm_remove_all_breakpoints(CPUState *cpu) 3332 { 3333 struct kvm_sw_breakpoint *bp, *next; 3334 KVMState *s = cpu->kvm_state; 3335 CPUState *tmpcpu; 3336 3337 QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) { 3338 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) { 3339 /* Try harder to find a CPU that currently sees the breakpoint. */ 3340 CPU_FOREACH(tmpcpu) { 3341 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) { 3342 break; 3343 } 3344 } 3345 } 3346 QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry); 3347 g_free(bp); 3348 } 3349 kvm_arch_remove_all_hw_breakpoints(); 3350 3351 CPU_FOREACH(cpu) { 3352 kvm_update_guest_debug(cpu, 0); 3353 } 3354 } 3355 3356 #else /* !KVM_CAP_SET_GUEST_DEBUG */ 3357 3358 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 3359 { 3360 return -EINVAL; 3361 } 3362 3363 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 3364 target_ulong len, int type) 3365 { 3366 return -EINVAL; 3367 } 3368 3369 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, 3370 target_ulong len, int type) 3371 { 3372 return -EINVAL; 3373 } 3374 3375 void kvm_remove_all_breakpoints(CPUState *cpu) 3376 { 3377 } 3378 #endif /* !KVM_CAP_SET_GUEST_DEBUG */ 3379 3380 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) 3381 { 3382 KVMState *s = kvm_state; 3383 struct kvm_signal_mask *sigmask; 3384 int r; 3385 3386 sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); 3387 3388 sigmask->len = s->sigmask_len; 3389 memcpy(sigmask->sigset, sigset, sizeof(*sigset)); 3390 r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask); 3391 g_free(sigmask); 3392 3393 return r; 3394 } 3395 3396 static void kvm_ipi_signal(int sig) 3397 { 3398 if (current_cpu) { 3399 assert(kvm_immediate_exit); 3400 kvm_cpu_kick(current_cpu); 3401 } 3402 } 3403 3404 void kvm_init_cpu_signals(CPUState *cpu) 3405 { 3406 int r; 3407 sigset_t set; 3408 struct sigaction sigact; 3409 3410 memset(&sigact, 0, sizeof(sigact)); 3411 sigact.sa_handler = kvm_ipi_signal; 3412 sigaction(SIG_IPI, &sigact, NULL); 3413 3414 pthread_sigmask(SIG_BLOCK, NULL, &set); 3415 #if defined KVM_HAVE_MCE_INJECTION 3416 sigdelset(&set, SIGBUS); 3417 pthread_sigmask(SIG_SETMASK, &set, NULL); 3418 #endif 3419 sigdelset(&set, SIG_IPI); 3420 if (kvm_immediate_exit) { 3421 r = pthread_sigmask(SIG_SETMASK, &set, NULL); 3422 } 
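    /*
     * Fallback for kernels without KVM_CAP_IMMEDIATE_EXIT: leave SIG_IPI
     * blocked in the thread's signal mask and instead hand KVM a mask with
     * SIG_IPI removed via KVM_SET_SIGNAL_MASK, so the kick signal can only
     * be delivered while the vCPU sits inside KVM_RUN (forcing it to return
     * -EINTR); any pending SIG_IPI is then consumed by kvm_eat_signals().
     */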
else { 3423 r = kvm_set_signal_mask(cpu, &set); 3424 } 3425 if (r) { 3426 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); 3427 exit(1); 3428 } 3429 } 3430 3431 /* Called asynchronously in VCPU thread. */ 3432 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) 3433 { 3434 #ifdef KVM_HAVE_MCE_INJECTION 3435 if (have_sigbus_pending) { 3436 return 1; 3437 } 3438 have_sigbus_pending = true; 3439 pending_sigbus_addr = addr; 3440 pending_sigbus_code = code; 3441 qatomic_set(&cpu->exit_request, 1); 3442 return 0; 3443 #else 3444 return 1; 3445 #endif 3446 } 3447 3448 /* Called synchronously (via signalfd) in main thread. */ 3449 int kvm_on_sigbus(int code, void *addr) 3450 { 3451 #ifdef KVM_HAVE_MCE_INJECTION 3452 /* Action required MCE kills the process if SIGBUS is blocked. Because 3453 * that's what happens in the I/O thread, where we handle MCE via signalfd, 3454 * we can only get action optional here. 3455 */ 3456 assert(code != BUS_MCEERR_AR); 3457 kvm_arch_on_sigbus_vcpu(first_cpu, code, addr); 3458 return 0; 3459 #else 3460 return 1; 3461 #endif 3462 } 3463 3464 int kvm_create_device(KVMState *s, uint64_t type, bool test) 3465 { 3466 int ret; 3467 struct kvm_create_device create_dev; 3468 3469 create_dev.type = type; 3470 create_dev.fd = -1; 3471 create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0; 3472 3473 if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) { 3474 return -ENOTSUP; 3475 } 3476 3477 ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev); 3478 if (ret) { 3479 return ret; 3480 } 3481 3482 return test ? 0 : create_dev.fd; 3483 } 3484 3485 bool kvm_device_supported(int vmfd, uint64_t type) 3486 { 3487 struct kvm_create_device create_dev = { 3488 .type = type, 3489 .fd = -1, 3490 .flags = KVM_CREATE_DEVICE_TEST, 3491 }; 3492 3493 if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) { 3494 return false; 3495 } 3496 3497 return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0); 3498 } 3499 3500 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source) 3501 { 3502 struct kvm_one_reg reg; 3503 int r; 3504 3505 reg.id = id; 3506 reg.addr = (uintptr_t) source; 3507 r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); 3508 if (r) { 3509 trace_kvm_failed_reg_set(id, strerror(-r)); 3510 } 3511 return r; 3512 } 3513 3514 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) 3515 { 3516 struct kvm_one_reg reg; 3517 int r; 3518 3519 reg.id = id; 3520 reg.addr = (uintptr_t) target; 3521 r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); 3522 if (r) { 3523 trace_kvm_failed_reg_get(id, strerror(-r)); 3524 } 3525 return r; 3526 } 3527 3528 static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as, 3529 hwaddr start_addr, hwaddr size) 3530 { 3531 KVMState *kvm = KVM_STATE(ms->accelerator); 3532 int i; 3533 3534 for (i = 0; i < kvm->nr_as; ++i) { 3535 if (kvm->as[i].as == as && kvm->as[i].ml) { 3536 size = MIN(kvm_max_slot_size, size); 3537 return NULL != kvm_lookup_matching_slot(kvm->as[i].ml, 3538 start_addr, size); 3539 } 3540 } 3541 3542 return false; 3543 } 3544 3545 static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v, 3546 const char *name, void *opaque, 3547 Error **errp) 3548 { 3549 KVMState *s = KVM_STATE(obj); 3550 int64_t value = s->kvm_shadow_mem; 3551 3552 visit_type_int(v, name, &value, errp); 3553 } 3554 3555 static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v, 3556 const char *name, void *opaque, 3557 Error **errp) 3558 { 3559 KVMState *s = KVM_STATE(obj); 3560 int64_t value; 3561 3562 if (s->fd != -1) { 3563 error_setg(errp, 
"Cannot set properties after the accelerator has been initialized"); 3564 return; 3565 } 3566 3567 if (!visit_type_int(v, name, &value, errp)) { 3568 return; 3569 } 3570 3571 s->kvm_shadow_mem = value; 3572 } 3573 3574 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v, 3575 const char *name, void *opaque, 3576 Error **errp) 3577 { 3578 KVMState *s = KVM_STATE(obj); 3579 OnOffSplit mode; 3580 3581 if (s->fd != -1) { 3582 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 3583 return; 3584 } 3585 3586 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 3587 return; 3588 } 3589 switch (mode) { 3590 case ON_OFF_SPLIT_ON: 3591 s->kernel_irqchip_allowed = true; 3592 s->kernel_irqchip_required = true; 3593 s->kernel_irqchip_split = ON_OFF_AUTO_OFF; 3594 break; 3595 case ON_OFF_SPLIT_OFF: 3596 s->kernel_irqchip_allowed = false; 3597 s->kernel_irqchip_required = false; 3598 s->kernel_irqchip_split = ON_OFF_AUTO_OFF; 3599 break; 3600 case ON_OFF_SPLIT_SPLIT: 3601 s->kernel_irqchip_allowed = true; 3602 s->kernel_irqchip_required = true; 3603 s->kernel_irqchip_split = ON_OFF_AUTO_ON; 3604 break; 3605 default: 3606 /* The value was checked in visit_type_OnOffSplit() above. If 3607 * we get here, then something is wrong in QEMU. 3608 */ 3609 abort(); 3610 } 3611 } 3612 3613 bool kvm_kernel_irqchip_allowed(void) 3614 { 3615 return kvm_state->kernel_irqchip_allowed; 3616 } 3617 3618 bool kvm_kernel_irqchip_required(void) 3619 { 3620 return kvm_state->kernel_irqchip_required; 3621 } 3622 3623 bool kvm_kernel_irqchip_split(void) 3624 { 3625 return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON; 3626 } 3627 3628 static void kvm_get_dirty_ring_size(Object *obj, Visitor *v, 3629 const char *name, void *opaque, 3630 Error **errp) 3631 { 3632 KVMState *s = KVM_STATE(obj); 3633 uint32_t value = s->kvm_dirty_ring_size; 3634 3635 visit_type_uint32(v, name, &value, errp); 3636 } 3637 3638 static void kvm_set_dirty_ring_size(Object *obj, Visitor *v, 3639 const char *name, void *opaque, 3640 Error **errp) 3641 { 3642 KVMState *s = KVM_STATE(obj); 3643 Error *error = NULL; 3644 uint32_t value; 3645 3646 if (s->fd != -1) { 3647 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 3648 return; 3649 } 3650 3651 visit_type_uint32(v, name, &value, &error); 3652 if (error) { 3653 error_propagate(errp, error); 3654 return; 3655 } 3656 if (value & (value - 1)) { 3657 error_setg(errp, "dirty-ring-size must be a power of two."); 3658 return; 3659 } 3660 3661 s->kvm_dirty_ring_size = value; 3662 } 3663 3664 static void kvm_accel_instance_init(Object *obj) 3665 { 3666 KVMState *s = KVM_STATE(obj); 3667 3668 s->fd = -1; 3669 s->vmfd = -1; 3670 s->kvm_shadow_mem = -1; 3671 s->kernel_irqchip_allowed = true; 3672 s->kernel_irqchip_split = ON_OFF_AUTO_AUTO; 3673 /* KVM dirty ring is by default off */ 3674 s->kvm_dirty_ring_size = 0; 3675 } 3676 3677 static void kvm_accel_class_init(ObjectClass *oc, void *data) 3678 { 3679 AccelClass *ac = ACCEL_CLASS(oc); 3680 ac->name = "KVM"; 3681 ac->init_machine = kvm_init; 3682 ac->has_memory = kvm_accel_has_memory; 3683 ac->allowed = &kvm_allowed; 3684 3685 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 3686 NULL, kvm_set_kernel_irqchip, 3687 NULL, NULL); 3688 object_class_property_set_description(oc, "kernel-irqchip", 3689 "Configure KVM in-kernel irqchip"); 3690 3691 object_class_property_add(oc, "kvm-shadow-mem", "int", 3692 kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem, 3693 NULL, NULL); 3694 
object_class_property_set_description(oc, "kvm-shadow-mem", 3695 "KVM shadow MMU size"); 3696 3697 object_class_property_add(oc, "dirty-ring-size", "uint32", 3698 kvm_get_dirty_ring_size, kvm_set_dirty_ring_size, 3699 NULL, NULL); 3700 object_class_property_set_description(oc, "dirty-ring-size", 3701 "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)"); 3702 } 3703 3704 static const TypeInfo kvm_accel_type = { 3705 .name = TYPE_KVM_ACCEL, 3706 .parent = TYPE_ACCEL, 3707 .instance_init = kvm_accel_instance_init, 3708 .class_init = kvm_accel_class_init, 3709 .instance_size = sizeof(KVMState), 3710 }; 3711 3712 static void kvm_type_init(void) 3713 { 3714 type_register_static(&kvm_accel_type); 3715 } 3716 3717 type_init(kvm_type_init); 3718 3719 typedef struct StatsArgs { 3720 union StatsResultsType { 3721 StatsResultList **stats; 3722 StatsSchemaList **schema; 3723 } result; 3724 strList *names; 3725 Error **errp; 3726 } StatsArgs; 3727 3728 static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc, 3729 uint64_t *stats_data, 3730 StatsList *stats_list, 3731 Error **errp) 3732 { 3733 3734 Stats *stats; 3735 uint64List *val_list = NULL; 3736 3737 /* Only add stats that we understand. */ 3738 switch (pdesc->flags & KVM_STATS_TYPE_MASK) { 3739 case KVM_STATS_TYPE_CUMULATIVE: 3740 case KVM_STATS_TYPE_INSTANT: 3741 case KVM_STATS_TYPE_PEAK: 3742 case KVM_STATS_TYPE_LINEAR_HIST: 3743 case KVM_STATS_TYPE_LOG_HIST: 3744 break; 3745 default: 3746 return stats_list; 3747 } 3748 3749 switch (pdesc->flags & KVM_STATS_UNIT_MASK) { 3750 case KVM_STATS_UNIT_NONE: 3751 case KVM_STATS_UNIT_BYTES: 3752 case KVM_STATS_UNIT_CYCLES: 3753 case KVM_STATS_UNIT_SECONDS: 3754 break; 3755 default: 3756 return stats_list; 3757 } 3758 3759 switch (pdesc->flags & KVM_STATS_BASE_MASK) { 3760 case KVM_STATS_BASE_POW10: 3761 case KVM_STATS_BASE_POW2: 3762 break; 3763 default: 3764 return stats_list; 3765 } 3766 3767 /* Alloc and populate data list */ 3768 stats = g_new0(Stats, 1); 3769 stats->name = g_strdup(pdesc->name); 3770 stats->value = g_new0(StatsValue, 1);; 3771 3772 if (pdesc->size == 1) { 3773 stats->value->u.scalar = *stats_data; 3774 stats->value->type = QTYPE_QNUM; 3775 } else { 3776 int i; 3777 for (i = 0; i < pdesc->size; i++) { 3778 QAPI_LIST_PREPEND(val_list, stats_data[i]); 3779 } 3780 stats->value->u.list = val_list; 3781 stats->value->type = QTYPE_QLIST; 3782 } 3783 3784 QAPI_LIST_PREPEND(stats_list, stats); 3785 return stats_list; 3786 } 3787 3788 static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc, 3789 StatsSchemaValueList *list, 3790 Error **errp) 3791 { 3792 StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1); 3793 schema_entry->value = g_new0(StatsSchemaValue, 1); 3794 3795 switch (pdesc->flags & KVM_STATS_TYPE_MASK) { 3796 case KVM_STATS_TYPE_CUMULATIVE: 3797 schema_entry->value->type = STATS_TYPE_CUMULATIVE; 3798 break; 3799 case KVM_STATS_TYPE_INSTANT: 3800 schema_entry->value->type = STATS_TYPE_INSTANT; 3801 break; 3802 case KVM_STATS_TYPE_PEAK: 3803 schema_entry->value->type = STATS_TYPE_PEAK; 3804 break; 3805 case KVM_STATS_TYPE_LINEAR_HIST: 3806 schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM; 3807 schema_entry->value->bucket_size = pdesc->bucket_size; 3808 schema_entry->value->has_bucket_size = true; 3809 break; 3810 case KVM_STATS_TYPE_LOG_HIST: 3811 schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM; 3812 break; 3813 default: 3814 goto exit; 3815 } 3816 3817 switch (pdesc->flags & KVM_STATS_UNIT_MASK) { 3818 case 
KVM_STATS_UNIT_NONE: 3819 break; 3820 case KVM_STATS_UNIT_BYTES: 3821 schema_entry->value->has_unit = true; 3822 schema_entry->value->unit = STATS_UNIT_BYTES; 3823 break; 3824 case KVM_STATS_UNIT_CYCLES: 3825 schema_entry->value->has_unit = true; 3826 schema_entry->value->unit = STATS_UNIT_CYCLES; 3827 break; 3828 case KVM_STATS_UNIT_SECONDS: 3829 schema_entry->value->has_unit = true; 3830 schema_entry->value->unit = STATS_UNIT_SECONDS; 3831 break; 3832 default: 3833 goto exit; 3834 } 3835 3836 schema_entry->value->exponent = pdesc->exponent; 3837 if (pdesc->exponent) { 3838 switch (pdesc->flags & KVM_STATS_BASE_MASK) { 3839 case KVM_STATS_BASE_POW10: 3840 schema_entry->value->has_base = true; 3841 schema_entry->value->base = 10; 3842 break; 3843 case KVM_STATS_BASE_POW2: 3844 schema_entry->value->has_base = true; 3845 schema_entry->value->base = 2; 3846 break; 3847 default: 3848 goto exit; 3849 } 3850 } 3851 3852 schema_entry->value->name = g_strdup(pdesc->name); 3853 schema_entry->next = list; 3854 return schema_entry; 3855 exit: 3856 g_free(schema_entry->value); 3857 g_free(schema_entry); 3858 return list; 3859 } 3860 3861 /* Cached stats descriptors */ 3862 typedef struct StatsDescriptors { 3863 const char *ident; /* cache key, currently the StatsTarget */ 3864 struct kvm_stats_desc *kvm_stats_desc; 3865 struct kvm_stats_header *kvm_stats_header; 3866 QTAILQ_ENTRY(StatsDescriptors) next; 3867 } StatsDescriptors; 3868 3869 static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors = 3870 QTAILQ_HEAD_INITIALIZER(stats_descriptors); 3871 3872 /* 3873 * Return the descriptors for 'target', that either have already been read 3874 * or are retrieved from 'stats_fd'. 3875 */ 3876 static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd, 3877 Error **errp) 3878 { 3879 StatsDescriptors *descriptors; 3880 const char *ident; 3881 struct kvm_stats_desc *kvm_stats_desc; 3882 struct kvm_stats_header *kvm_stats_header; 3883 size_t size_desc; 3884 ssize_t ret; 3885 3886 ident = StatsTarget_str(target); 3887 QTAILQ_FOREACH(descriptors, &stats_descriptors, next) { 3888 if (g_str_equal(descriptors->ident, ident)) { 3889 return descriptors; 3890 } 3891 } 3892 3893 descriptors = g_new0(StatsDescriptors, 1); 3894 3895 /* Read stats header */ 3896 kvm_stats_header = g_malloc(sizeof(*kvm_stats_header)); 3897 ret = read(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header)); 3898 if (ret != sizeof(*kvm_stats_header)) { 3899 error_setg(errp, "KVM stats: failed to read stats header: " 3900 "expected %zu actual %zu", 3901 sizeof(*kvm_stats_header), ret); 3902 g_free(descriptors); 3903 return NULL; 3904 } 3905 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 3906 3907 /* Read stats descriptors */ 3908 kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc); 3909 ret = pread(stats_fd, kvm_stats_desc, 3910 size_desc * kvm_stats_header->num_desc, 3911 kvm_stats_header->desc_offset); 3912 3913 if (ret != size_desc * kvm_stats_header->num_desc) { 3914 error_setg(errp, "KVM stats: failed to read stats descriptors: " 3915 "expected %zu actual %zu", 3916 size_desc * kvm_stats_header->num_desc, ret); 3917 g_free(descriptors); 3918 g_free(kvm_stats_desc); 3919 return NULL; 3920 } 3921 descriptors->kvm_stats_header = kvm_stats_header; 3922 descriptors->kvm_stats_desc = kvm_stats_desc; 3923 descriptors->ident = ident; 3924 QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next); 3925 return descriptors; 3926 } 3927 3928 static void query_stats(StatsResultList **result, 
StatsTarget target, 3929 strList *names, int stats_fd, Error **errp) 3930 { 3931 struct kvm_stats_desc *kvm_stats_desc; 3932 struct kvm_stats_header *kvm_stats_header; 3933 StatsDescriptors *descriptors; 3934 g_autofree uint64_t *stats_data = NULL; 3935 struct kvm_stats_desc *pdesc; 3936 StatsList *stats_list = NULL; 3937 size_t size_desc, size_data = 0; 3938 ssize_t ret; 3939 int i; 3940 3941 descriptors = find_stats_descriptors(target, stats_fd, errp); 3942 if (!descriptors) { 3943 return; 3944 } 3945 3946 kvm_stats_header = descriptors->kvm_stats_header; 3947 kvm_stats_desc = descriptors->kvm_stats_desc; 3948 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 3949 3950 /* Tally the total data size; read schema data */ 3951 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 3952 pdesc = (void *)kvm_stats_desc + i * size_desc; 3953 size_data += pdesc->size * sizeof(*stats_data); 3954 } 3955 3956 stats_data = g_malloc0(size_data); 3957 ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset); 3958 3959 if (ret != size_data) { 3960 error_setg(errp, "KVM stats: failed to read data: " 3961 "expected %zu actual %zu", size_data, ret); 3962 return; 3963 } 3964 3965 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 3966 uint64_t *stats; 3967 pdesc = (void *)kvm_stats_desc + i * size_desc; 3968 3969 /* Add entry to the list */ 3970 stats = (void *)stats_data + pdesc->offset; 3971 if (!apply_str_list_filter(pdesc->name, names)) { 3972 continue; 3973 } 3974 stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp); 3975 } 3976 3977 if (!stats_list) { 3978 return; 3979 } 3980 3981 switch (target) { 3982 case STATS_TARGET_VM: 3983 add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list); 3984 break; 3985 case STATS_TARGET_VCPU: 3986 add_stats_entry(result, STATS_PROVIDER_KVM, 3987 current_cpu->parent_obj.canonical_path, 3988 stats_list); 3989 break; 3990 default: 3991 break; 3992 } 3993 } 3994 3995 static void query_stats_schema(StatsSchemaList **result, StatsTarget target, 3996 int stats_fd, Error **errp) 3997 { 3998 struct kvm_stats_desc *kvm_stats_desc; 3999 struct kvm_stats_header *kvm_stats_header; 4000 StatsDescriptors *descriptors; 4001 struct kvm_stats_desc *pdesc; 4002 StatsSchemaValueList *stats_list = NULL; 4003 size_t size_desc; 4004 int i; 4005 4006 descriptors = find_stats_descriptors(target, stats_fd, errp); 4007 if (!descriptors) { 4008 return; 4009 } 4010 4011 kvm_stats_header = descriptors->kvm_stats_header; 4012 kvm_stats_desc = descriptors->kvm_stats_desc; 4013 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 4014 4015 /* Tally the total data size; read schema data */ 4016 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 4017 pdesc = (void *)kvm_stats_desc + i * size_desc; 4018 stats_list = add_kvmschema_entry(pdesc, stats_list, errp); 4019 } 4020 4021 add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list); 4022 } 4023 4024 static void query_stats_vcpu(CPUState *cpu, run_on_cpu_data data) 4025 { 4026 StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr; 4027 int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL); 4028 Error *local_err = NULL; 4029 4030 if (stats_fd == -1) { 4031 error_setg_errno(&local_err, errno, "KVM stats: ioctl failed"); 4032 error_propagate(kvm_stats_args->errp, local_err); 4033 return; 4034 } 4035 query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU, 4036 kvm_stats_args->names, stats_fd, kvm_stats_args->errp); 4037 close(stats_fd); 4038 } 4039 4040 static void 
query_stats_schema_vcpu(CPUState *cpu, run_on_cpu_data data) 4041 { 4042 StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr; 4043 int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL); 4044 Error *local_err = NULL; 4045 4046 if (stats_fd == -1) { 4047 error_setg_errno(&local_err, errno, "KVM stats: ioctl failed"); 4048 error_propagate(kvm_stats_args->errp, local_err); 4049 return; 4050 } 4051 query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd, 4052 kvm_stats_args->errp); 4053 close(stats_fd); 4054 } 4055 4056 static void query_stats_cb(StatsResultList **result, StatsTarget target, 4057 strList *names, strList *targets, Error **errp) 4058 { 4059 KVMState *s = kvm_state; 4060 CPUState *cpu; 4061 int stats_fd; 4062 4063 switch (target) { 4064 case STATS_TARGET_VM: 4065 { 4066 stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL); 4067 if (stats_fd == -1) { 4068 error_setg_errno(errp, errno, "KVM stats: ioctl failed"); 4069 return; 4070 } 4071 query_stats(result, target, names, stats_fd, errp); 4072 close(stats_fd); 4073 break; 4074 } 4075 case STATS_TARGET_VCPU: 4076 { 4077 StatsArgs stats_args; 4078 stats_args.result.stats = result; 4079 stats_args.names = names; 4080 stats_args.errp = errp; 4081 CPU_FOREACH(cpu) { 4082 if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) { 4083 continue; 4084 } 4085 run_on_cpu(cpu, query_stats_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args)); 4086 } 4087 break; 4088 } 4089 default: 4090 break; 4091 } 4092 } 4093 4094 void query_stats_schemas_cb(StatsSchemaList **result, Error **errp) 4095 { 4096 StatsArgs stats_args; 4097 KVMState *s = kvm_state; 4098 int stats_fd; 4099 4100 stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL); 4101 if (stats_fd == -1) { 4102 error_setg_errno(errp, errno, "KVM stats: ioctl failed"); 4103 return; 4104 } 4105 query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp); 4106 close(stats_fd); 4107 4108 stats_args.result.schema = result; 4109 stats_args.errp = errp; 4110 run_on_cpu(first_cpu, query_stats_schema_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args)); 4111 } 4112
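
/*
 * For reference: the binary stats file descriptor returned by
 * KVM_GET_STATS_FD (consumed by query_stats() and query_stats_schema()
 * above) is laid out as a header, followed by an array of descriptors at
 * hdr.desc_offset, followed by the values at hdr.data_offset.  Below is a
 * minimal sketch of how such an fd could be walked, assuming only the
 * <linux/kvm.h> definitions; variable names are illustrative, and the error
 * handling and caching done by find_stats_descriptors() are omitted:
 *
 *     struct kvm_stats_header hdr;
 *     read(stats_fd, &hdr, sizeof(hdr));
 *
 *     size_t desc_sz = sizeof(struct kvm_stats_desc) + hdr.name_size;
 *     void *descs = g_malloc0_n(hdr.num_desc, desc_sz);
 *     pread(stats_fd, descs, desc_sz * hdr.num_desc, hdr.desc_offset);
 *
 *     for (int i = 0; i < hdr.num_desc; i++) {
 *         struct kvm_stats_desc *d = descs + i * desc_sz;
 *         uint64_t vals[d->size];
 *
 *         // d->offset is relative to hdr.data_offset; d->size is the
 *         // number of u64 values backing this statistic.
 *         pread(stats_fd, vals, d->size * sizeof(uint64_t),
 *               hdr.data_offset + d->offset);
 *     }
 *     g_free(descs);
 *
 * Per-vCPU stats use the same layout but come from a separate fd obtained
 * with KVM_GET_STATS_FD on the vcpu fd; query_stats_vcpu() and
 * query_stats_schema_vcpu() fetch it from the vCPU's own thread via
 * run_on_cpu().
 */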