/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <poll.h>

#include <linux/kvm.h>

#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
#include "sysemu/runstate.h"
#include "sysemu/cpus.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "qemu/event_notifier.h"
#include "qemu/main-loop.h"
#include "trace.h"
#include "hw/irq.h"
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "sysemu/reset.h"
#include "qemu/guest-random.h"
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"

#include "hw/boards.h"
#include "monitor/stats.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#ifdef PAGE_SIZE
#undef PAGE_SIZE
#endif
#define PAGE_SIZE qemu_real_host_page_size()

#ifndef KVM_GUESTDBG_BLOCKIRQ
#define KVM_GUESTDBG_BLOCKIRQ 0
#endif

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define KVM_MSI_HASHTAB_SIZE    256

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

enum KVMDirtyRingReaperState {
    KVM_DIRTY_RING_REAPER_NONE = 0,
    /* The reaper is sleeping */
    KVM_DIRTY_RING_REAPER_WAIT,
    /* The reaper is reaping for dirty pages */
    KVM_DIRTY_RING_REAPER_REAPING,
};

/*
 * KVM reaper instance, responsible for collecting the KVM dirty bits
 * via the dirty ring.
 */
struct KVMDirtyRingReaper {
    /* The reaper thread */
    QemuThread reaper_thr;
    volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
    volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
};

struct KVMState
{
    AccelState parent_obj;

    int nr_slots;
    int fd;
    int vmfd;
    int coalesced_mmio;
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
    int max_nested_state_len;
    int many_ioeventfds;
    int intx_set_mask;
    int kvm_shadow_mem;
    bool kernel_irqchip_allowed;
    bool kernel_irqchip_required;
    OnOffAuto kernel_irqchip_split;
    bool sync_mmu;
    uint64_t manual_dirty_log_protect;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
    unsigned irq_set_ioctl;
    unsigned int sigmask_len;
    GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    unsigned long *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
    KVMMemoryListener memory_listener;
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* For "info mtree -f" to tell if an MR is registered in KVM */
    int nr_as;
    struct KVMAs {
        KVMMemoryListener *ml;
        AddressSpace *as;
    } *as;
    uint64_t kvm_dirty_ring_bytes;  /* Size of the per-vcpu dirty ring */
    uint32_t kvm_dirty_ring_size;   /* Number of dirty GFNs per ring */
    struct KVMDirtyRingReaper reaper;
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_eventfds_allowed;
bool kvm_irqfds_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
bool kvm_has_guest_debug;
int kvm_sstep_flags;
static bool kvm_immediate_exit;
static hwaddr kvm_max_slot_size = ~0;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_LAST_INFO
};

static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);

struct KVMResampleFd {
    int gsi;
    EventNotifier *resample_event;
    QLIST_ENTRY(KVMResampleFd) node;
};
typedef struct KVMResampleFd KVMResampleFd;

/*
 * Only used with split irqchip where we need to do the resample fd
 * kick for the kernel from userspace.
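 * Each entry remembers the resample EventNotifier for a GSI, so that
 * kvm_resample_fd_notify() can kick it when userspace handles the EOI
 * of that IRQ.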
201 */ 202 static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list = 203 QLIST_HEAD_INITIALIZER(kvm_resample_fd_list); 204 205 static QemuMutex kml_slots_lock; 206 207 #define kvm_slots_lock() qemu_mutex_lock(&kml_slots_lock) 208 #define kvm_slots_unlock() qemu_mutex_unlock(&kml_slots_lock) 209 210 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem); 211 212 static inline void kvm_resample_fd_remove(int gsi) 213 { 214 KVMResampleFd *rfd; 215 216 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) { 217 if (rfd->gsi == gsi) { 218 QLIST_REMOVE(rfd, node); 219 g_free(rfd); 220 break; 221 } 222 } 223 } 224 225 static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event) 226 { 227 KVMResampleFd *rfd = g_new0(KVMResampleFd, 1); 228 229 rfd->gsi = gsi; 230 rfd->resample_event = event; 231 232 QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node); 233 } 234 235 void kvm_resample_fd_notify(int gsi) 236 { 237 KVMResampleFd *rfd; 238 239 QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) { 240 if (rfd->gsi == gsi) { 241 event_notifier_set(rfd->resample_event); 242 trace_kvm_resample_fd_notify(gsi); 243 return; 244 } 245 } 246 } 247 248 int kvm_get_max_memslots(void) 249 { 250 KVMState *s = KVM_STATE(current_accel()); 251 252 return s->nr_slots; 253 } 254 255 /* Called with KVMMemoryListener.slots_lock held */ 256 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 257 { 258 KVMState *s = kvm_state; 259 int i; 260 261 for (i = 0; i < s->nr_slots; i++) { 262 if (kml->slots[i].memory_size == 0) { 263 return &kml->slots[i]; 264 } 265 } 266 267 return NULL; 268 } 269 270 bool kvm_has_free_slot(MachineState *ms) 271 { 272 KVMState *s = KVM_STATE(ms->accelerator); 273 bool result; 274 KVMMemoryListener *kml = &s->memory_listener; 275 276 kvm_slots_lock(); 277 result = !!kvm_get_free_slot(kml); 278 kvm_slots_unlock(); 279 280 return result; 281 } 282 283 /* Called with KVMMemoryListener.slots_lock held */ 284 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 285 { 286 KVMSlot *slot = kvm_get_free_slot(kml); 287 288 if (slot) { 289 return slot; 290 } 291 292 fprintf(stderr, "%s: no free slot available\n", __func__); 293 abort(); 294 } 295 296 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 297 hwaddr start_addr, 298 hwaddr size) 299 { 300 KVMState *s = kvm_state; 301 int i; 302 303 for (i = 0; i < s->nr_slots; i++) { 304 KVMSlot *mem = &kml->slots[i]; 305 306 if (start_addr == mem->start_addr && size == mem->memory_size) { 307 return mem; 308 } 309 } 310 311 return NULL; 312 } 313 314 /* 315 * Calculate and align the start address and the size of the section. 316 * Return the size. If the size is 0, the aligned section is empty. 317 */ 318 static hwaddr kvm_align_section(MemoryRegionSection *section, 319 hwaddr *start) 320 { 321 hwaddr size = int128_get64(section->size); 322 hwaddr delta, aligned; 323 324 /* kvm works in page size chunks, but the function may be called 325 with sub-page size and unaligned start address. Pad the start 326 address to next and truncate size to previous page boundary. 
*/ 327 aligned = ROUND_UP(section->offset_within_address_space, 328 qemu_real_host_page_size()); 329 delta = aligned - section->offset_within_address_space; 330 *start = aligned; 331 if (delta > size) { 332 return 0; 333 } 334 335 return (size - delta) & qemu_real_host_page_mask(); 336 } 337 338 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 339 hwaddr *phys_addr) 340 { 341 KVMMemoryListener *kml = &s->memory_listener; 342 int i, ret = 0; 343 344 kvm_slots_lock(); 345 for (i = 0; i < s->nr_slots; i++) { 346 KVMSlot *mem = &kml->slots[i]; 347 348 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 349 *phys_addr = mem->start_addr + (ram - mem->ram); 350 ret = 1; 351 break; 352 } 353 } 354 kvm_slots_unlock(); 355 356 return ret; 357 } 358 359 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) 360 { 361 KVMState *s = kvm_state; 362 struct kvm_userspace_memory_region mem; 363 int ret; 364 365 mem.slot = slot->slot | (kml->as_id << 16); 366 mem.guest_phys_addr = slot->start_addr; 367 mem.userspace_addr = (unsigned long)slot->ram; 368 mem.flags = slot->flags; 369 370 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { 371 /* Set the slot size to 0 before setting the slot to the desired 372 * value. This is needed based on KVM commit 75d61fbc. */ 373 mem.memory_size = 0; 374 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 375 if (ret < 0) { 376 goto err; 377 } 378 } 379 mem.memory_size = slot->memory_size; 380 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 381 slot->old_flags = mem.flags; 382 err: 383 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 384 mem.memory_size, mem.userspace_addr, ret); 385 if (ret < 0) { 386 error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d," 387 " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s", 388 __func__, mem.slot, slot->start_addr, 389 (uint64_t)mem.memory_size, strerror(errno)); 390 } 391 return ret; 392 } 393 394 static int do_kvm_destroy_vcpu(CPUState *cpu) 395 { 396 KVMState *s = kvm_state; 397 long mmap_size; 398 struct KVMParkedVcpu *vcpu = NULL; 399 int ret = 0; 400 401 DPRINTF("kvm_destroy_vcpu\n"); 402 403 ret = kvm_arch_destroy_vcpu(cpu); 404 if (ret < 0) { 405 goto err; 406 } 407 408 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 409 if (mmap_size < 0) { 410 ret = mmap_size; 411 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 412 goto err; 413 } 414 415 ret = munmap(cpu->kvm_run, mmap_size); 416 if (ret < 0) { 417 goto err; 418 } 419 420 if (cpu->kvm_dirty_gfns) { 421 ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes); 422 if (ret < 0) { 423 goto err; 424 } 425 } 426 427 vcpu = g_malloc0(sizeof(*vcpu)); 428 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 429 vcpu->kvm_fd = cpu->kvm_fd; 430 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 431 err: 432 return ret; 433 } 434 435 void kvm_destroy_vcpu(CPUState *cpu) 436 { 437 if (do_kvm_destroy_vcpu(cpu) < 0) { 438 error_report("kvm_destroy_vcpu failed"); 439 exit(EXIT_FAILURE); 440 } 441 } 442 443 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 444 { 445 struct KVMParkedVcpu *cpu; 446 447 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 448 if (cpu->vcpu_id == vcpu_id) { 449 int kvm_fd; 450 451 QLIST_REMOVE(cpu, node); 452 kvm_fd = cpu->kvm_fd; 453 g_free(cpu); 454 return kvm_fd; 455 } 456 } 457 458 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 459 } 460 461 int kvm_init_vcpu(CPUState *cpu, Error **errp) 462 { 463 KVMState *s = 
kvm_state; 464 long mmap_size; 465 int ret; 466 467 trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); 468 469 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 470 if (ret < 0) { 471 error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)", 472 kvm_arch_vcpu_id(cpu)); 473 goto err; 474 } 475 476 cpu->kvm_fd = ret; 477 cpu->kvm_state = s; 478 cpu->vcpu_dirty = true; 479 cpu->dirty_pages = 0; 480 481 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 482 if (mmap_size < 0) { 483 ret = mmap_size; 484 error_setg_errno(errp, -mmap_size, 485 "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed"); 486 goto err; 487 } 488 489 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 490 cpu->kvm_fd, 0); 491 if (cpu->kvm_run == MAP_FAILED) { 492 ret = -errno; 493 error_setg_errno(errp, ret, 494 "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)", 495 kvm_arch_vcpu_id(cpu)); 496 goto err; 497 } 498 499 if (s->coalesced_mmio && !s->coalesced_mmio_ring) { 500 s->coalesced_mmio_ring = 501 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; 502 } 503 504 if (s->kvm_dirty_ring_size) { 505 /* Use MAP_SHARED to share pages with the kernel */ 506 cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes, 507 PROT_READ | PROT_WRITE, MAP_SHARED, 508 cpu->kvm_fd, 509 PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET); 510 if (cpu->kvm_dirty_gfns == MAP_FAILED) { 511 ret = -errno; 512 DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret); 513 goto err; 514 } 515 } 516 517 ret = kvm_arch_init_vcpu(cpu); 518 if (ret < 0) { 519 error_setg_errno(errp, -ret, 520 "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)", 521 kvm_arch_vcpu_id(cpu)); 522 } 523 err: 524 return ret; 525 } 526 527 /* 528 * dirty pages logging control 529 */ 530 531 static int kvm_mem_flags(MemoryRegion *mr) 532 { 533 bool readonly = mr->readonly || memory_region_is_romd(mr); 534 int flags = 0; 535 536 if (memory_region_get_dirty_log_mask(mr) != 0) { 537 flags |= KVM_MEM_LOG_DIRTY_PAGES; 538 } 539 if (readonly && kvm_readonly_mem_allowed) { 540 flags |= KVM_MEM_READONLY; 541 } 542 return flags; 543 } 544 545 /* Called with KVMMemoryListener.slots_lock held */ 546 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, 547 MemoryRegion *mr) 548 { 549 mem->flags = kvm_mem_flags(mr); 550 551 /* If nothing changed effectively, no need to issue ioctl */ 552 if (mem->flags == mem->old_flags) { 553 return 0; 554 } 555 556 kvm_slot_init_dirty_bitmap(mem); 557 return kvm_set_user_memory_region(kml, mem, false); 558 } 559 560 static int kvm_section_update_flags(KVMMemoryListener *kml, 561 MemoryRegionSection *section) 562 { 563 hwaddr start_addr, size, slot_size; 564 KVMSlot *mem; 565 int ret = 0; 566 567 size = kvm_align_section(section, &start_addr); 568 if (!size) { 569 return 0; 570 } 571 572 kvm_slots_lock(); 573 574 while (size && !ret) { 575 slot_size = MIN(kvm_max_slot_size, size); 576 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 577 if (!mem) { 578 /* We don't have a slot if we want to trap every access. 
*/ 579 goto out; 580 } 581 582 ret = kvm_slot_update_flags(kml, mem, section->mr); 583 start_addr += slot_size; 584 size -= slot_size; 585 } 586 587 out: 588 kvm_slots_unlock(); 589 return ret; 590 } 591 592 static void kvm_log_start(MemoryListener *listener, 593 MemoryRegionSection *section, 594 int old, int new) 595 { 596 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 597 int r; 598 599 if (old != 0) { 600 return; 601 } 602 603 r = kvm_section_update_flags(kml, section); 604 if (r < 0) { 605 abort(); 606 } 607 } 608 609 static void kvm_log_stop(MemoryListener *listener, 610 MemoryRegionSection *section, 611 int old, int new) 612 { 613 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 614 int r; 615 616 if (new != 0) { 617 return; 618 } 619 620 r = kvm_section_update_flags(kml, section); 621 if (r < 0) { 622 abort(); 623 } 624 } 625 626 /* get kvm's dirty pages bitmap and update qemu's */ 627 static void kvm_slot_sync_dirty_pages(KVMSlot *slot) 628 { 629 ram_addr_t start = slot->ram_start_offset; 630 ram_addr_t pages = slot->memory_size / qemu_real_host_page_size(); 631 632 cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages); 633 } 634 635 static void kvm_slot_reset_dirty_pages(KVMSlot *slot) 636 { 637 memset(slot->dirty_bmap, 0, slot->dirty_bmap_size); 638 } 639 640 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) 641 642 /* Allocate the dirty bitmap for a slot */ 643 static void kvm_slot_init_dirty_bitmap(KVMSlot *mem) 644 { 645 if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) { 646 return; 647 } 648 649 /* 650 * XXX bad kernel interface alert 651 * For dirty bitmap, kernel allocates array of size aligned to 652 * bits-per-long. But for case when the kernel is 64bits and 653 * the userspace is 32bits, userspace can't align to the same 654 * bits-per-long, since sizeof(long) is different between kernel 655 * and user space. This way, userspace will provide buffer which 656 * may be 4 bytes less than the kernel will use, resulting in 657 * userspace memory corruption (which is not detectable by valgrind 658 * too, in most cases). 659 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in 660 * a hope that sizeof(long) won't become >8 any time soon. 661 * 662 * Note: the granule of kvm dirty log is qemu_real_host_page_size. 663 * And mem->memory_size is aligned to it (otherwise this mem can't 664 * be registered to KVM). 665 */ 666 hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size(), 667 /*HOST_LONG_BITS*/ 64) / 8; 668 mem->dirty_bmap = g_malloc0(bitmap_size); 669 mem->dirty_bmap_size = bitmap_size; 670 } 671 672 /* 673 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if 674 * succeeded, false otherwise 675 */ 676 static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot) 677 { 678 struct kvm_dirty_log d = {}; 679 int ret; 680 681 d.dirty_bitmap = slot->dirty_bmap; 682 d.slot = slot->slot | (slot->as_id << 16); 683 ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d); 684 685 if (ret == -ENOENT) { 686 /* kernel does not have dirty bitmap in this slot */ 687 ret = 0; 688 } 689 if (ret) { 690 error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d", 691 __func__, ret); 692 } 693 return ret == 0; 694 } 695 696 /* Should be with all slots_lock held for the address spaces. 
*/ 697 static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id, 698 uint32_t slot_id, uint64_t offset) 699 { 700 KVMMemoryListener *kml; 701 KVMSlot *mem; 702 703 if (as_id >= s->nr_as) { 704 return; 705 } 706 707 kml = s->as[as_id].ml; 708 mem = &kml->slots[slot_id]; 709 710 if (!mem->memory_size || offset >= 711 (mem->memory_size / qemu_real_host_page_size())) { 712 return; 713 } 714 715 set_bit(offset, mem->dirty_bmap); 716 } 717 718 static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn) 719 { 720 return gfn->flags == KVM_DIRTY_GFN_F_DIRTY; 721 } 722 723 static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn) 724 { 725 gfn->flags = KVM_DIRTY_GFN_F_RESET; 726 } 727 728 /* 729 * Should be with all slots_lock held for the address spaces. It returns the 730 * dirty page we've collected on this dirty ring. 731 */ 732 static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu) 733 { 734 struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur; 735 uint32_t ring_size = s->kvm_dirty_ring_size; 736 uint32_t count = 0, fetch = cpu->kvm_fetch_index; 737 738 assert(dirty_gfns && ring_size); 739 trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index); 740 741 while (true) { 742 cur = &dirty_gfns[fetch % ring_size]; 743 if (!dirty_gfn_is_dirtied(cur)) { 744 break; 745 } 746 kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff, 747 cur->offset); 748 dirty_gfn_set_collected(cur); 749 trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset); 750 fetch++; 751 count++; 752 } 753 cpu->kvm_fetch_index = fetch; 754 cpu->dirty_pages += count; 755 756 return count; 757 } 758 759 /* Must be with slots_lock held */ 760 static uint64_t kvm_dirty_ring_reap_locked(KVMState *s) 761 { 762 int ret; 763 CPUState *cpu; 764 uint64_t total = 0; 765 int64_t stamp; 766 767 stamp = get_clock(); 768 769 CPU_FOREACH(cpu) { 770 total += kvm_dirty_ring_reap_one(s, cpu); 771 } 772 773 if (total) { 774 ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS); 775 assert(ret == total); 776 } 777 778 stamp = get_clock() - stamp; 779 780 if (total) { 781 trace_kvm_dirty_ring_reap(total, stamp / 1000); 782 } 783 784 return total; 785 } 786 787 /* 788 * Currently for simplicity, we must hold BQL before calling this. We can 789 * consider to drop the BQL if we're clear with all the race conditions. 790 */ 791 static uint64_t kvm_dirty_ring_reap(KVMState *s) 792 { 793 uint64_t total; 794 795 /* 796 * We need to lock all kvm slots for all address spaces here, 797 * because: 798 * 799 * (1) We need to mark dirty for dirty bitmaps in multiple slots 800 * and for tons of pages, so it's better to take the lock here 801 * once rather than once per page. And more importantly, 802 * 803 * (2) We must _NOT_ publish dirty bits to the other threads 804 * (e.g., the migration thread) via the kvm memory slot dirty 805 * bitmaps before correctly re-protect those dirtied pages. 806 * Otherwise we can have potential risk of data corruption if 807 * the page data is read in the other thread before we do 808 * reset below. 809 */ 810 kvm_slots_lock(); 811 total = kvm_dirty_ring_reap_locked(s); 812 kvm_slots_unlock(); 813 814 return total; 815 } 816 817 static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg) 818 { 819 /* No need to do anything */ 820 } 821 822 /* 823 * Kick all vcpus out in a synchronized way. When returned, we 824 * guarantee that every vcpu has been kicked and at least returned to 825 * userspace once. 
826 */ 827 static void kvm_cpu_synchronize_kick_all(void) 828 { 829 CPUState *cpu; 830 831 CPU_FOREACH(cpu) { 832 run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL); 833 } 834 } 835 836 /* 837 * Flush all the existing dirty pages to the KVM slot buffers. When 838 * this call returns, we guarantee that all the touched dirty pages 839 * before calling this function have been put into the per-kvmslot 840 * dirty bitmap. 841 * 842 * This function must be called with BQL held. 843 */ 844 static void kvm_dirty_ring_flush(void) 845 { 846 trace_kvm_dirty_ring_flush(0); 847 /* 848 * The function needs to be serialized. Since this function 849 * should always be with BQL held, serialization is guaranteed. 850 * However, let's be sure of it. 851 */ 852 assert(qemu_mutex_iothread_locked()); 853 /* 854 * First make sure to flush the hardware buffers by kicking all 855 * vcpus out in a synchronous way. 856 */ 857 kvm_cpu_synchronize_kick_all(); 858 kvm_dirty_ring_reap(kvm_state); 859 trace_kvm_dirty_ring_flush(1); 860 } 861 862 /** 863 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space 864 * 865 * This function will first try to fetch dirty bitmap from the kernel, 866 * and then updates qemu's dirty bitmap. 867 * 868 * NOTE: caller must be with kml->slots_lock held. 869 * 870 * @kml: the KVM memory listener object 871 * @section: the memory section to sync the dirty bitmap with 872 */ 873 static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, 874 MemoryRegionSection *section) 875 { 876 KVMState *s = kvm_state; 877 KVMSlot *mem; 878 hwaddr start_addr, size; 879 hwaddr slot_size; 880 881 size = kvm_align_section(section, &start_addr); 882 while (size) { 883 slot_size = MIN(kvm_max_slot_size, size); 884 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 885 if (!mem) { 886 /* We don't have a slot if we want to trap every access. */ 887 return; 888 } 889 if (kvm_slot_get_dirty_log(s, mem)) { 890 kvm_slot_sync_dirty_pages(mem); 891 } 892 start_addr += slot_size; 893 size -= slot_size; 894 } 895 } 896 897 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */ 898 #define KVM_CLEAR_LOG_SHIFT 6 899 #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size() << KVM_CLEAR_LOG_SHIFT) 900 #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN) 901 902 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start, 903 uint64_t size) 904 { 905 KVMState *s = kvm_state; 906 uint64_t end, bmap_start, start_delta, bmap_npages; 907 struct kvm_clear_dirty_log d; 908 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size(); 909 int ret; 910 911 /* 912 * We need to extend either the start or the size or both to 913 * satisfy the KVM interface requirement. Firstly, do the start 914 * page alignment on 64 host pages 915 */ 916 bmap_start = start & KVM_CLEAR_LOG_MASK; 917 start_delta = start - bmap_start; 918 bmap_start /= psize; 919 920 /* 921 * The kernel interface has restriction on the size too, that either: 922 * 923 * (1) the size is 64 host pages aligned (just like the start), or 924 * (2) the size fills up until the end of the KVM memslot. 925 */ 926 bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN) 927 << KVM_CLEAR_LOG_SHIFT; 928 end = mem->memory_size / psize; 929 if (bmap_npages > end - bmap_start) { 930 bmap_npages = end - bmap_start; 931 } 932 start_delta /= psize; 933 934 /* 935 * Prepare the bitmap to clear dirty bits. 
Here we must guarantee 936 * that we won't clear any unknown dirty bits otherwise we might 937 * accidentally clear some set bits which are not yet synced from 938 * the kernel into QEMU's bitmap, then we'll lose track of the 939 * guest modifications upon those pages (which can directly lead 940 * to guest data loss or panic after migration). 941 * 942 * Layout of the KVMSlot.dirty_bmap: 943 * 944 * |<-------- bmap_npages -----------..>| 945 * [1] 946 * start_delta size 947 * |----------------|-------------|------------------|------------| 948 * ^ ^ ^ ^ 949 * | | | | 950 * start bmap_start (start) end 951 * of memslot of memslot 952 * 953 * [1] bmap_npages can be aligned to either 64 pages or the end of slot 954 */ 955 956 assert(bmap_start % BITS_PER_LONG == 0); 957 /* We should never do log_clear before log_sync */ 958 assert(mem->dirty_bmap); 959 if (start_delta || bmap_npages - size / psize) { 960 /* Slow path - we need to manipulate a temp bitmap */ 961 bmap_clear = bitmap_new(bmap_npages); 962 bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap, 963 bmap_start, start_delta + size / psize); 964 /* 965 * We need to fill the holes at start because that was not 966 * specified by the caller and we extended the bitmap only for 967 * 64 pages alignment 968 */ 969 bitmap_clear(bmap_clear, 0, start_delta); 970 d.dirty_bitmap = bmap_clear; 971 } else { 972 /* 973 * Fast path - both start and size align well with BITS_PER_LONG 974 * (or the end of memory slot) 975 */ 976 d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start); 977 } 978 979 d.first_page = bmap_start; 980 /* It should never overflow. If it happens, say something */ 981 assert(bmap_npages <= UINT32_MAX); 982 d.num_pages = bmap_npages; 983 d.slot = mem->slot | (as_id << 16); 984 985 ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d); 986 if (ret < 0 && ret != -ENOENT) { 987 error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, " 988 "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d", 989 __func__, d.slot, (uint64_t)d.first_page, 990 (uint32_t)d.num_pages, ret); 991 } else { 992 ret = 0; 993 trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages); 994 } 995 996 /* 997 * After we have updated the remote dirty bitmap, we update the 998 * cached bitmap as well for the memslot, then if another user 999 * clears the same region we know we shouldn't clear it again on 1000 * the remote otherwise it's data loss as well. 1001 */ 1002 bitmap_clear(mem->dirty_bmap, bmap_start + start_delta, 1003 size / psize); 1004 /* This handles the NULL case well */ 1005 g_free(bmap_clear); 1006 return ret; 1007 } 1008 1009 1010 /** 1011 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range 1012 * 1013 * NOTE: this will be a no-op if we haven't enabled manual dirty log 1014 * protection in the host kernel because in that case this operation 1015 * will be done within log_sync(). 1016 * 1017 * @kml: the kvm memory listener 1018 * @section: the memory range to clear dirty bitmap 1019 */ 1020 static int kvm_physical_log_clear(KVMMemoryListener *kml, 1021 MemoryRegionSection *section) 1022 { 1023 KVMState *s = kvm_state; 1024 uint64_t start, size, offset, count; 1025 KVMSlot *mem; 1026 int ret = 0, i; 1027 1028 if (!s->manual_dirty_log_protect) { 1029 /* No need to do explicit clear */ 1030 return ret; 1031 } 1032 1033 start = section->offset_within_address_space; 1034 size = int128_get64(section->size); 1035 1036 if (!size) { 1037 /* Nothing more we can do... 
*/ 1038 return ret; 1039 } 1040 1041 kvm_slots_lock(); 1042 1043 for (i = 0; i < s->nr_slots; i++) { 1044 mem = &kml->slots[i]; 1045 /* Discard slots that are empty or do not overlap the section */ 1046 if (!mem->memory_size || 1047 mem->start_addr > start + size - 1 || 1048 start > mem->start_addr + mem->memory_size - 1) { 1049 continue; 1050 } 1051 1052 if (start >= mem->start_addr) { 1053 /* The slot starts before section or is aligned to it. */ 1054 offset = start - mem->start_addr; 1055 count = MIN(mem->memory_size - offset, size); 1056 } else { 1057 /* The slot starts after section. */ 1058 offset = 0; 1059 count = MIN(mem->memory_size, size - (mem->start_addr - start)); 1060 } 1061 ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count); 1062 if (ret < 0) { 1063 break; 1064 } 1065 } 1066 1067 kvm_slots_unlock(); 1068 1069 return ret; 1070 } 1071 1072 static void kvm_coalesce_mmio_region(MemoryListener *listener, 1073 MemoryRegionSection *secion, 1074 hwaddr start, hwaddr size) 1075 { 1076 KVMState *s = kvm_state; 1077 1078 if (s->coalesced_mmio) { 1079 struct kvm_coalesced_mmio_zone zone; 1080 1081 zone.addr = start; 1082 zone.size = size; 1083 zone.pad = 0; 1084 1085 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 1086 } 1087 } 1088 1089 static void kvm_uncoalesce_mmio_region(MemoryListener *listener, 1090 MemoryRegionSection *secion, 1091 hwaddr start, hwaddr size) 1092 { 1093 KVMState *s = kvm_state; 1094 1095 if (s->coalesced_mmio) { 1096 struct kvm_coalesced_mmio_zone zone; 1097 1098 zone.addr = start; 1099 zone.size = size; 1100 zone.pad = 0; 1101 1102 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 1103 } 1104 } 1105 1106 static void kvm_coalesce_pio_add(MemoryListener *listener, 1107 MemoryRegionSection *section, 1108 hwaddr start, hwaddr size) 1109 { 1110 KVMState *s = kvm_state; 1111 1112 if (s->coalesced_pio) { 1113 struct kvm_coalesced_mmio_zone zone; 1114 1115 zone.addr = start; 1116 zone.size = size; 1117 zone.pio = 1; 1118 1119 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 1120 } 1121 } 1122 1123 static void kvm_coalesce_pio_del(MemoryListener *listener, 1124 MemoryRegionSection *section, 1125 hwaddr start, hwaddr size) 1126 { 1127 KVMState *s = kvm_state; 1128 1129 if (s->coalesced_pio) { 1130 struct kvm_coalesced_mmio_zone zone; 1131 1132 zone.addr = start; 1133 zone.size = size; 1134 zone.pio = 1; 1135 1136 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 1137 } 1138 } 1139 1140 static MemoryListener kvm_coalesced_pio_listener = { 1141 .name = "kvm-coalesced-pio", 1142 .coalesced_io_add = kvm_coalesce_pio_add, 1143 .coalesced_io_del = kvm_coalesce_pio_del, 1144 }; 1145 1146 int kvm_check_extension(KVMState *s, unsigned int extension) 1147 { 1148 int ret; 1149 1150 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); 1151 if (ret < 0) { 1152 ret = 0; 1153 } 1154 1155 return ret; 1156 } 1157 1158 int kvm_vm_check_extension(KVMState *s, unsigned int extension) 1159 { 1160 int ret; 1161 1162 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); 1163 if (ret < 0) { 1164 /* VM wide version not implemented, use global one instead */ 1165 ret = kvm_check_extension(s, extension); 1166 } 1167 1168 return ret; 1169 } 1170 1171 typedef struct HWPoisonPage { 1172 ram_addr_t ram_addr; 1173 QLIST_ENTRY(HWPoisonPage) list; 1174 } HWPoisonPage; 1175 1176 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list = 1177 QLIST_HEAD_INITIALIZER(hwpoison_page_list); 1178 1179 static void kvm_unpoison_all(void *param) 1180 { 1181 
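    /* Drop every recorded hw-poisoned page and remap its backing RAM. */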
HWPoisonPage *page, *next_page; 1182 1183 QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) { 1184 QLIST_REMOVE(page, list); 1185 qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE); 1186 g_free(page); 1187 } 1188 } 1189 1190 void kvm_hwpoison_page_add(ram_addr_t ram_addr) 1191 { 1192 HWPoisonPage *page; 1193 1194 QLIST_FOREACH(page, &hwpoison_page_list, list) { 1195 if (page->ram_addr == ram_addr) { 1196 return; 1197 } 1198 } 1199 page = g_new(HWPoisonPage, 1); 1200 page->ram_addr = ram_addr; 1201 QLIST_INSERT_HEAD(&hwpoison_page_list, page, list); 1202 } 1203 1204 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) 1205 { 1206 #if HOST_BIG_ENDIAN != TARGET_BIG_ENDIAN 1207 /* The kernel expects ioeventfd values in HOST_BIG_ENDIAN 1208 * endianness, but the memory core hands them in target endianness. 1209 * For example, PPC is always treated as big-endian even if running 1210 * on KVM and on PPC64LE. Correct here. 1211 */ 1212 switch (size) { 1213 case 2: 1214 val = bswap16(val); 1215 break; 1216 case 4: 1217 val = bswap32(val); 1218 break; 1219 } 1220 #endif 1221 return val; 1222 } 1223 1224 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 1225 bool assign, uint32_t size, bool datamatch) 1226 { 1227 int ret; 1228 struct kvm_ioeventfd iofd = { 1229 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 1230 .addr = addr, 1231 .len = size, 1232 .flags = 0, 1233 .fd = fd, 1234 }; 1235 1236 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size, 1237 datamatch); 1238 if (!kvm_enabled()) { 1239 return -ENOSYS; 1240 } 1241 1242 if (datamatch) { 1243 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 1244 } 1245 if (!assign) { 1246 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 1247 } 1248 1249 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 1250 1251 if (ret < 0) { 1252 return -errno; 1253 } 1254 1255 return 0; 1256 } 1257 1258 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 1259 bool assign, uint32_t size, bool datamatch) 1260 { 1261 struct kvm_ioeventfd kick = { 1262 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 1263 .addr = addr, 1264 .flags = KVM_IOEVENTFD_FLAG_PIO, 1265 .len = size, 1266 .fd = fd, 1267 }; 1268 int r; 1269 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch); 1270 if (!kvm_enabled()) { 1271 return -ENOSYS; 1272 } 1273 if (datamatch) { 1274 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 1275 } 1276 if (!assign) { 1277 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 1278 } 1279 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 1280 if (r < 0) { 1281 return r; 1282 } 1283 return 0; 1284 } 1285 1286 1287 static int kvm_check_many_ioeventfds(void) 1288 { 1289 /* Userspace can use ioeventfd for io notification. This requires a host 1290 * that supports eventfd(2) and an I/O thread; since eventfd does not 1291 * support SIGIO it cannot interrupt the vcpu. 1292 * 1293 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 1294 * can avoid creating too many ioeventfds. 
1295 */ 1296 #if defined(CONFIG_EVENTFD) 1297 int ioeventfds[7]; 1298 int i, ret = 0; 1299 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 1300 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 1301 if (ioeventfds[i] < 0) { 1302 break; 1303 } 1304 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 1305 if (ret < 0) { 1306 close(ioeventfds[i]); 1307 break; 1308 } 1309 } 1310 1311 /* Decide whether many devices are supported or not */ 1312 ret = i == ARRAY_SIZE(ioeventfds); 1313 1314 while (i-- > 0) { 1315 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 1316 close(ioeventfds[i]); 1317 } 1318 return ret; 1319 #else 1320 return 0; 1321 #endif 1322 } 1323 1324 static const KVMCapabilityInfo * 1325 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 1326 { 1327 while (list->name) { 1328 if (!kvm_check_extension(s, list->value)) { 1329 return list; 1330 } 1331 list++; 1332 } 1333 return NULL; 1334 } 1335 1336 void kvm_set_max_memslot_size(hwaddr max_slot_size) 1337 { 1338 g_assert( 1339 ROUND_UP(max_slot_size, qemu_real_host_page_size()) == max_slot_size 1340 ); 1341 kvm_max_slot_size = max_slot_size; 1342 } 1343 1344 static void kvm_set_phys_mem(KVMMemoryListener *kml, 1345 MemoryRegionSection *section, bool add) 1346 { 1347 KVMSlot *mem; 1348 int err; 1349 MemoryRegion *mr = section->mr; 1350 bool writable = !mr->readonly && !mr->rom_device; 1351 hwaddr start_addr, size, slot_size, mr_offset; 1352 ram_addr_t ram_start_offset; 1353 void *ram; 1354 1355 if (!memory_region_is_ram(mr)) { 1356 if (writable || !kvm_readonly_mem_allowed) { 1357 return; 1358 } else if (!mr->romd_mode) { 1359 /* If the memory device is not in romd_mode, then we actually want 1360 * to remove the kvm memory slot so all accesses will trap. */ 1361 add = false; 1362 } 1363 } 1364 1365 size = kvm_align_section(section, &start_addr); 1366 if (!size) { 1367 return; 1368 } 1369 1370 /* The offset of the kvmslot within the memory region */ 1371 mr_offset = section->offset_within_region + start_addr - 1372 section->offset_within_address_space; 1373 1374 /* use aligned delta to align the ram address and offset */ 1375 ram = memory_region_get_ram_ptr(mr) + mr_offset; 1376 ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset; 1377 1378 kvm_slots_lock(); 1379 1380 if (!add) { 1381 do { 1382 slot_size = MIN(kvm_max_slot_size, size); 1383 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 1384 if (!mem) { 1385 goto out; 1386 } 1387 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 1388 /* 1389 * NOTE: We should be aware of the fact that here we're only 1390 * doing a best effort to sync dirty bits. No matter whether 1391 * we're using dirty log or dirty ring, we ignored two facts: 1392 * 1393 * (1) dirty bits can reside in hardware buffers (PML) 1394 * 1395 * (2) after we collected dirty bits here, pages can be dirtied 1396 * again before we do the final KVM_SET_USER_MEMORY_REGION to 1397 * remove the slot. 1398 * 1399 * Not easy. Let's cross the fingers until it's fixed. 
1400 */ 1401 if (kvm_state->kvm_dirty_ring_size) { 1402 kvm_dirty_ring_reap_locked(kvm_state); 1403 } else { 1404 kvm_slot_get_dirty_log(kvm_state, mem); 1405 } 1406 kvm_slot_sync_dirty_pages(mem); 1407 } 1408 1409 /* unregister the slot */ 1410 g_free(mem->dirty_bmap); 1411 mem->dirty_bmap = NULL; 1412 mem->memory_size = 0; 1413 mem->flags = 0; 1414 err = kvm_set_user_memory_region(kml, mem, false); 1415 if (err) { 1416 fprintf(stderr, "%s: error unregistering slot: %s\n", 1417 __func__, strerror(-err)); 1418 abort(); 1419 } 1420 start_addr += slot_size; 1421 size -= slot_size; 1422 } while (size); 1423 goto out; 1424 } 1425 1426 /* register the new slot */ 1427 do { 1428 slot_size = MIN(kvm_max_slot_size, size); 1429 mem = kvm_alloc_slot(kml); 1430 mem->as_id = kml->as_id; 1431 mem->memory_size = slot_size; 1432 mem->start_addr = start_addr; 1433 mem->ram_start_offset = ram_start_offset; 1434 mem->ram = ram; 1435 mem->flags = kvm_mem_flags(mr); 1436 kvm_slot_init_dirty_bitmap(mem); 1437 err = kvm_set_user_memory_region(kml, mem, true); 1438 if (err) { 1439 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 1440 strerror(-err)); 1441 abort(); 1442 } 1443 start_addr += slot_size; 1444 ram_start_offset += slot_size; 1445 ram += slot_size; 1446 size -= slot_size; 1447 } while (size); 1448 1449 out: 1450 kvm_slots_unlock(); 1451 } 1452 1453 static void *kvm_dirty_ring_reaper_thread(void *data) 1454 { 1455 KVMState *s = data; 1456 struct KVMDirtyRingReaper *r = &s->reaper; 1457 1458 rcu_register_thread(); 1459 1460 trace_kvm_dirty_ring_reaper("init"); 1461 1462 while (true) { 1463 r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT; 1464 trace_kvm_dirty_ring_reaper("wait"); 1465 /* 1466 * TODO: provide a smarter timeout rather than a constant? 1467 */ 1468 sleep(1); 1469 1470 trace_kvm_dirty_ring_reaper("wakeup"); 1471 r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING; 1472 1473 qemu_mutex_lock_iothread(); 1474 kvm_dirty_ring_reap(s); 1475 qemu_mutex_unlock_iothread(); 1476 1477 r->reaper_iteration++; 1478 } 1479 1480 trace_kvm_dirty_ring_reaper("exit"); 1481 1482 rcu_unregister_thread(); 1483 1484 return NULL; 1485 } 1486 1487 static int kvm_dirty_ring_reaper_init(KVMState *s) 1488 { 1489 struct KVMDirtyRingReaper *r = &s->reaper; 1490 1491 qemu_thread_create(&r->reaper_thr, "kvm-reaper", 1492 kvm_dirty_ring_reaper_thread, 1493 s, QEMU_THREAD_JOINABLE); 1494 1495 return 0; 1496 } 1497 1498 static void kvm_region_add(MemoryListener *listener, 1499 MemoryRegionSection *section) 1500 { 1501 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1502 1503 memory_region_ref(section->mr); 1504 kvm_set_phys_mem(kml, section, true); 1505 } 1506 1507 static void kvm_region_del(MemoryListener *listener, 1508 MemoryRegionSection *section) 1509 { 1510 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1511 1512 kvm_set_phys_mem(kml, section, false); 1513 memory_region_unref(section->mr); 1514 } 1515 1516 static void kvm_log_sync(MemoryListener *listener, 1517 MemoryRegionSection *section) 1518 { 1519 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1520 1521 kvm_slots_lock(); 1522 kvm_physical_sync_dirty_bitmap(kml, section); 1523 kvm_slots_unlock(); 1524 } 1525 1526 static void kvm_log_sync_global(MemoryListener *l) 1527 { 1528 KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener); 1529 KVMState *s = kvm_state; 1530 KVMSlot *mem; 1531 int i; 1532 1533 /* Flush all kernel dirty addresses into KVMSlot dirty 
bitmap */ 1534 kvm_dirty_ring_flush(); 1535 1536 /* 1537 * TODO: make this faster when nr_slots is big while there are 1538 * only a few used slots (small VMs). 1539 */ 1540 kvm_slots_lock(); 1541 for (i = 0; i < s->nr_slots; i++) { 1542 mem = &kml->slots[i]; 1543 if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 1544 kvm_slot_sync_dirty_pages(mem); 1545 /* 1546 * This is not needed by KVM_GET_DIRTY_LOG because the 1547 * ioctl will unconditionally overwrite the whole region. 1548 * However kvm dirty ring has no such side effect. 1549 */ 1550 kvm_slot_reset_dirty_pages(mem); 1551 } 1552 } 1553 kvm_slots_unlock(); 1554 } 1555 1556 static void kvm_log_clear(MemoryListener *listener, 1557 MemoryRegionSection *section) 1558 { 1559 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1560 int r; 1561 1562 r = kvm_physical_log_clear(kml, section); 1563 if (r < 0) { 1564 error_report_once("%s: kvm log clear failed: mr=%s " 1565 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__, 1566 section->mr->name, section->offset_within_region, 1567 int128_get64(section->size)); 1568 abort(); 1569 } 1570 } 1571 1572 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 1573 MemoryRegionSection *section, 1574 bool match_data, uint64_t data, 1575 EventNotifier *e) 1576 { 1577 int fd = event_notifier_get_fd(e); 1578 int r; 1579 1580 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1581 data, true, int128_get64(section->size), 1582 match_data); 1583 if (r < 0) { 1584 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1585 __func__, strerror(-r), -r); 1586 abort(); 1587 } 1588 } 1589 1590 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 1591 MemoryRegionSection *section, 1592 bool match_data, uint64_t data, 1593 EventNotifier *e) 1594 { 1595 int fd = event_notifier_get_fd(e); 1596 int r; 1597 1598 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1599 data, false, int128_get64(section->size), 1600 match_data); 1601 if (r < 0) { 1602 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1603 __func__, strerror(-r), -r); 1604 abort(); 1605 } 1606 } 1607 1608 static void kvm_io_ioeventfd_add(MemoryListener *listener, 1609 MemoryRegionSection *section, 1610 bool match_data, uint64_t data, 1611 EventNotifier *e) 1612 { 1613 int fd = event_notifier_get_fd(e); 1614 int r; 1615 1616 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1617 data, true, int128_get64(section->size), 1618 match_data); 1619 if (r < 0) { 1620 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1621 __func__, strerror(-r), -r); 1622 abort(); 1623 } 1624 } 1625 1626 static void kvm_io_ioeventfd_del(MemoryListener *listener, 1627 MemoryRegionSection *section, 1628 bool match_data, uint64_t data, 1629 EventNotifier *e) 1630 1631 { 1632 int fd = event_notifier_get_fd(e); 1633 int r; 1634 1635 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1636 data, false, int128_get64(section->size), 1637 match_data); 1638 if (r < 0) { 1639 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1640 __func__, strerror(-r), -r); 1641 abort(); 1642 } 1643 } 1644 1645 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 1646 AddressSpace *as, int as_id, const char *name) 1647 { 1648 int i; 1649 1650 kml->slots = g_new0(KVMSlot, s->nr_slots); 1651 kml->as_id = as_id; 1652 1653 for (i = 0; i < s->nr_slots; i++) { 1654 kml->slots[i].slot = i; 1655 } 1656 1657 kml->listener.region_add = kvm_region_add; 1658 
kml->listener.region_del = kvm_region_del; 1659 kml->listener.log_start = kvm_log_start; 1660 kml->listener.log_stop = kvm_log_stop; 1661 kml->listener.priority = 10; 1662 kml->listener.name = name; 1663 1664 if (s->kvm_dirty_ring_size) { 1665 kml->listener.log_sync_global = kvm_log_sync_global; 1666 } else { 1667 kml->listener.log_sync = kvm_log_sync; 1668 kml->listener.log_clear = kvm_log_clear; 1669 } 1670 1671 memory_listener_register(&kml->listener, as); 1672 1673 for (i = 0; i < s->nr_as; ++i) { 1674 if (!s->as[i].as) { 1675 s->as[i].as = as; 1676 s->as[i].ml = kml; 1677 break; 1678 } 1679 } 1680 } 1681 1682 static MemoryListener kvm_io_listener = { 1683 .name = "kvm-io", 1684 .eventfd_add = kvm_io_ioeventfd_add, 1685 .eventfd_del = kvm_io_ioeventfd_del, 1686 .priority = 10, 1687 }; 1688 1689 int kvm_set_irq(KVMState *s, int irq, int level) 1690 { 1691 struct kvm_irq_level event; 1692 int ret; 1693 1694 assert(kvm_async_interrupts_enabled()); 1695 1696 event.level = level; 1697 event.irq = irq; 1698 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 1699 if (ret < 0) { 1700 perror("kvm_set_irq"); 1701 abort(); 1702 } 1703 1704 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status; 1705 } 1706 1707 #ifdef KVM_CAP_IRQ_ROUTING 1708 typedef struct KVMMSIRoute { 1709 struct kvm_irq_routing_entry kroute; 1710 QTAILQ_ENTRY(KVMMSIRoute) entry; 1711 } KVMMSIRoute; 1712 1713 static void set_gsi(KVMState *s, unsigned int gsi) 1714 { 1715 set_bit(gsi, s->used_gsi_bitmap); 1716 } 1717 1718 static void clear_gsi(KVMState *s, unsigned int gsi) 1719 { 1720 clear_bit(gsi, s->used_gsi_bitmap); 1721 } 1722 1723 void kvm_init_irq_routing(KVMState *s) 1724 { 1725 int gsi_count, i; 1726 1727 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 1728 if (gsi_count > 0) { 1729 /* Round up so we can search ints using ffs */ 1730 s->used_gsi_bitmap = bitmap_new(gsi_count); 1731 s->gsi_count = gsi_count; 1732 } 1733 1734 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 1735 s->nr_allocated_irq_routes = 0; 1736 1737 if (!kvm_direct_msi_allowed) { 1738 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 1739 QTAILQ_INIT(&s->msi_hashtab[i]); 1740 } 1741 } 1742 1743 kvm_arch_init_irq_routing(s); 1744 } 1745 1746 void kvm_irqchip_commit_routes(KVMState *s) 1747 { 1748 int ret; 1749 1750 if (kvm_gsi_direct_mapping()) { 1751 return; 1752 } 1753 1754 if (!kvm_gsi_routing_enabled()) { 1755 return; 1756 } 1757 1758 s->irq_routes->flags = 0; 1759 trace_kvm_irqchip_commit_routes(); 1760 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 1761 assert(ret == 0); 1762 } 1763 1764 static void kvm_add_routing_entry(KVMState *s, 1765 struct kvm_irq_routing_entry *entry) 1766 { 1767 struct kvm_irq_routing_entry *new; 1768 int n, size; 1769 1770 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 1771 n = s->nr_allocated_irq_routes * 2; 1772 if (n < 64) { 1773 n = 64; 1774 } 1775 size = sizeof(struct kvm_irq_routing); 1776 size += n * sizeof(*new); 1777 s->irq_routes = g_realloc(s->irq_routes, size); 1778 s->nr_allocated_irq_routes = n; 1779 } 1780 n = s->irq_routes->nr++; 1781 new = &s->irq_routes->entries[n]; 1782 1783 *new = *entry; 1784 1785 set_gsi(s, entry->gsi); 1786 } 1787 1788 static int kvm_update_routing_entry(KVMState *s, 1789 struct kvm_irq_routing_entry *new_entry) 1790 { 1791 struct kvm_irq_routing_entry *entry; 1792 int n; 1793 1794 for (n = 0; n < s->irq_routes->nr; n++) { 1795 entry = &s->irq_routes->entries[n]; 1796 if (entry->gsi != new_entry->gsi) { 1797 continue; 1798 } 1799 1800 
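        /* The requested route is already installed unchanged. */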
if(!memcmp(entry, new_entry, sizeof *entry)) { 1801 return 0; 1802 } 1803 1804 *entry = *new_entry; 1805 1806 return 0; 1807 } 1808 1809 return -ESRCH; 1810 } 1811 1812 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1813 { 1814 struct kvm_irq_routing_entry e = {}; 1815 1816 assert(pin < s->gsi_count); 1817 1818 e.gsi = irq; 1819 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1820 e.flags = 0; 1821 e.u.irqchip.irqchip = irqchip; 1822 e.u.irqchip.pin = pin; 1823 kvm_add_routing_entry(s, &e); 1824 } 1825 1826 void kvm_irqchip_release_virq(KVMState *s, int virq) 1827 { 1828 struct kvm_irq_routing_entry *e; 1829 int i; 1830 1831 if (kvm_gsi_direct_mapping()) { 1832 return; 1833 } 1834 1835 for (i = 0; i < s->irq_routes->nr; i++) { 1836 e = &s->irq_routes->entries[i]; 1837 if (e->gsi == virq) { 1838 s->irq_routes->nr--; 1839 *e = s->irq_routes->entries[s->irq_routes->nr]; 1840 } 1841 } 1842 clear_gsi(s, virq); 1843 kvm_arch_release_virq_post(virq); 1844 trace_kvm_irqchip_release_virq(virq); 1845 } 1846 1847 void kvm_irqchip_add_change_notifier(Notifier *n) 1848 { 1849 notifier_list_add(&kvm_irqchip_change_notifiers, n); 1850 } 1851 1852 void kvm_irqchip_remove_change_notifier(Notifier *n) 1853 { 1854 notifier_remove(n); 1855 } 1856 1857 void kvm_irqchip_change_notify(void) 1858 { 1859 notifier_list_notify(&kvm_irqchip_change_notifiers, NULL); 1860 } 1861 1862 static unsigned int kvm_hash_msi(uint32_t data) 1863 { 1864 /* This is optimized for IA32 MSI layout. However, no other arch shall 1865 * repeat the mistake of not providing a direct MSI injection API. */ 1866 return data & 0xff; 1867 } 1868 1869 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1870 { 1871 KVMMSIRoute *route, *next; 1872 unsigned int hash; 1873 1874 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1875 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1876 kvm_irqchip_release_virq(s, route->kroute.gsi); 1877 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1878 g_free(route); 1879 } 1880 } 1881 } 1882 1883 static int kvm_irqchip_get_virq(KVMState *s) 1884 { 1885 int next_virq; 1886 1887 /* 1888 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1889 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1890 * number can succeed even though a new route entry cannot be added. 1891 * When this happens, flush dynamic MSI entries to free IRQ route entries. 
1892 */ 1893 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1894 kvm_flush_dynamic_msi_routes(s); 1895 } 1896 1897 /* Return the lowest unused GSI in the bitmap */ 1898 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1899 if (next_virq >= s->gsi_count) { 1900 return -ENOSPC; 1901 } else { 1902 return next_virq; 1903 } 1904 } 1905 1906 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1907 { 1908 unsigned int hash = kvm_hash_msi(msg.data); 1909 KVMMSIRoute *route; 1910 1911 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1912 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1913 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1914 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1915 return route; 1916 } 1917 } 1918 return NULL; 1919 } 1920 1921 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1922 { 1923 struct kvm_msi msi; 1924 KVMMSIRoute *route; 1925 1926 if (kvm_direct_msi_allowed) { 1927 msi.address_lo = (uint32_t)msg.address; 1928 msi.address_hi = msg.address >> 32; 1929 msi.data = le32_to_cpu(msg.data); 1930 msi.flags = 0; 1931 memset(msi.pad, 0, sizeof(msi.pad)); 1932 1933 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1934 } 1935 1936 route = kvm_lookup_msi_route(s, msg); 1937 if (!route) { 1938 int virq; 1939 1940 virq = kvm_irqchip_get_virq(s); 1941 if (virq < 0) { 1942 return virq; 1943 } 1944 1945 route = g_new0(KVMMSIRoute, 1); 1946 route->kroute.gsi = virq; 1947 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1948 route->kroute.flags = 0; 1949 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1950 route->kroute.u.msi.address_hi = msg.address >> 32; 1951 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1952 1953 kvm_add_routing_entry(s, &route->kroute); 1954 kvm_irqchip_commit_routes(s); 1955 1956 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1957 entry); 1958 } 1959 1960 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1961 1962 return kvm_set_irq(s, route->kroute.gsi, 1); 1963 } 1964 1965 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) 1966 { 1967 struct kvm_irq_routing_entry kroute = {}; 1968 int virq; 1969 KVMState *s = c->s; 1970 MSIMessage msg = {0, 0}; 1971 1972 if (pci_available && dev) { 1973 msg = pci_get_msi_message(dev, vector); 1974 } 1975 1976 if (kvm_gsi_direct_mapping()) { 1977 return kvm_arch_msi_data_to_gsi(msg.data); 1978 } 1979 1980 if (!kvm_gsi_routing_enabled()) { 1981 return -ENOSYS; 1982 } 1983 1984 virq = kvm_irqchip_get_virq(s); 1985 if (virq < 0) { 1986 return virq; 1987 } 1988 1989 kroute.gsi = virq; 1990 kroute.type = KVM_IRQ_ROUTING_MSI; 1991 kroute.flags = 0; 1992 kroute.u.msi.address_lo = (uint32_t)msg.address; 1993 kroute.u.msi.address_hi = msg.address >> 32; 1994 kroute.u.msi.data = le32_to_cpu(msg.data); 1995 if (pci_available && kvm_msi_devid_required()) { 1996 kroute.flags = KVM_MSI_VALID_DEVID; 1997 kroute.u.msi.devid = pci_requester_id(dev); 1998 } 1999 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 2000 kvm_irqchip_release_virq(s, virq); 2001 return -EINVAL; 2002 } 2003 2004 trace_kvm_irqchip_add_msi_route(dev ? 
dev->name : (char *)"N/A", 2005 vector, virq); 2006 2007 kvm_add_routing_entry(s, &kroute); 2008 kvm_arch_add_msi_route_post(&kroute, vector, dev); 2009 c->changes++; 2010 2011 return virq; 2012 } 2013 2014 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 2015 PCIDevice *dev) 2016 { 2017 struct kvm_irq_routing_entry kroute = {}; 2018 2019 if (kvm_gsi_direct_mapping()) { 2020 return 0; 2021 } 2022 2023 if (!kvm_irqchip_in_kernel()) { 2024 return -ENOSYS; 2025 } 2026 2027 kroute.gsi = virq; 2028 kroute.type = KVM_IRQ_ROUTING_MSI; 2029 kroute.flags = 0; 2030 kroute.u.msi.address_lo = (uint32_t)msg.address; 2031 kroute.u.msi.address_hi = msg.address >> 32; 2032 kroute.u.msi.data = le32_to_cpu(msg.data); 2033 if (pci_available && kvm_msi_devid_required()) { 2034 kroute.flags = KVM_MSI_VALID_DEVID; 2035 kroute.u.msi.devid = pci_requester_id(dev); 2036 } 2037 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 2038 return -EINVAL; 2039 } 2040 2041 trace_kvm_irqchip_update_msi_route(virq); 2042 2043 return kvm_update_routing_entry(s, &kroute); 2044 } 2045 2046 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, 2047 EventNotifier *resample, int virq, 2048 bool assign) 2049 { 2050 int fd = event_notifier_get_fd(event); 2051 int rfd = resample ? event_notifier_get_fd(resample) : -1; 2052 2053 struct kvm_irqfd irqfd = { 2054 .fd = fd, 2055 .gsi = virq, 2056 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN, 2057 }; 2058 2059 if (rfd != -1) { 2060 assert(assign); 2061 if (kvm_irqchip_is_split()) { 2062 /* 2063 * When the slow irqchip (e.g. IOAPIC) is in the 2064 * userspace, KVM kernel resamplefd will not work because 2065 * the EOI of the interrupt will be delivered to userspace 2066 * instead, so the KVM kernel resamplefd kick will be 2067 * skipped. The userspace here mimics what the kernel 2068 * provides with resamplefd, remember the resamplefd and 2069 * kick it when we receive EOI of this IRQ. 2070 * 2071 * This is hackery because IOAPIC is mostly bypassed 2072 * (except EOI broadcasts) when irqfd is used. However 2073 * this can bring much performance back for split irqchip 2074 * with INTx IRQs (for VFIO, this gives 93% perf of the 2075 * full fast path, which is 46% perf boost comparing to 2076 * the INTx slow path). 
2077 */ 2078 kvm_resample_fd_insert(virq, resample); 2079 } else { 2080 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 2081 irqfd.resamplefd = rfd; 2082 } 2083 } else if (!assign) { 2084 if (kvm_irqchip_is_split()) { 2085 kvm_resample_fd_remove(virq); 2086 } 2087 } 2088 2089 if (!kvm_irqfds_enabled()) { 2090 return -ENOSYS; 2091 } 2092 2093 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 2094 } 2095 2096 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 2097 { 2098 struct kvm_irq_routing_entry kroute = {}; 2099 int virq; 2100 2101 if (!kvm_gsi_routing_enabled()) { 2102 return -ENOSYS; 2103 } 2104 2105 virq = kvm_irqchip_get_virq(s); 2106 if (virq < 0) { 2107 return virq; 2108 } 2109 2110 kroute.gsi = virq; 2111 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 2112 kroute.flags = 0; 2113 kroute.u.adapter.summary_addr = adapter->summary_addr; 2114 kroute.u.adapter.ind_addr = adapter->ind_addr; 2115 kroute.u.adapter.summary_offset = adapter->summary_offset; 2116 kroute.u.adapter.ind_offset = adapter->ind_offset; 2117 kroute.u.adapter.adapter_id = adapter->adapter_id; 2118 2119 kvm_add_routing_entry(s, &kroute); 2120 2121 return virq; 2122 } 2123 2124 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 2125 { 2126 struct kvm_irq_routing_entry kroute = {}; 2127 int virq; 2128 2129 if (!kvm_gsi_routing_enabled()) { 2130 return -ENOSYS; 2131 } 2132 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 2133 return -ENOSYS; 2134 } 2135 virq = kvm_irqchip_get_virq(s); 2136 if (virq < 0) { 2137 return virq; 2138 } 2139 2140 kroute.gsi = virq; 2141 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 2142 kroute.flags = 0; 2143 kroute.u.hv_sint.vcpu = vcpu; 2144 kroute.u.hv_sint.sint = sint; 2145 2146 kvm_add_routing_entry(s, &kroute); 2147 kvm_irqchip_commit_routes(s); 2148 2149 return virq; 2150 } 2151 2152 #else /* !KVM_CAP_IRQ_ROUTING */ 2153 2154 void kvm_init_irq_routing(KVMState *s) 2155 { 2156 } 2157 2158 void kvm_irqchip_release_virq(KVMState *s, int virq) 2159 { 2160 } 2161 2162 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 2163 { 2164 abort(); 2165 } 2166 2167 int kvm_irqchip_add_msi_route(KVMRouteChange *c, int vector, PCIDevice *dev) 2168 { 2169 return -ENOSYS; 2170 } 2171 2172 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 2173 { 2174 return -ENOSYS; 2175 } 2176 2177 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 2178 { 2179 return -ENOSYS; 2180 } 2181 2182 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, 2183 EventNotifier *resample, int virq, 2184 bool assign) 2185 { 2186 abort(); 2187 } 2188 2189 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 2190 { 2191 return -ENOSYS; 2192 } 2193 #endif /* !KVM_CAP_IRQ_ROUTING */ 2194 2195 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 2196 EventNotifier *rn, int virq) 2197 { 2198 return kvm_irqchip_assign_irqfd(s, n, rn, virq, true); 2199 } 2200 2201 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 2202 int virq) 2203 { 2204 return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false); 2205 } 2206 2207 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 2208 EventNotifier *rn, qemu_irq irq) 2209 { 2210 gpointer key, gsi; 2211 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 2212 2213 if (!found) { 2214 return -ENXIO; 2215 } 2216 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 2217 } 2218 2219 int 
kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
                                      qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
}

void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
{
    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}

static void kvm_irqchip_create(KVMState *s)
{
    int ret;

    assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        ;
    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
        if (ret < 0) {
            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
            exit(1);
        }
    } else {
        return;
    }

    /* First probe and see if there's an arch-specific hook to create the
     * in-kernel irqchip for us */
    ret = kvm_arch_irqchip_create(s);
    if (ret == 0) {
        if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
            perror("Split IRQ chip mode not supported.");
            exit(1);
        } else {
            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
        }
    }
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
        exit(1);
    }

    kvm_kernel_irqchip = true;
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;
    kvm_halt_in_kernel_allowed = true;

    kvm_init_irq_routing(s);

    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
}

/* Find number of supported CPUs using the recommended
 * procedure from the kernel API documentation to cope with
 * older kernels that may be missing capabilities.
 */
static int kvm_recommended_vcpus(KVMState *s)
{
    int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
    return (ret) ? ret : 4;
}

static int kvm_max_vcpus(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
    return (ret) ? ret : kvm_recommended_vcpus(s);
}

static int kvm_max_vcpu_id(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
    return (ret) ? ret : kvm_max_vcpus(s);
}

bool kvm_vcpu_id_is_valid(int vcpu_id)
{
    KVMState *s = KVM_STATE(current_accel());
    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
}

bool kvm_dirty_ring_enabled(void)
{
    return kvm_state->kvm_dirty_ring_size ?
true : false; 2312 } 2313 2314 static void query_stats_cb(StatsResultList **result, StatsTarget target, 2315 strList *names, strList *targets, Error **errp); 2316 static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp); 2317 2318 static int kvm_init(MachineState *ms) 2319 { 2320 MachineClass *mc = MACHINE_GET_CLASS(ms); 2321 static const char upgrade_note[] = 2322 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 2323 "(see http://sourceforge.net/projects/kvm).\n"; 2324 struct { 2325 const char *name; 2326 int num; 2327 } num_cpus[] = { 2328 { "SMP", ms->smp.cpus }, 2329 { "hotpluggable", ms->smp.max_cpus }, 2330 { NULL, } 2331 }, *nc = num_cpus; 2332 int soft_vcpus_limit, hard_vcpus_limit; 2333 KVMState *s; 2334 const KVMCapabilityInfo *missing_cap; 2335 int ret; 2336 int type = 0; 2337 uint64_t dirty_log_manual_caps; 2338 2339 qemu_mutex_init(&kml_slots_lock); 2340 2341 s = KVM_STATE(ms->accelerator); 2342 2343 /* 2344 * On systems where the kernel can support different base page 2345 * sizes, host page size may be different from TARGET_PAGE_SIZE, 2346 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 2347 * page size for the system though. 2348 */ 2349 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size()); 2350 2351 s->sigmask_len = 8; 2352 2353 #ifdef KVM_CAP_SET_GUEST_DEBUG 2354 QTAILQ_INIT(&s->kvm_sw_breakpoints); 2355 #endif 2356 QLIST_INIT(&s->kvm_parked_vcpus); 2357 s->fd = qemu_open_old("/dev/kvm", O_RDWR); 2358 if (s->fd == -1) { 2359 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 2360 ret = -errno; 2361 goto err; 2362 } 2363 2364 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 2365 if (ret < KVM_API_VERSION) { 2366 if (ret >= 0) { 2367 ret = -EINVAL; 2368 } 2369 fprintf(stderr, "kvm version too old\n"); 2370 goto err; 2371 } 2372 2373 if (ret > KVM_API_VERSION) { 2374 ret = -EINVAL; 2375 fprintf(stderr, "kvm version not supported\n"); 2376 goto err; 2377 } 2378 2379 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 2380 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 2381 2382 /* If unspecified, use the default value */ 2383 if (!s->nr_slots) { 2384 s->nr_slots = 32; 2385 } 2386 2387 s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); 2388 if (s->nr_as <= 1) { 2389 s->nr_as = 1; 2390 } 2391 s->as = g_new0(struct KVMAs, s->nr_as); 2392 2393 if (object_property_find(OBJECT(current_machine), "kvm-type")) { 2394 g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), 2395 "kvm-type", 2396 &error_abort); 2397 type = mc->kvm_type(ms, kvm_type); 2398 } else if (mc->kvm_type) { 2399 type = mc->kvm_type(ms, NULL); 2400 } 2401 2402 do { 2403 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 2404 } while (ret == -EINTR); 2405 2406 if (ret < 0) { 2407 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 2408 strerror(-ret)); 2409 2410 #ifdef TARGET_S390X 2411 if (ret == -EINVAL) { 2412 fprintf(stderr, 2413 "Host kernel setup problem detected. Please verify:\n"); 2414 fprintf(stderr, "- for kernels supporting the switch_amode or" 2415 " user_mode parameters, whether\n"); 2416 fprintf(stderr, 2417 " user space is running in primary address space\n"); 2418 fprintf(stderr, 2419 "- for kernels supporting the vm.allocate_pgste sysctl, " 2420 "whether it is enabled\n"); 2421 } 2422 #elif defined(TARGET_PPC) 2423 if (ret == -EINVAL) { 2424 fprintf(stderr, 2425 "PPC KVM module is not loaded. Try modprobe kvm_%s.\n", 2426 (type == 2) ? 
"pr" : "hv"); 2427 } 2428 #endif 2429 goto err; 2430 } 2431 2432 s->vmfd = ret; 2433 2434 /* check the vcpu limits */ 2435 soft_vcpus_limit = kvm_recommended_vcpus(s); 2436 hard_vcpus_limit = kvm_max_vcpus(s); 2437 2438 while (nc->name) { 2439 if (nc->num > soft_vcpus_limit) { 2440 warn_report("Number of %s cpus requested (%d) exceeds " 2441 "the recommended cpus supported by KVM (%d)", 2442 nc->name, nc->num, soft_vcpus_limit); 2443 2444 if (nc->num > hard_vcpus_limit) { 2445 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 2446 "the maximum cpus supported by KVM (%d)\n", 2447 nc->name, nc->num, hard_vcpus_limit); 2448 exit(1); 2449 } 2450 } 2451 nc++; 2452 } 2453 2454 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 2455 if (!missing_cap) { 2456 missing_cap = 2457 kvm_check_extension_list(s, kvm_arch_required_capabilities); 2458 } 2459 if (missing_cap) { 2460 ret = -EINVAL; 2461 fprintf(stderr, "kvm does not support %s\n%s", 2462 missing_cap->name, upgrade_note); 2463 goto err; 2464 } 2465 2466 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 2467 s->coalesced_pio = s->coalesced_mmio && 2468 kvm_check_extension(s, KVM_CAP_COALESCED_PIO); 2469 2470 /* 2471 * Enable KVM dirty ring if supported, otherwise fall back to 2472 * dirty logging mode 2473 */ 2474 if (s->kvm_dirty_ring_size > 0) { 2475 uint64_t ring_bytes; 2476 2477 ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn); 2478 2479 /* Read the max supported pages */ 2480 ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING); 2481 if (ret > 0) { 2482 if (ring_bytes > ret) { 2483 error_report("KVM dirty ring size %" PRIu32 " too big " 2484 "(maximum is %ld). Please use a smaller value.", 2485 s->kvm_dirty_ring_size, 2486 (long)ret / sizeof(struct kvm_dirty_gfn)); 2487 ret = -EINVAL; 2488 goto err; 2489 } 2490 2491 ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes); 2492 if (ret) { 2493 error_report("Enabling of KVM dirty ring failed: %s. " 2494 "Suggested minimum value is 1024.", strerror(-ret)); 2495 goto err; 2496 } 2497 2498 s->kvm_dirty_ring_bytes = ring_bytes; 2499 } else { 2500 warn_report("KVM dirty ring not available, using bitmap method"); 2501 s->kvm_dirty_ring_size = 0; 2502 } 2503 } 2504 2505 /* 2506 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is 2507 * enabled. More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no 2508 * page is wr-protected initially, which is against how kvm dirty ring is 2509 * usage - kvm dirty ring requires all pages are wr-protected at the very 2510 * beginning. Enabling this feature for dirty ring causes data corruption. 2511 * 2512 * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log, 2513 * we may expect a higher stall time when starting the migration. In the 2514 * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too: 2515 * instead of clearing dirty bit, it can be a way to explicitly wr-protect 2516 * guest pages. 
2517 */ 2518 if (!s->kvm_dirty_ring_size) { 2519 dirty_log_manual_caps = 2520 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); 2521 dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | 2522 KVM_DIRTY_LOG_INITIALLY_SET); 2523 s->manual_dirty_log_protect = dirty_log_manual_caps; 2524 if (dirty_log_manual_caps) { 2525 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 2526 dirty_log_manual_caps); 2527 if (ret) { 2528 warn_report("Trying to enable capability %"PRIu64" of " 2529 "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. " 2530 "Falling back to the legacy mode. ", 2531 dirty_log_manual_caps); 2532 s->manual_dirty_log_protect = 0; 2533 } 2534 } 2535 } 2536 2537 #ifdef KVM_CAP_VCPU_EVENTS 2538 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 2539 #endif 2540 2541 s->robust_singlestep = 2542 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 2543 2544 #ifdef KVM_CAP_DEBUGREGS 2545 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 2546 #endif 2547 2548 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); 2549 2550 #ifdef KVM_CAP_IRQ_ROUTING 2551 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 2552 #endif 2553 2554 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 2555 2556 s->irq_set_ioctl = KVM_IRQ_LINE; 2557 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 2558 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 2559 } 2560 2561 kvm_readonly_mem_allowed = 2562 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 2563 2564 kvm_eventfds_allowed = 2565 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 2566 2567 kvm_irqfds_allowed = 2568 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 2569 2570 kvm_resamplefds_allowed = 2571 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 2572 2573 kvm_vm_attributes_allowed = 2574 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 2575 2576 kvm_ioeventfd_any_length_allowed = 2577 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 2578 2579 #ifdef KVM_CAP_SET_GUEST_DEBUG 2580 kvm_has_guest_debug = 2581 (kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG) > 0); 2582 #endif 2583 2584 kvm_sstep_flags = 0; 2585 if (kvm_has_guest_debug) { 2586 kvm_sstep_flags = SSTEP_ENABLE; 2587 2588 #if defined KVM_CAP_SET_GUEST_DEBUG2 2589 int guest_debug_flags = 2590 kvm_check_extension(s, KVM_CAP_SET_GUEST_DEBUG2); 2591 2592 if (guest_debug_flags & KVM_GUESTDBG_BLOCKIRQ) { 2593 kvm_sstep_flags |= SSTEP_NOIRQ; 2594 } 2595 #endif 2596 } 2597 2598 kvm_state = s; 2599 2600 ret = kvm_arch_init(ms, s); 2601 if (ret < 0) { 2602 goto err; 2603 } 2604 2605 if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) { 2606 s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? 
ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; 2607 } 2608 2609 qemu_register_reset(kvm_unpoison_all, NULL); 2610 2611 if (s->kernel_irqchip_allowed) { 2612 kvm_irqchip_create(s); 2613 } 2614 2615 if (kvm_eventfds_allowed) { 2616 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 2617 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 2618 } 2619 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; 2620 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; 2621 2622 kvm_memory_listener_register(s, &s->memory_listener, 2623 &address_space_memory, 0, "kvm-memory"); 2624 if (kvm_eventfds_allowed) { 2625 memory_listener_register(&kvm_io_listener, 2626 &address_space_io); 2627 } 2628 memory_listener_register(&kvm_coalesced_pio_listener, 2629 &address_space_io); 2630 2631 s->many_ioeventfds = kvm_check_many_ioeventfds(); 2632 2633 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 2634 if (!s->sync_mmu) { 2635 ret = ram_block_discard_disable(true); 2636 assert(!ret); 2637 } 2638 2639 if (s->kvm_dirty_ring_size) { 2640 ret = kvm_dirty_ring_reaper_init(s); 2641 if (ret) { 2642 goto err; 2643 } 2644 } 2645 2646 if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) { 2647 add_stats_callbacks(STATS_PROVIDER_KVM, query_stats_cb, 2648 query_stats_schemas_cb); 2649 } 2650 2651 return 0; 2652 2653 err: 2654 assert(ret < 0); 2655 if (s->vmfd >= 0) { 2656 close(s->vmfd); 2657 } 2658 if (s->fd != -1) { 2659 close(s->fd); 2660 } 2661 g_free(s->memory_listener.slots); 2662 2663 return ret; 2664 } 2665 2666 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 2667 { 2668 s->sigmask_len = sigmask_len; 2669 } 2670 2671 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 2672 int size, uint32_t count) 2673 { 2674 int i; 2675 uint8_t *ptr = data; 2676 2677 for (i = 0; i < count; i++) { 2678 address_space_rw(&address_space_io, port, attrs, 2679 ptr, size, 2680 direction == KVM_EXIT_IO_OUT); 2681 ptr += size; 2682 } 2683 } 2684 2685 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 2686 { 2687 fprintf(stderr, "KVM internal error. Suberror: %d\n", 2688 run->internal.suberror); 2689 2690 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 2691 int i; 2692 2693 for (i = 0; i < run->internal.ndata; ++i) { 2694 fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n", 2695 i, (uint64_t)run->internal.data[i]); 2696 } 2697 } 2698 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 2699 fprintf(stderr, "emulation failure\n"); 2700 if (!kvm_arch_stop_on_emulation_error(cpu)) { 2701 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2702 return EXCP_INTERRUPT; 2703 } 2704 } 2705 /* FIXME: Should trigger a qmp message to let management know 2706 * something went wrong. 
2707 */ 2708 return -1; 2709 } 2710 2711 void kvm_flush_coalesced_mmio_buffer(void) 2712 { 2713 KVMState *s = kvm_state; 2714 2715 if (s->coalesced_flush_in_progress) { 2716 return; 2717 } 2718 2719 s->coalesced_flush_in_progress = true; 2720 2721 if (s->coalesced_mmio_ring) { 2722 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 2723 while (ring->first != ring->last) { 2724 struct kvm_coalesced_mmio *ent; 2725 2726 ent = &ring->coalesced_mmio[ring->first]; 2727 2728 if (ent->pio == 1) { 2729 address_space_write(&address_space_io, ent->phys_addr, 2730 MEMTXATTRS_UNSPECIFIED, ent->data, 2731 ent->len); 2732 } else { 2733 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 2734 } 2735 smp_wmb(); 2736 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 2737 } 2738 } 2739 2740 s->coalesced_flush_in_progress = false; 2741 } 2742 2743 bool kvm_cpu_check_are_resettable(void) 2744 { 2745 return kvm_arch_cpu_check_are_resettable(); 2746 } 2747 2748 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2749 { 2750 if (!cpu->vcpu_dirty) { 2751 kvm_arch_get_registers(cpu); 2752 cpu->vcpu_dirty = true; 2753 } 2754 } 2755 2756 void kvm_cpu_synchronize_state(CPUState *cpu) 2757 { 2758 if (!cpu->vcpu_dirty) { 2759 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 2760 } 2761 } 2762 2763 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 2764 { 2765 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 2766 cpu->vcpu_dirty = false; 2767 } 2768 2769 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 2770 { 2771 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2772 } 2773 2774 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 2775 { 2776 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 2777 cpu->vcpu_dirty = false; 2778 } 2779 2780 void kvm_cpu_synchronize_post_init(CPUState *cpu) 2781 { 2782 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2783 } 2784 2785 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 2786 { 2787 cpu->vcpu_dirty = true; 2788 } 2789 2790 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 2791 { 2792 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2793 } 2794 2795 #ifdef KVM_HAVE_MCE_INJECTION 2796 static __thread void *pending_sigbus_addr; 2797 static __thread int pending_sigbus_code; 2798 static __thread bool have_sigbus_pending; 2799 #endif 2800 2801 static void kvm_cpu_kick(CPUState *cpu) 2802 { 2803 qatomic_set(&cpu->kvm_run->immediate_exit, 1); 2804 } 2805 2806 static void kvm_cpu_kick_self(void) 2807 { 2808 if (kvm_immediate_exit) { 2809 kvm_cpu_kick(current_cpu); 2810 } else { 2811 qemu_cpu_kick_self(); 2812 } 2813 } 2814 2815 static void kvm_eat_signals(CPUState *cpu) 2816 { 2817 struct timespec ts = { 0, 0 }; 2818 siginfo_t siginfo; 2819 sigset_t waitset; 2820 sigset_t chkset; 2821 int r; 2822 2823 if (kvm_immediate_exit) { 2824 qatomic_set(&cpu->kvm_run->immediate_exit, 0); 2825 /* Write kvm_run->immediate_exit before the cpu->exit_request 2826 * write in kvm_cpu_exec. 
2827 */ 2828 smp_wmb(); 2829 return; 2830 } 2831 2832 sigemptyset(&waitset); 2833 sigaddset(&waitset, SIG_IPI); 2834 2835 do { 2836 r = sigtimedwait(&waitset, &siginfo, &ts); 2837 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 2838 perror("sigtimedwait"); 2839 exit(1); 2840 } 2841 2842 r = sigpending(&chkset); 2843 if (r == -1) { 2844 perror("sigpending"); 2845 exit(1); 2846 } 2847 } while (sigismember(&chkset, SIG_IPI)); 2848 } 2849 2850 int kvm_cpu_exec(CPUState *cpu) 2851 { 2852 struct kvm_run *run = cpu->kvm_run; 2853 int ret, run_ret; 2854 2855 DPRINTF("kvm_cpu_exec()\n"); 2856 2857 if (kvm_arch_process_async_events(cpu)) { 2858 qatomic_set(&cpu->exit_request, 0); 2859 return EXCP_HLT; 2860 } 2861 2862 qemu_mutex_unlock_iothread(); 2863 cpu_exec_start(cpu); 2864 2865 do { 2866 MemTxAttrs attrs; 2867 2868 if (cpu->vcpu_dirty) { 2869 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 2870 cpu->vcpu_dirty = false; 2871 } 2872 2873 kvm_arch_pre_run(cpu, run); 2874 if (qatomic_read(&cpu->exit_request)) { 2875 DPRINTF("interrupt exit requested\n"); 2876 /* 2877 * KVM requires us to reenter the kernel after IO exits to complete 2878 * instruction emulation. This self-signal will ensure that we 2879 * leave ASAP again. 2880 */ 2881 kvm_cpu_kick_self(); 2882 } 2883 2884 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 2885 * Matching barrier in kvm_eat_signals. 2886 */ 2887 smp_rmb(); 2888 2889 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 2890 2891 attrs = kvm_arch_post_run(cpu, run); 2892 2893 #ifdef KVM_HAVE_MCE_INJECTION 2894 if (unlikely(have_sigbus_pending)) { 2895 qemu_mutex_lock_iothread(); 2896 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 2897 pending_sigbus_addr); 2898 have_sigbus_pending = false; 2899 qemu_mutex_unlock_iothread(); 2900 } 2901 #endif 2902 2903 if (run_ret < 0) { 2904 if (run_ret == -EINTR || run_ret == -EAGAIN) { 2905 DPRINTF("io window exit\n"); 2906 kvm_eat_signals(cpu); 2907 ret = EXCP_INTERRUPT; 2908 break; 2909 } 2910 fprintf(stderr, "error: kvm run failed %s\n", 2911 strerror(-run_ret)); 2912 #ifdef TARGET_PPC 2913 if (run_ret == -EBUSY) { 2914 fprintf(stderr, 2915 "This is probably because your SMT is enabled.\n" 2916 "VCPU can only run on primary threads with all " 2917 "secondary threads offline.\n"); 2918 } 2919 #endif 2920 ret = -1; 2921 break; 2922 } 2923 2924 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 2925 switch (run->exit_reason) { 2926 case KVM_EXIT_IO: 2927 DPRINTF("handle_io\n"); 2928 /* Called outside BQL */ 2929 kvm_handle_io(run->io.port, attrs, 2930 (uint8_t *)run + run->io.data_offset, 2931 run->io.direction, 2932 run->io.size, 2933 run->io.count); 2934 ret = 0; 2935 break; 2936 case KVM_EXIT_MMIO: 2937 DPRINTF("handle_mmio\n"); 2938 /* Called outside BQL */ 2939 address_space_rw(&address_space_memory, 2940 run->mmio.phys_addr, attrs, 2941 run->mmio.data, 2942 run->mmio.len, 2943 run->mmio.is_write); 2944 ret = 0; 2945 break; 2946 case KVM_EXIT_IRQ_WINDOW_OPEN: 2947 DPRINTF("irq_window_open\n"); 2948 ret = EXCP_INTERRUPT; 2949 break; 2950 case KVM_EXIT_SHUTDOWN: 2951 DPRINTF("shutdown\n"); 2952 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2953 ret = EXCP_INTERRUPT; 2954 break; 2955 case KVM_EXIT_UNKNOWN: 2956 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 2957 (uint64_t)run->hw.hardware_exit_reason); 2958 ret = -1; 2959 break; 2960 case KVM_EXIT_INTERNAL_ERROR: 2961 ret = kvm_handle_internal_error(cpu, run); 2962 break; 2963 case KVM_EXIT_DIRTY_RING_FULL: 2964 /* 2965 * We 
shouldn't continue if the dirty ring of this vcpu is 2966 * still full. Got kicked by KVM_RESET_DIRTY_RINGS. 2967 */ 2968 trace_kvm_dirty_ring_full(cpu->cpu_index); 2969 qemu_mutex_lock_iothread(); 2970 kvm_dirty_ring_reap(kvm_state); 2971 qemu_mutex_unlock_iothread(); 2972 ret = 0; 2973 break; 2974 case KVM_EXIT_SYSTEM_EVENT: 2975 switch (run->system_event.type) { 2976 case KVM_SYSTEM_EVENT_SHUTDOWN: 2977 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 2978 ret = EXCP_INTERRUPT; 2979 break; 2980 case KVM_SYSTEM_EVENT_RESET: 2981 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2982 ret = EXCP_INTERRUPT; 2983 break; 2984 case KVM_SYSTEM_EVENT_CRASH: 2985 kvm_cpu_synchronize_state(cpu); 2986 qemu_mutex_lock_iothread(); 2987 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2988 qemu_mutex_unlock_iothread(); 2989 ret = 0; 2990 break; 2991 default: 2992 DPRINTF("kvm_arch_handle_exit\n"); 2993 ret = kvm_arch_handle_exit(cpu, run); 2994 break; 2995 } 2996 break; 2997 default: 2998 DPRINTF("kvm_arch_handle_exit\n"); 2999 ret = kvm_arch_handle_exit(cpu, run); 3000 break; 3001 } 3002 } while (ret == 0); 3003 3004 cpu_exec_end(cpu); 3005 qemu_mutex_lock_iothread(); 3006 3007 if (ret < 0) { 3008 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 3009 vm_stop(RUN_STATE_INTERNAL_ERROR); 3010 } 3011 3012 qatomic_set(&cpu->exit_request, 0); 3013 return ret; 3014 } 3015 3016 int kvm_ioctl(KVMState *s, int type, ...) 3017 { 3018 int ret; 3019 void *arg; 3020 va_list ap; 3021 3022 va_start(ap, type); 3023 arg = va_arg(ap, void *); 3024 va_end(ap); 3025 3026 trace_kvm_ioctl(type, arg); 3027 ret = ioctl(s->fd, type, arg); 3028 if (ret == -1) { 3029 ret = -errno; 3030 } 3031 return ret; 3032 } 3033 3034 int kvm_vm_ioctl(KVMState *s, int type, ...) 3035 { 3036 int ret; 3037 void *arg; 3038 va_list ap; 3039 3040 va_start(ap, type); 3041 arg = va_arg(ap, void *); 3042 va_end(ap); 3043 3044 trace_kvm_vm_ioctl(type, arg); 3045 ret = ioctl(s->vmfd, type, arg); 3046 if (ret == -1) { 3047 ret = -errno; 3048 } 3049 return ret; 3050 } 3051 3052 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 3053 { 3054 int ret; 3055 void *arg; 3056 va_list ap; 3057 3058 va_start(ap, type); 3059 arg = va_arg(ap, void *); 3060 va_end(ap); 3061 3062 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 3063 ret = ioctl(cpu->kvm_fd, type, arg); 3064 if (ret == -1) { 3065 ret = -errno; 3066 } 3067 return ret; 3068 } 3069 3070 int kvm_device_ioctl(int fd, int type, ...) 3071 { 3072 int ret; 3073 void *arg; 3074 va_list ap; 3075 3076 va_start(ap, type); 3077 arg = va_arg(ap, void *); 3078 va_end(ap); 3079 3080 trace_kvm_device_ioctl(fd, type, arg); 3081 ret = ioctl(fd, type, arg); 3082 if (ret == -1) { 3083 ret = -errno; 3084 } 3085 return ret; 3086 } 3087 3088 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 3089 { 3090 int ret; 3091 struct kvm_device_attr attribute = { 3092 .group = group, 3093 .attr = attr, 3094 }; 3095 3096 if (!kvm_vm_attributes_allowed) { 3097 return 0; 3098 } 3099 3100 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 3101 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 3102 return ret ? 0 : 1; 3103 } 3104 3105 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 3106 { 3107 struct kvm_device_attr attribute = { 3108 .group = group, 3109 .attr = attr, 3110 .flags = 0, 3111 }; 3112 3113 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 
0 : 1; 3114 } 3115 3116 int kvm_device_access(int fd, int group, uint64_t attr, 3117 void *val, bool write, Error **errp) 3118 { 3119 struct kvm_device_attr kvmattr; 3120 int err; 3121 3122 kvmattr.flags = 0; 3123 kvmattr.group = group; 3124 kvmattr.attr = attr; 3125 kvmattr.addr = (uintptr_t)val; 3126 3127 err = kvm_device_ioctl(fd, 3128 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, 3129 &kvmattr); 3130 if (err < 0) { 3131 error_setg_errno(errp, -err, 3132 "KVM_%s_DEVICE_ATTR failed: Group %d " 3133 "attr 0x%016" PRIx64, 3134 write ? "SET" : "GET", group, attr); 3135 } 3136 return err; 3137 } 3138 3139 bool kvm_has_sync_mmu(void) 3140 { 3141 return kvm_state->sync_mmu; 3142 } 3143 3144 int kvm_has_vcpu_events(void) 3145 { 3146 return kvm_state->vcpu_events; 3147 } 3148 3149 int kvm_has_robust_singlestep(void) 3150 { 3151 return kvm_state->robust_singlestep; 3152 } 3153 3154 int kvm_has_debugregs(void) 3155 { 3156 return kvm_state->debugregs; 3157 } 3158 3159 int kvm_max_nested_state_length(void) 3160 { 3161 return kvm_state->max_nested_state_len; 3162 } 3163 3164 int kvm_has_many_ioeventfds(void) 3165 { 3166 if (!kvm_enabled()) { 3167 return 0; 3168 } 3169 return kvm_state->many_ioeventfds; 3170 } 3171 3172 int kvm_has_gsi_routing(void) 3173 { 3174 #ifdef KVM_CAP_IRQ_ROUTING 3175 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING); 3176 #else 3177 return false; 3178 #endif 3179 } 3180 3181 int kvm_has_intx_set_mask(void) 3182 { 3183 return kvm_state->intx_set_mask; 3184 } 3185 3186 bool kvm_arm_supports_user_irq(void) 3187 { 3188 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); 3189 } 3190 3191 #ifdef KVM_CAP_SET_GUEST_DEBUG 3192 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, 3193 target_ulong pc) 3194 { 3195 struct kvm_sw_breakpoint *bp; 3196 3197 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) { 3198 if (bp->pc == pc) { 3199 return bp; 3200 } 3201 } 3202 return NULL; 3203 } 3204 3205 int kvm_sw_breakpoints_active(CPUState *cpu) 3206 { 3207 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints); 3208 } 3209 3210 struct kvm_set_guest_debug_data { 3211 struct kvm_guest_debug dbg; 3212 int err; 3213 }; 3214 3215 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data) 3216 { 3217 struct kvm_set_guest_debug_data *dbg_data = 3218 (struct kvm_set_guest_debug_data *) data.host_ptr; 3219 3220 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, 3221 &dbg_data->dbg); 3222 } 3223 3224 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 3225 { 3226 struct kvm_set_guest_debug_data data; 3227 3228 data.dbg.control = reinject_trap; 3229 3230 if (cpu->singlestep_enabled) { 3231 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; 3232 3233 if (cpu->singlestep_enabled & SSTEP_NOIRQ) { 3234 data.dbg.control |= KVM_GUESTDBG_BLOCKIRQ; 3235 } 3236 } 3237 kvm_arch_update_guest_debug(cpu, &data.dbg); 3238 3239 run_on_cpu(cpu, kvm_invoke_set_guest_debug, 3240 RUN_ON_CPU_HOST_PTR(&data)); 3241 return data.err; 3242 } 3243 3244 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 3245 target_ulong len, int type) 3246 { 3247 struct kvm_sw_breakpoint *bp; 3248 int err; 3249 3250 if (type == GDB_BREAKPOINT_SW) { 3251 bp = kvm_find_sw_breakpoint(cpu, addr); 3252 if (bp) { 3253 bp->use_count++; 3254 return 0; 3255 } 3256 3257 bp = g_new(struct kvm_sw_breakpoint, 1); 3258 bp->pc = addr; 3259 bp->use_count = 1; 3260 err = kvm_arch_insert_sw_breakpoint(cpu, bp); 3261 if (err) { 3262 g_free(bp); 3263 
return err; 3264 } 3265 3266 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 3267 } else { 3268 err = kvm_arch_insert_hw_breakpoint(addr, len, type); 3269 if (err) { 3270 return err; 3271 } 3272 } 3273 3274 CPU_FOREACH(cpu) { 3275 err = kvm_update_guest_debug(cpu, 0); 3276 if (err) { 3277 return err; 3278 } 3279 } 3280 return 0; 3281 } 3282 3283 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, 3284 target_ulong len, int type) 3285 { 3286 struct kvm_sw_breakpoint *bp; 3287 int err; 3288 3289 if (type == GDB_BREAKPOINT_SW) { 3290 bp = kvm_find_sw_breakpoint(cpu, addr); 3291 if (!bp) { 3292 return -ENOENT; 3293 } 3294 3295 if (bp->use_count > 1) { 3296 bp->use_count--; 3297 return 0; 3298 } 3299 3300 err = kvm_arch_remove_sw_breakpoint(cpu, bp); 3301 if (err) { 3302 return err; 3303 } 3304 3305 QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 3306 g_free(bp); 3307 } else { 3308 err = kvm_arch_remove_hw_breakpoint(addr, len, type); 3309 if (err) { 3310 return err; 3311 } 3312 } 3313 3314 CPU_FOREACH(cpu) { 3315 err = kvm_update_guest_debug(cpu, 0); 3316 if (err) { 3317 return err; 3318 } 3319 } 3320 return 0; 3321 } 3322 3323 void kvm_remove_all_breakpoints(CPUState *cpu) 3324 { 3325 struct kvm_sw_breakpoint *bp, *next; 3326 KVMState *s = cpu->kvm_state; 3327 CPUState *tmpcpu; 3328 3329 QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) { 3330 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) { 3331 /* Try harder to find a CPU that currently sees the breakpoint. */ 3332 CPU_FOREACH(tmpcpu) { 3333 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) { 3334 break; 3335 } 3336 } 3337 } 3338 QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry); 3339 g_free(bp); 3340 } 3341 kvm_arch_remove_all_hw_breakpoints(); 3342 3343 CPU_FOREACH(cpu) { 3344 kvm_update_guest_debug(cpu, 0); 3345 } 3346 } 3347 3348 #else /* !KVM_CAP_SET_GUEST_DEBUG */ 3349 3350 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 3351 { 3352 return -EINVAL; 3353 } 3354 3355 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 3356 target_ulong len, int type) 3357 { 3358 return -EINVAL; 3359 } 3360 3361 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, 3362 target_ulong len, int type) 3363 { 3364 return -EINVAL; 3365 } 3366 3367 void kvm_remove_all_breakpoints(CPUState *cpu) 3368 { 3369 } 3370 #endif /* !KVM_CAP_SET_GUEST_DEBUG */ 3371 3372 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) 3373 { 3374 KVMState *s = kvm_state; 3375 struct kvm_signal_mask *sigmask; 3376 int r; 3377 3378 sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); 3379 3380 sigmask->len = s->sigmask_len; 3381 memcpy(sigmask->sigset, sigset, sizeof(*sigset)); 3382 r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask); 3383 g_free(sigmask); 3384 3385 return r; 3386 } 3387 3388 static void kvm_ipi_signal(int sig) 3389 { 3390 if (current_cpu) { 3391 assert(kvm_immediate_exit); 3392 kvm_cpu_kick(current_cpu); 3393 } 3394 } 3395 3396 void kvm_init_cpu_signals(CPUState *cpu) 3397 { 3398 int r; 3399 sigset_t set; 3400 struct sigaction sigact; 3401 3402 memset(&sigact, 0, sizeof(sigact)); 3403 sigact.sa_handler = kvm_ipi_signal; 3404 sigaction(SIG_IPI, &sigact, NULL); 3405 3406 pthread_sigmask(SIG_BLOCK, NULL, &set); 3407 #if defined KVM_HAVE_MCE_INJECTION 3408 sigdelset(&set, SIGBUS); 3409 pthread_sigmask(SIG_SETMASK, &set, NULL); 3410 #endif 3411 sigdelset(&set, SIG_IPI); 3412 if (kvm_immediate_exit) { 3413 r = pthread_sigmask(SIG_SETMASK, &set, NULL); 3414 } 
else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

/* Called asynchronously in VCPU thread. */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    qatomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
                                 hwaddr start_addr, hwaddr size)
{
    KVMState *kvm = KVM_STATE(ms->accelerator);
    int i;

    for (i = 0; i < kvm->nr_as; ++i) {
        if (kvm->as[i].as == as && kvm->as[i].ml) {
            size = MIN(kvm_max_slot_size, size);
            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
                                                    start_addr, size);
        }
    }

    return false;
}

static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    int64_t value = s->kvm_shadow_mem;

    visit_type_int(v, name, &value, errp);
}

static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    int64_t value;

    if (s->fd != -1) {
        error_setg(errp,
"Cannot set properties after the accelerator has been initialized"); 3556 return; 3557 } 3558 3559 if (!visit_type_int(v, name, &value, errp)) { 3560 return; 3561 } 3562 3563 s->kvm_shadow_mem = value; 3564 } 3565 3566 static void kvm_set_kernel_irqchip(Object *obj, Visitor *v, 3567 const char *name, void *opaque, 3568 Error **errp) 3569 { 3570 KVMState *s = KVM_STATE(obj); 3571 OnOffSplit mode; 3572 3573 if (s->fd != -1) { 3574 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 3575 return; 3576 } 3577 3578 if (!visit_type_OnOffSplit(v, name, &mode, errp)) { 3579 return; 3580 } 3581 switch (mode) { 3582 case ON_OFF_SPLIT_ON: 3583 s->kernel_irqchip_allowed = true; 3584 s->kernel_irqchip_required = true; 3585 s->kernel_irqchip_split = ON_OFF_AUTO_OFF; 3586 break; 3587 case ON_OFF_SPLIT_OFF: 3588 s->kernel_irqchip_allowed = false; 3589 s->kernel_irqchip_required = false; 3590 s->kernel_irqchip_split = ON_OFF_AUTO_OFF; 3591 break; 3592 case ON_OFF_SPLIT_SPLIT: 3593 s->kernel_irqchip_allowed = true; 3594 s->kernel_irqchip_required = true; 3595 s->kernel_irqchip_split = ON_OFF_AUTO_ON; 3596 break; 3597 default: 3598 /* The value was checked in visit_type_OnOffSplit() above. If 3599 * we get here, then something is wrong in QEMU. 3600 */ 3601 abort(); 3602 } 3603 } 3604 3605 bool kvm_kernel_irqchip_allowed(void) 3606 { 3607 return kvm_state->kernel_irqchip_allowed; 3608 } 3609 3610 bool kvm_kernel_irqchip_required(void) 3611 { 3612 return kvm_state->kernel_irqchip_required; 3613 } 3614 3615 bool kvm_kernel_irqchip_split(void) 3616 { 3617 return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON; 3618 } 3619 3620 static void kvm_get_dirty_ring_size(Object *obj, Visitor *v, 3621 const char *name, void *opaque, 3622 Error **errp) 3623 { 3624 KVMState *s = KVM_STATE(obj); 3625 uint32_t value = s->kvm_dirty_ring_size; 3626 3627 visit_type_uint32(v, name, &value, errp); 3628 } 3629 3630 static void kvm_set_dirty_ring_size(Object *obj, Visitor *v, 3631 const char *name, void *opaque, 3632 Error **errp) 3633 { 3634 KVMState *s = KVM_STATE(obj); 3635 Error *error = NULL; 3636 uint32_t value; 3637 3638 if (s->fd != -1) { 3639 error_setg(errp, "Cannot set properties after the accelerator has been initialized"); 3640 return; 3641 } 3642 3643 visit_type_uint32(v, name, &value, &error); 3644 if (error) { 3645 error_propagate(errp, error); 3646 return; 3647 } 3648 if (value & (value - 1)) { 3649 error_setg(errp, "dirty-ring-size must be a power of two."); 3650 return; 3651 } 3652 3653 s->kvm_dirty_ring_size = value; 3654 } 3655 3656 static void kvm_accel_instance_init(Object *obj) 3657 { 3658 KVMState *s = KVM_STATE(obj); 3659 3660 s->fd = -1; 3661 s->vmfd = -1; 3662 s->kvm_shadow_mem = -1; 3663 s->kernel_irqchip_allowed = true; 3664 s->kernel_irqchip_split = ON_OFF_AUTO_AUTO; 3665 /* KVM dirty ring is by default off */ 3666 s->kvm_dirty_ring_size = 0; 3667 } 3668 3669 static void kvm_accel_class_init(ObjectClass *oc, void *data) 3670 { 3671 AccelClass *ac = ACCEL_CLASS(oc); 3672 ac->name = "KVM"; 3673 ac->init_machine = kvm_init; 3674 ac->has_memory = kvm_accel_has_memory; 3675 ac->allowed = &kvm_allowed; 3676 3677 object_class_property_add(oc, "kernel-irqchip", "on|off|split", 3678 NULL, kvm_set_kernel_irqchip, 3679 NULL, NULL); 3680 object_class_property_set_description(oc, "kernel-irqchip", 3681 "Configure KVM in-kernel irqchip"); 3682 3683 object_class_property_add(oc, "kvm-shadow-mem", "int", 3684 kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem, 3685 NULL, NULL); 3686 
object_class_property_set_description(oc, "kvm-shadow-mem", 3687 "KVM shadow MMU size"); 3688 3689 object_class_property_add(oc, "dirty-ring-size", "uint32", 3690 kvm_get_dirty_ring_size, kvm_set_dirty_ring_size, 3691 NULL, NULL); 3692 object_class_property_set_description(oc, "dirty-ring-size", 3693 "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)"); 3694 } 3695 3696 static const TypeInfo kvm_accel_type = { 3697 .name = TYPE_KVM_ACCEL, 3698 .parent = TYPE_ACCEL, 3699 .instance_init = kvm_accel_instance_init, 3700 .class_init = kvm_accel_class_init, 3701 .instance_size = sizeof(KVMState), 3702 }; 3703 3704 static void kvm_type_init(void) 3705 { 3706 type_register_static(&kvm_accel_type); 3707 } 3708 3709 type_init(kvm_type_init); 3710 3711 typedef struct StatsArgs { 3712 union StatsResultsType { 3713 StatsResultList **stats; 3714 StatsSchemaList **schema; 3715 } result; 3716 strList *names; 3717 Error **errp; 3718 } StatsArgs; 3719 3720 static StatsList *add_kvmstat_entry(struct kvm_stats_desc *pdesc, 3721 uint64_t *stats_data, 3722 StatsList *stats_list, 3723 Error **errp) 3724 { 3725 3726 Stats *stats; 3727 uint64List *val_list = NULL; 3728 3729 /* Only add stats that we understand. */ 3730 switch (pdesc->flags & KVM_STATS_TYPE_MASK) { 3731 case KVM_STATS_TYPE_CUMULATIVE: 3732 case KVM_STATS_TYPE_INSTANT: 3733 case KVM_STATS_TYPE_PEAK: 3734 case KVM_STATS_TYPE_LINEAR_HIST: 3735 case KVM_STATS_TYPE_LOG_HIST: 3736 break; 3737 default: 3738 return stats_list; 3739 } 3740 3741 switch (pdesc->flags & KVM_STATS_UNIT_MASK) { 3742 case KVM_STATS_UNIT_NONE: 3743 case KVM_STATS_UNIT_BYTES: 3744 case KVM_STATS_UNIT_CYCLES: 3745 case KVM_STATS_UNIT_SECONDS: 3746 break; 3747 default: 3748 return stats_list; 3749 } 3750 3751 switch (pdesc->flags & KVM_STATS_BASE_MASK) { 3752 case KVM_STATS_BASE_POW10: 3753 case KVM_STATS_BASE_POW2: 3754 break; 3755 default: 3756 return stats_list; 3757 } 3758 3759 /* Alloc and populate data list */ 3760 stats = g_new0(Stats, 1); 3761 stats->name = g_strdup(pdesc->name); 3762 stats->value = g_new0(StatsValue, 1);; 3763 3764 if (pdesc->size == 1) { 3765 stats->value->u.scalar = *stats_data; 3766 stats->value->type = QTYPE_QNUM; 3767 } else { 3768 int i; 3769 for (i = 0; i < pdesc->size; i++) { 3770 QAPI_LIST_PREPEND(val_list, stats_data[i]); 3771 } 3772 stats->value->u.list = val_list; 3773 stats->value->type = QTYPE_QLIST; 3774 } 3775 3776 QAPI_LIST_PREPEND(stats_list, stats); 3777 return stats_list; 3778 } 3779 3780 static StatsSchemaValueList *add_kvmschema_entry(struct kvm_stats_desc *pdesc, 3781 StatsSchemaValueList *list, 3782 Error **errp) 3783 { 3784 StatsSchemaValueList *schema_entry = g_new0(StatsSchemaValueList, 1); 3785 schema_entry->value = g_new0(StatsSchemaValue, 1); 3786 3787 switch (pdesc->flags & KVM_STATS_TYPE_MASK) { 3788 case KVM_STATS_TYPE_CUMULATIVE: 3789 schema_entry->value->type = STATS_TYPE_CUMULATIVE; 3790 break; 3791 case KVM_STATS_TYPE_INSTANT: 3792 schema_entry->value->type = STATS_TYPE_INSTANT; 3793 break; 3794 case KVM_STATS_TYPE_PEAK: 3795 schema_entry->value->type = STATS_TYPE_PEAK; 3796 break; 3797 case KVM_STATS_TYPE_LINEAR_HIST: 3798 schema_entry->value->type = STATS_TYPE_LINEAR_HISTOGRAM; 3799 schema_entry->value->bucket_size = pdesc->bucket_size; 3800 schema_entry->value->has_bucket_size = true; 3801 break; 3802 case KVM_STATS_TYPE_LOG_HIST: 3803 schema_entry->value->type = STATS_TYPE_LOG2_HISTOGRAM; 3804 break; 3805 default: 3806 goto exit; 3807 } 3808 3809 switch (pdesc->flags & KVM_STATS_UNIT_MASK) { 3810 case 
KVM_STATS_UNIT_NONE: 3811 break; 3812 case KVM_STATS_UNIT_BYTES: 3813 schema_entry->value->has_unit = true; 3814 schema_entry->value->unit = STATS_UNIT_BYTES; 3815 break; 3816 case KVM_STATS_UNIT_CYCLES: 3817 schema_entry->value->has_unit = true; 3818 schema_entry->value->unit = STATS_UNIT_CYCLES; 3819 break; 3820 case KVM_STATS_UNIT_SECONDS: 3821 schema_entry->value->has_unit = true; 3822 schema_entry->value->unit = STATS_UNIT_SECONDS; 3823 break; 3824 default: 3825 goto exit; 3826 } 3827 3828 schema_entry->value->exponent = pdesc->exponent; 3829 if (pdesc->exponent) { 3830 switch (pdesc->flags & KVM_STATS_BASE_MASK) { 3831 case KVM_STATS_BASE_POW10: 3832 schema_entry->value->has_base = true; 3833 schema_entry->value->base = 10; 3834 break; 3835 case KVM_STATS_BASE_POW2: 3836 schema_entry->value->has_base = true; 3837 schema_entry->value->base = 2; 3838 break; 3839 default: 3840 goto exit; 3841 } 3842 } 3843 3844 schema_entry->value->name = g_strdup(pdesc->name); 3845 schema_entry->next = list; 3846 return schema_entry; 3847 exit: 3848 g_free(schema_entry->value); 3849 g_free(schema_entry); 3850 return list; 3851 } 3852 3853 /* Cached stats descriptors */ 3854 typedef struct StatsDescriptors { 3855 const char *ident; /* cache key, currently the StatsTarget */ 3856 struct kvm_stats_desc *kvm_stats_desc; 3857 struct kvm_stats_header *kvm_stats_header; 3858 QTAILQ_ENTRY(StatsDescriptors) next; 3859 } StatsDescriptors; 3860 3861 static QTAILQ_HEAD(, StatsDescriptors) stats_descriptors = 3862 QTAILQ_HEAD_INITIALIZER(stats_descriptors); 3863 3864 /* 3865 * Return the descriptors for 'target', that either have already been read 3866 * or are retrieved from 'stats_fd'. 3867 */ 3868 static StatsDescriptors *find_stats_descriptors(StatsTarget target, int stats_fd, 3869 Error **errp) 3870 { 3871 StatsDescriptors *descriptors; 3872 const char *ident; 3873 struct kvm_stats_desc *kvm_stats_desc; 3874 struct kvm_stats_header *kvm_stats_header; 3875 size_t size_desc; 3876 ssize_t ret; 3877 3878 ident = StatsTarget_str(target); 3879 QTAILQ_FOREACH(descriptors, &stats_descriptors, next) { 3880 if (g_str_equal(descriptors->ident, ident)) { 3881 return descriptors; 3882 } 3883 } 3884 3885 descriptors = g_new0(StatsDescriptors, 1); 3886 3887 /* Read stats header */ 3888 kvm_stats_header = g_malloc(sizeof(*kvm_stats_header)); 3889 ret = read(stats_fd, kvm_stats_header, sizeof(*kvm_stats_header)); 3890 if (ret != sizeof(*kvm_stats_header)) { 3891 error_setg(errp, "KVM stats: failed to read stats header: " 3892 "expected %zu actual %zu", 3893 sizeof(*kvm_stats_header), ret); 3894 return NULL; 3895 } 3896 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 3897 3898 /* Read stats descriptors */ 3899 kvm_stats_desc = g_malloc0_n(kvm_stats_header->num_desc, size_desc); 3900 ret = pread(stats_fd, kvm_stats_desc, 3901 size_desc * kvm_stats_header->num_desc, 3902 kvm_stats_header->desc_offset); 3903 3904 if (ret != size_desc * kvm_stats_header->num_desc) { 3905 error_setg(errp, "KVM stats: failed to read stats descriptors: " 3906 "expected %zu actual %zu", 3907 size_desc * kvm_stats_header->num_desc, ret); 3908 g_free(descriptors); 3909 g_free(kvm_stats_desc); 3910 return NULL; 3911 } 3912 descriptors->kvm_stats_header = kvm_stats_header; 3913 descriptors->kvm_stats_desc = kvm_stats_desc; 3914 descriptors->ident = ident; 3915 QTAILQ_INSERT_TAIL(&stats_descriptors, descriptors, next); 3916 return descriptors; 3917 } 3918 3919 static void query_stats(StatsResultList **result, StatsTarget target, 3920 
strList *names, int stats_fd, Error **errp) 3921 { 3922 struct kvm_stats_desc *kvm_stats_desc; 3923 struct kvm_stats_header *kvm_stats_header; 3924 StatsDescriptors *descriptors; 3925 g_autofree uint64_t *stats_data = NULL; 3926 struct kvm_stats_desc *pdesc; 3927 StatsList *stats_list = NULL; 3928 size_t size_desc, size_data = 0; 3929 ssize_t ret; 3930 int i; 3931 3932 descriptors = find_stats_descriptors(target, stats_fd, errp); 3933 if (!descriptors) { 3934 return; 3935 } 3936 3937 kvm_stats_header = descriptors->kvm_stats_header; 3938 kvm_stats_desc = descriptors->kvm_stats_desc; 3939 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 3940 3941 /* Tally the total data size; read schema data */ 3942 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 3943 pdesc = (void *)kvm_stats_desc + i * size_desc; 3944 size_data += pdesc->size * sizeof(*stats_data); 3945 } 3946 3947 stats_data = g_malloc0(size_data); 3948 ret = pread(stats_fd, stats_data, size_data, kvm_stats_header->data_offset); 3949 3950 if (ret != size_data) { 3951 error_setg(errp, "KVM stats: failed to read data: " 3952 "expected %zu actual %zu", size_data, ret); 3953 return; 3954 } 3955 3956 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 3957 uint64_t *stats; 3958 pdesc = (void *)kvm_stats_desc + i * size_desc; 3959 3960 /* Add entry to the list */ 3961 stats = (void *)stats_data + pdesc->offset; 3962 if (!apply_str_list_filter(pdesc->name, names)) { 3963 continue; 3964 } 3965 stats_list = add_kvmstat_entry(pdesc, stats, stats_list, errp); 3966 } 3967 3968 if (!stats_list) { 3969 return; 3970 } 3971 3972 switch (target) { 3973 case STATS_TARGET_VM: 3974 add_stats_entry(result, STATS_PROVIDER_KVM, NULL, stats_list); 3975 break; 3976 case STATS_TARGET_VCPU: 3977 add_stats_entry(result, STATS_PROVIDER_KVM, 3978 current_cpu->parent_obj.canonical_path, 3979 stats_list); 3980 break; 3981 default: 3982 break; 3983 } 3984 } 3985 3986 static void query_stats_schema(StatsSchemaList **result, StatsTarget target, 3987 int stats_fd, Error **errp) 3988 { 3989 struct kvm_stats_desc *kvm_stats_desc; 3990 struct kvm_stats_header *kvm_stats_header; 3991 StatsDescriptors *descriptors; 3992 struct kvm_stats_desc *pdesc; 3993 StatsSchemaValueList *stats_list = NULL; 3994 size_t size_desc; 3995 int i; 3996 3997 descriptors = find_stats_descriptors(target, stats_fd, errp); 3998 if (!descriptors) { 3999 return; 4000 } 4001 4002 kvm_stats_header = descriptors->kvm_stats_header; 4003 kvm_stats_desc = descriptors->kvm_stats_desc; 4004 size_desc = sizeof(*kvm_stats_desc) + kvm_stats_header->name_size; 4005 4006 /* Tally the total data size; read schema data */ 4007 for (i = 0; i < kvm_stats_header->num_desc; ++i) { 4008 pdesc = (void *)kvm_stats_desc + i * size_desc; 4009 stats_list = add_kvmschema_entry(pdesc, stats_list, errp); 4010 } 4011 4012 add_stats_schema(result, STATS_PROVIDER_KVM, target, stats_list); 4013 } 4014 4015 static void query_stats_vcpu(CPUState *cpu, run_on_cpu_data data) 4016 { 4017 StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr; 4018 int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL); 4019 Error *local_err = NULL; 4020 4021 if (stats_fd == -1) { 4022 error_setg_errno(&local_err, errno, "KVM stats: ioctl failed"); 4023 error_propagate(kvm_stats_args->errp, local_err); 4024 return; 4025 } 4026 query_stats(kvm_stats_args->result.stats, STATS_TARGET_VCPU, 4027 kvm_stats_args->names, stats_fd, kvm_stats_args->errp); 4028 close(stats_fd); 4029 } 4030 4031 static void query_stats_schema_vcpu(CPUState 
*cpu, run_on_cpu_data data)
{
    StatsArgs *kvm_stats_args = (StatsArgs *) data.host_ptr;
    int stats_fd = kvm_vcpu_ioctl(cpu, KVM_GET_STATS_FD, NULL);
    Error *local_err = NULL;

    if (stats_fd == -1) {
        error_setg_errno(&local_err, errno, "KVM stats: ioctl failed");
        error_propagate(kvm_stats_args->errp, local_err);
        return;
    }
    query_stats_schema(kvm_stats_args->result.schema, STATS_TARGET_VCPU, stats_fd,
                       kvm_stats_args->errp);
    close(stats_fd);
}

static void query_stats_cb(StatsResultList **result, StatsTarget target,
                           strList *names, strList *targets, Error **errp)
{
    KVMState *s = kvm_state;
    CPUState *cpu;
    int stats_fd;

    switch (target) {
    case STATS_TARGET_VM:
    {
        stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
        if (stats_fd == -1) {
            error_setg_errno(errp, errno, "KVM stats: ioctl failed");
            return;
        }
        query_stats(result, target, names, stats_fd, errp);
        close(stats_fd);
        break;
    }
    case STATS_TARGET_VCPU:
    {
        StatsArgs stats_args;
        stats_args.result.stats = result;
        stats_args.names = names;
        stats_args.errp = errp;
        CPU_FOREACH(cpu) {
            if (!apply_str_list_filter(cpu->parent_obj.canonical_path, targets)) {
                continue;
            }
            run_on_cpu(cpu, query_stats_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args));
        }
        break;
    }
    default:
        break;
    }
}

static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
{
    StatsArgs stats_args;
    KVMState *s = kvm_state;
    int stats_fd;

    stats_fd = kvm_vm_ioctl(s, KVM_GET_STATS_FD, NULL);
    if (stats_fd == -1) {
        error_setg_errno(errp, errno, "KVM stats: ioctl failed");
        return;
    }
    query_stats_schema(result, STATS_TARGET_VM, stats_fd, errp);
    close(stats_fd);

    stats_args.result.schema = result;
    stats_args.errp = errp;
    run_on_cpu(first_cpu, query_stats_schema_vcpu, RUN_ON_CPU_HOST_PTR(&stats_args));
}
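
/*
 * The stats code above consumes the binary layout exposed through
 * KVM_GET_STATS_FD: a struct kvm_stats_header, followed by num_desc
 * variable-sized struct kvm_stats_desc records (each occupying
 * sizeof(struct kvm_stats_desc) + header.name_size bytes), followed by a
 * data area that every descriptor indexes with its offset/size fields.
 * A minimal, self-contained reader sketch, mirroring what
 * find_stats_descriptors() and query_stats() do above; the helper name
 * dump_kvm_stats_fd() is hypothetical and not part of QEMU or the KVM API:
 *
 *     #include <stdio.h>
 *     #include <stdlib.h>
 *     #include <stdint.h>
 *     #include <unistd.h>
 *     #include <linux/kvm.h>
 *
 *     static void dump_kvm_stats_fd(int stats_fd)
 *     {
 *         struct kvm_stats_header hdr;
 *         size_t desc_sz, i, j;
 *         char *descs;
 *
 *         // The header sits at offset 0 and describes the rest of the layout.
 *         if (read(stats_fd, &hdr, sizeof(hdr)) != sizeof(hdr)) {
 *             return;
 *         }
 *         desc_sz = sizeof(struct kvm_stats_desc) + hdr.name_size;
 *         descs = calloc(hdr.num_desc, desc_sz);
 *         if (pread(stats_fd, descs, desc_sz * hdr.num_desc,
 *                   hdr.desc_offset) != (ssize_t)(desc_sz * hdr.num_desc)) {
 *             free(descs);
 *             return;
 *         }
 *         for (i = 0; i < hdr.num_desc; i++) {
 *             struct kvm_stats_desc *d = (void *)(descs + i * desc_sz);
 *             uint64_t val[d->size];
 *
 *             // Each stat is d->size u64 values at data_offset + d->offset.
 *             if (pread(stats_fd, val, sizeof(val),
 *                       hdr.data_offset + d->offset) != (ssize_t)sizeof(val)) {
 *                 continue;
 *             }
 *             for (j = 0; j < d->size; j++) {
 *                 printf("%s[%zu] = %llu\n", d->name, j,
 *                        (unsigned long long)val[j]);
 *             }
 *         }
 *         free(descs);
 *     }
 */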