/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include <linux/kvm.h>

#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
#include "sysemu/runstate.h"
#include "sysemu/cpus.h"
#include "sysemu/sysemu.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "exec/address-spaces.h"
#include "qemu/event_notifier.h"
#include "qemu/main-loop.h"
#include "trace.h"
#include "hw/irq.h"
#include "qapi/visitor.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "sysemu/reset.h"
#include "qemu/guest-random.h"
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"

#include "hw/boards.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#ifdef PAGE_SIZE
#undef PAGE_SIZE
#endif
#define PAGE_SIZE qemu_real_host_page_size

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define KVM_MSI_HASHTAB_SIZE    256

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

struct KVMState
{
    AccelState parent_obj;

    int nr_slots;
    int fd;
    int vmfd;
    int coalesced_mmio;
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
    int max_nested_state_len;
    int many_ioeventfds;
    int intx_set_mask;
    int kvm_shadow_mem;
    bool kernel_irqchip_allowed;
    bool kernel_irqchip_required;
    OnOffAuto kernel_irqchip_split;
    bool sync_mmu;
    uint64_t manual_dirty_log_protect;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
    unsigned irq_set_ioctl;
    unsigned int sigmask_len;
    GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    unsigned long *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
    KVMMemoryListener memory_listener;
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* For "info mtree -f" to tell if an MR is registered in KVM */
    int nr_as;
    struct KVMAs {
        KVMMemoryListener *ml;
        AddressSpace *as;
    } *as;
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_eventfds_allowed;
bool kvm_irqfds_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
static bool kvm_immediate_exit;
static hwaddr kvm_max_slot_size = ~0;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_LAST_INFO
};

static NotifierList kvm_irqchip_change_notifiers =
    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);

struct KVMResampleFd {
    int gsi;
    EventNotifier *resample_event;
    QLIST_ENTRY(KVMResampleFd) node;
};
typedef struct KVMResampleFd KVMResampleFd;

/*
 * Only used with split irqchip where we need to do the resample fd
 * kick for the kernel from userspace.
 */
static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
    QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);

#define kvm_slots_lock(kml)      qemu_mutex_lock(&(kml)->slots_lock)
#define kvm_slots_unlock(kml)    qemu_mutex_unlock(&(kml)->slots_lock)

static inline void kvm_resample_fd_remove(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            QLIST_REMOVE(rfd, node);
            g_free(rfd);
            break;
        }
    }
}

static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
{
    KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);

    rfd->gsi = gsi;
    rfd->resample_event = event;

    QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
}

void kvm_resample_fd_notify(int gsi)
{
    KVMResampleFd *rfd;

    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
        if (rfd->gsi == gsi) {
            event_notifier_set(rfd->resample_event);
            trace_kvm_resample_fd_notify(gsi);
            return;
        }
    }
}

int kvm_get_max_memslots(void)
{
    KVMState *s = KVM_STATE(current_accel());

    return s->nr_slots;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    return NULL;
}

bool kvm_has_free_slot(MachineState *ms)
{
    KVMState *s = KVM_STATE(ms->accelerator);
    bool result;
    KVMMemoryListener *kml = &s->memory_listener;

    kvm_slots_lock(kml);
    result = !!kvm_get_free_slot(kml);
    kvm_slots_unlock(kml);

    return result;
}

/* Called with KVMMemoryListener.slots_lock held */
static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                         hwaddr start_addr,
                                         hwaddr size)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (start_addr == mem->start_addr && size == mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to next and truncate size to previous page boundary.
*/ 295 aligned = ROUND_UP(section->offset_within_address_space, 296 qemu_real_host_page_size); 297 delta = aligned - section->offset_within_address_space; 298 *start = aligned; 299 if (delta > size) { 300 return 0; 301 } 302 303 return (size - delta) & qemu_real_host_page_mask; 304 } 305 306 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 307 hwaddr *phys_addr) 308 { 309 KVMMemoryListener *kml = &s->memory_listener; 310 int i, ret = 0; 311 312 kvm_slots_lock(kml); 313 for (i = 0; i < s->nr_slots; i++) { 314 KVMSlot *mem = &kml->slots[i]; 315 316 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 317 *phys_addr = mem->start_addr + (ram - mem->ram); 318 ret = 1; 319 break; 320 } 321 } 322 kvm_slots_unlock(kml); 323 324 return ret; 325 } 326 327 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) 328 { 329 KVMState *s = kvm_state; 330 struct kvm_userspace_memory_region mem; 331 int ret; 332 333 mem.slot = slot->slot | (kml->as_id << 16); 334 mem.guest_phys_addr = slot->start_addr; 335 mem.userspace_addr = (unsigned long)slot->ram; 336 mem.flags = slot->flags; 337 338 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { 339 /* Set the slot size to 0 before setting the slot to the desired 340 * value. This is needed based on KVM commit 75d61fbc. */ 341 mem.memory_size = 0; 342 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 343 if (ret < 0) { 344 goto err; 345 } 346 } 347 mem.memory_size = slot->memory_size; 348 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 349 slot->old_flags = mem.flags; 350 err: 351 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 352 mem.memory_size, mem.userspace_addr, ret); 353 if (ret < 0) { 354 error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d," 355 " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s", 356 __func__, mem.slot, slot->start_addr, 357 (uint64_t)mem.memory_size, strerror(errno)); 358 } 359 return ret; 360 } 361 362 static int do_kvm_destroy_vcpu(CPUState *cpu) 363 { 364 KVMState *s = kvm_state; 365 long mmap_size; 366 struct KVMParkedVcpu *vcpu = NULL; 367 int ret = 0; 368 369 DPRINTF("kvm_destroy_vcpu\n"); 370 371 ret = kvm_arch_destroy_vcpu(cpu); 372 if (ret < 0) { 373 goto err; 374 } 375 376 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 377 if (mmap_size < 0) { 378 ret = mmap_size; 379 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 380 goto err; 381 } 382 383 ret = munmap(cpu->kvm_run, mmap_size); 384 if (ret < 0) { 385 goto err; 386 } 387 388 vcpu = g_malloc0(sizeof(*vcpu)); 389 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 390 vcpu->kvm_fd = cpu->kvm_fd; 391 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 392 err: 393 return ret; 394 } 395 396 void kvm_destroy_vcpu(CPUState *cpu) 397 { 398 if (do_kvm_destroy_vcpu(cpu) < 0) { 399 error_report("kvm_destroy_vcpu failed"); 400 exit(EXIT_FAILURE); 401 } 402 } 403 404 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 405 { 406 struct KVMParkedVcpu *cpu; 407 408 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 409 if (cpu->vcpu_id == vcpu_id) { 410 int kvm_fd; 411 412 QLIST_REMOVE(cpu, node); 413 kvm_fd = cpu->kvm_fd; 414 g_free(cpu); 415 return kvm_fd; 416 } 417 } 418 419 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 420 } 421 422 int kvm_init_vcpu(CPUState *cpu, Error **errp) 423 { 424 KVMState *s = kvm_state; 425 long mmap_size; 426 int ret; 427 428 trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu)); 429 430 ret = kvm_get_vcpu(s, 
kvm_arch_vcpu_id(cpu)); 431 if (ret < 0) { 432 error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)", 433 kvm_arch_vcpu_id(cpu)); 434 goto err; 435 } 436 437 cpu->kvm_fd = ret; 438 cpu->kvm_state = s; 439 cpu->vcpu_dirty = true; 440 441 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 442 if (mmap_size < 0) { 443 ret = mmap_size; 444 error_setg_errno(errp, -mmap_size, 445 "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed"); 446 goto err; 447 } 448 449 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 450 cpu->kvm_fd, 0); 451 if (cpu->kvm_run == MAP_FAILED) { 452 ret = -errno; 453 error_setg_errno(errp, ret, 454 "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)", 455 kvm_arch_vcpu_id(cpu)); 456 goto err; 457 } 458 459 if (s->coalesced_mmio && !s->coalesced_mmio_ring) { 460 s->coalesced_mmio_ring = 461 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; 462 } 463 464 ret = kvm_arch_init_vcpu(cpu); 465 if (ret < 0) { 466 error_setg_errno(errp, -ret, 467 "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)", 468 kvm_arch_vcpu_id(cpu)); 469 } 470 err: 471 return ret; 472 } 473 474 /* 475 * dirty pages logging control 476 */ 477 478 static int kvm_mem_flags(MemoryRegion *mr) 479 { 480 bool readonly = mr->readonly || memory_region_is_romd(mr); 481 int flags = 0; 482 483 if (memory_region_get_dirty_log_mask(mr) != 0) { 484 flags |= KVM_MEM_LOG_DIRTY_PAGES; 485 } 486 if (readonly && kvm_readonly_mem_allowed) { 487 flags |= KVM_MEM_READONLY; 488 } 489 return flags; 490 } 491 492 /* Called with KVMMemoryListener.slots_lock held */ 493 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, 494 MemoryRegion *mr) 495 { 496 mem->flags = kvm_mem_flags(mr); 497 498 /* If nothing changed effectively, no need to issue ioctl */ 499 if (mem->flags == mem->old_flags) { 500 return 0; 501 } 502 503 return kvm_set_user_memory_region(kml, mem, false); 504 } 505 506 static int kvm_section_update_flags(KVMMemoryListener *kml, 507 MemoryRegionSection *section) 508 { 509 hwaddr start_addr, size, slot_size; 510 KVMSlot *mem; 511 int ret = 0; 512 513 size = kvm_align_section(section, &start_addr); 514 if (!size) { 515 return 0; 516 } 517 518 kvm_slots_lock(kml); 519 520 while (size && !ret) { 521 slot_size = MIN(kvm_max_slot_size, size); 522 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 523 if (!mem) { 524 /* We don't have a slot if we want to trap every access. 
*/ 525 goto out; 526 } 527 528 ret = kvm_slot_update_flags(kml, mem, section->mr); 529 start_addr += slot_size; 530 size -= slot_size; 531 } 532 533 out: 534 kvm_slots_unlock(kml); 535 return ret; 536 } 537 538 static void kvm_log_start(MemoryListener *listener, 539 MemoryRegionSection *section, 540 int old, int new) 541 { 542 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 543 int r; 544 545 if (old != 0) { 546 return; 547 } 548 549 r = kvm_section_update_flags(kml, section); 550 if (r < 0) { 551 abort(); 552 } 553 } 554 555 static void kvm_log_stop(MemoryListener *listener, 556 MemoryRegionSection *section, 557 int old, int new) 558 { 559 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 560 int r; 561 562 if (new != 0) { 563 return; 564 } 565 566 r = kvm_section_update_flags(kml, section); 567 if (r < 0) { 568 abort(); 569 } 570 } 571 572 /* get kvm's dirty pages bitmap and update qemu's */ 573 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, 574 unsigned long *bitmap) 575 { 576 ram_addr_t start = section->offset_within_region + 577 memory_region_get_ram_addr(section->mr); 578 ram_addr_t pages = int128_get64(section->size) / qemu_real_host_page_size; 579 580 cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); 581 return 0; 582 } 583 584 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) 585 586 /* Allocate the dirty bitmap for a slot */ 587 static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem) 588 { 589 /* 590 * XXX bad kernel interface alert 591 * For dirty bitmap, kernel allocates array of size aligned to 592 * bits-per-long. But for case when the kernel is 64bits and 593 * the userspace is 32bits, userspace can't align to the same 594 * bits-per-long, since sizeof(long) is different between kernel 595 * and user space. This way, userspace will provide buffer which 596 * may be 4 bytes less than the kernel will use, resulting in 597 * userspace memory corruption (which is not detectable by valgrind 598 * too, in most cases). 599 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in 600 * a hope that sizeof(long) won't become >8 any time soon. 601 * 602 * Note: the granule of kvm dirty log is qemu_real_host_page_size. 603 * And mem->memory_size is aligned to it (otherwise this mem can't 604 * be registered to KVM). 605 */ 606 hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size, 607 /*HOST_LONG_BITS*/ 64) / 8; 608 mem->dirty_bmap = g_malloc0(bitmap_size); 609 } 610 611 /** 612 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space 613 * 614 * This function will first try to fetch dirty bitmap from the kernel, 615 * and then updates qemu's dirty bitmap. 616 * 617 * NOTE: caller must be with kml->slots_lock held. 618 * 619 * @kml: the KVM memory listener object 620 * @section: the memory section to sync the dirty bitmap with 621 */ 622 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, 623 MemoryRegionSection *section) 624 { 625 KVMState *s = kvm_state; 626 struct kvm_dirty_log d = {}; 627 KVMSlot *mem; 628 hwaddr start_addr, size; 629 hwaddr slot_size, slot_offset = 0; 630 int ret = 0; 631 632 size = kvm_align_section(section, &start_addr); 633 while (size) { 634 MemoryRegionSection subsection = *section; 635 636 slot_size = MIN(kvm_max_slot_size, size); 637 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 638 if (!mem) { 639 /* We don't have a slot if we want to trap every access. 
*/ 640 goto out; 641 } 642 643 if (!mem->dirty_bmap) { 644 /* Allocate on the first log_sync, once and for all */ 645 kvm_memslot_init_dirty_bitmap(mem); 646 } 647 648 d.dirty_bitmap = mem->dirty_bmap; 649 d.slot = mem->slot | (kml->as_id << 16); 650 ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d); 651 if (ret == -ENOENT) { 652 /* kernel does not have dirty bitmap in this slot */ 653 ret = 0; 654 } else if (ret < 0) { 655 error_report("ioctl KVM_GET_DIRTY_LOG failed: %d", errno); 656 goto out; 657 } else { 658 subsection.offset_within_region += slot_offset; 659 subsection.size = int128_make64(slot_size); 660 kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap); 661 } 662 663 slot_offset += slot_size; 664 start_addr += slot_size; 665 size -= slot_size; 666 } 667 out: 668 return ret; 669 } 670 671 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */ 672 #define KVM_CLEAR_LOG_SHIFT 6 673 #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT) 674 #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN) 675 676 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start, 677 uint64_t size) 678 { 679 KVMState *s = kvm_state; 680 uint64_t end, bmap_start, start_delta, bmap_npages; 681 struct kvm_clear_dirty_log d; 682 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; 683 int ret; 684 685 /* 686 * We need to extend either the start or the size or both to 687 * satisfy the KVM interface requirement. Firstly, do the start 688 * page alignment on 64 host pages 689 */ 690 bmap_start = start & KVM_CLEAR_LOG_MASK; 691 start_delta = start - bmap_start; 692 bmap_start /= psize; 693 694 /* 695 * The kernel interface has restriction on the size too, that either: 696 * 697 * (1) the size is 64 host pages aligned (just like the start), or 698 * (2) the size fills up until the end of the KVM memslot. 699 */ 700 bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN) 701 << KVM_CLEAR_LOG_SHIFT; 702 end = mem->memory_size / psize; 703 if (bmap_npages > end - bmap_start) { 704 bmap_npages = end - bmap_start; 705 } 706 start_delta /= psize; 707 708 /* 709 * Prepare the bitmap to clear dirty bits. Here we must guarantee 710 * that we won't clear any unknown dirty bits otherwise we might 711 * accidentally clear some set bits which are not yet synced from 712 * the kernel into QEMU's bitmap, then we'll lose track of the 713 * guest modifications upon those pages (which can directly lead 714 * to guest data loss or panic after migration). 
715 * 716 * Layout of the KVMSlot.dirty_bmap: 717 * 718 * |<-------- bmap_npages -----------..>| 719 * [1] 720 * start_delta size 721 * |----------------|-------------|------------------|------------| 722 * ^ ^ ^ ^ 723 * | | | | 724 * start bmap_start (start) end 725 * of memslot of memslot 726 * 727 * [1] bmap_npages can be aligned to either 64 pages or the end of slot 728 */ 729 730 assert(bmap_start % BITS_PER_LONG == 0); 731 /* We should never do log_clear before log_sync */ 732 assert(mem->dirty_bmap); 733 if (start_delta || bmap_npages - size / psize) { 734 /* Slow path - we need to manipulate a temp bitmap */ 735 bmap_clear = bitmap_new(bmap_npages); 736 bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap, 737 bmap_start, start_delta + size / psize); 738 /* 739 * We need to fill the holes at start because that was not 740 * specified by the caller and we extended the bitmap only for 741 * 64 pages alignment 742 */ 743 bitmap_clear(bmap_clear, 0, start_delta); 744 d.dirty_bitmap = bmap_clear; 745 } else { 746 /* 747 * Fast path - both start and size align well with BITS_PER_LONG 748 * (or the end of memory slot) 749 */ 750 d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start); 751 } 752 753 d.first_page = bmap_start; 754 /* It should never overflow. If it happens, say something */ 755 assert(bmap_npages <= UINT32_MAX); 756 d.num_pages = bmap_npages; 757 d.slot = mem->slot | (as_id << 16); 758 759 ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d); 760 if (ret < 0 && ret != -ENOENT) { 761 error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, " 762 "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d", 763 __func__, d.slot, (uint64_t)d.first_page, 764 (uint32_t)d.num_pages, ret); 765 } else { 766 ret = 0; 767 trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages); 768 } 769 770 /* 771 * After we have updated the remote dirty bitmap, we update the 772 * cached bitmap as well for the memslot, then if another user 773 * clears the same region we know we shouldn't clear it again on 774 * the remote otherwise it's data loss as well. 775 */ 776 bitmap_clear(mem->dirty_bmap, bmap_start + start_delta, 777 size / psize); 778 /* This handles the NULL case well */ 779 g_free(bmap_clear); 780 return ret; 781 } 782 783 784 /** 785 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range 786 * 787 * NOTE: this will be a no-op if we haven't enabled manual dirty log 788 * protection in the host kernel because in that case this operation 789 * will be done within log_sync(). 790 * 791 * @kml: the kvm memory listener 792 * @section: the memory range to clear dirty bitmap 793 */ 794 static int kvm_physical_log_clear(KVMMemoryListener *kml, 795 MemoryRegionSection *section) 796 { 797 KVMState *s = kvm_state; 798 uint64_t start, size, offset, count; 799 KVMSlot *mem; 800 int ret = 0, i; 801 802 if (!s->manual_dirty_log_protect) { 803 /* No need to do explicit clear */ 804 return ret; 805 } 806 807 start = section->offset_within_address_space; 808 size = int128_get64(section->size); 809 810 if (!size) { 811 /* Nothing more we can do... */ 812 return ret; 813 } 814 815 kvm_slots_lock(kml); 816 817 for (i = 0; i < s->nr_slots; i++) { 818 mem = &kml->slots[i]; 819 /* Discard slots that are empty or do not overlap the section */ 820 if (!mem->memory_size || 821 mem->start_addr > start + size - 1 || 822 start > mem->start_addr + mem->memory_size - 1) { 823 continue; 824 } 825 826 if (start >= mem->start_addr) { 827 /* The slot starts before section or is aligned to it. 
*/ 828 offset = start - mem->start_addr; 829 count = MIN(mem->memory_size - offset, size); 830 } else { 831 /* The slot starts after section. */ 832 offset = 0; 833 count = MIN(mem->memory_size, size - (mem->start_addr - start)); 834 } 835 ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count); 836 if (ret < 0) { 837 break; 838 } 839 } 840 841 kvm_slots_unlock(kml); 842 843 return ret; 844 } 845 846 static void kvm_coalesce_mmio_region(MemoryListener *listener, 847 MemoryRegionSection *secion, 848 hwaddr start, hwaddr size) 849 { 850 KVMState *s = kvm_state; 851 852 if (s->coalesced_mmio) { 853 struct kvm_coalesced_mmio_zone zone; 854 855 zone.addr = start; 856 zone.size = size; 857 zone.pad = 0; 858 859 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 860 } 861 } 862 863 static void kvm_uncoalesce_mmio_region(MemoryListener *listener, 864 MemoryRegionSection *secion, 865 hwaddr start, hwaddr size) 866 { 867 KVMState *s = kvm_state; 868 869 if (s->coalesced_mmio) { 870 struct kvm_coalesced_mmio_zone zone; 871 872 zone.addr = start; 873 zone.size = size; 874 zone.pad = 0; 875 876 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 877 } 878 } 879 880 static void kvm_coalesce_pio_add(MemoryListener *listener, 881 MemoryRegionSection *section, 882 hwaddr start, hwaddr size) 883 { 884 KVMState *s = kvm_state; 885 886 if (s->coalesced_pio) { 887 struct kvm_coalesced_mmio_zone zone; 888 889 zone.addr = start; 890 zone.size = size; 891 zone.pio = 1; 892 893 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 894 } 895 } 896 897 static void kvm_coalesce_pio_del(MemoryListener *listener, 898 MemoryRegionSection *section, 899 hwaddr start, hwaddr size) 900 { 901 KVMState *s = kvm_state; 902 903 if (s->coalesced_pio) { 904 struct kvm_coalesced_mmio_zone zone; 905 906 zone.addr = start; 907 zone.size = size; 908 zone.pio = 1; 909 910 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 911 } 912 } 913 914 static MemoryListener kvm_coalesced_pio_listener = { 915 .coalesced_io_add = kvm_coalesce_pio_add, 916 .coalesced_io_del = kvm_coalesce_pio_del, 917 }; 918 919 int kvm_check_extension(KVMState *s, unsigned int extension) 920 { 921 int ret; 922 923 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); 924 if (ret < 0) { 925 ret = 0; 926 } 927 928 return ret; 929 } 930 931 int kvm_vm_check_extension(KVMState *s, unsigned int extension) 932 { 933 int ret; 934 935 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); 936 if (ret < 0) { 937 /* VM wide version not implemented, use global one instead */ 938 ret = kvm_check_extension(s, extension); 939 } 940 941 return ret; 942 } 943 944 typedef struct HWPoisonPage { 945 ram_addr_t ram_addr; 946 QLIST_ENTRY(HWPoisonPage) list; 947 } HWPoisonPage; 948 949 static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list = 950 QLIST_HEAD_INITIALIZER(hwpoison_page_list); 951 952 static void kvm_unpoison_all(void *param) 953 { 954 HWPoisonPage *page, *next_page; 955 956 QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) { 957 QLIST_REMOVE(page, list); 958 qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE); 959 g_free(page); 960 } 961 } 962 963 void kvm_hwpoison_page_add(ram_addr_t ram_addr) 964 { 965 HWPoisonPage *page; 966 967 QLIST_FOREACH(page, &hwpoison_page_list, list) { 968 if (page->ram_addr == ram_addr) { 969 return; 970 } 971 } 972 page = g_new(HWPoisonPage, 1); 973 page->ram_addr = ram_addr; 974 QLIST_INSERT_HEAD(&hwpoison_page_list, page, list); 975 } 976 977 static uint32_t adjust_ioeventfd_endianness(uint32_t val, 
uint32_t size) 978 { 979 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) 980 /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN 981 * endianness, but the memory core hands them in target endianness. 982 * For example, PPC is always treated as big-endian even if running 983 * on KVM and on PPC64LE. Correct here. 984 */ 985 switch (size) { 986 case 2: 987 val = bswap16(val); 988 break; 989 case 4: 990 val = bswap32(val); 991 break; 992 } 993 #endif 994 return val; 995 } 996 997 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 998 bool assign, uint32_t size, bool datamatch) 999 { 1000 int ret; 1001 struct kvm_ioeventfd iofd = { 1002 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 1003 .addr = addr, 1004 .len = size, 1005 .flags = 0, 1006 .fd = fd, 1007 }; 1008 1009 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size, 1010 datamatch); 1011 if (!kvm_enabled()) { 1012 return -ENOSYS; 1013 } 1014 1015 if (datamatch) { 1016 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 1017 } 1018 if (!assign) { 1019 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 1020 } 1021 1022 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 1023 1024 if (ret < 0) { 1025 return -errno; 1026 } 1027 1028 return 0; 1029 } 1030 1031 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 1032 bool assign, uint32_t size, bool datamatch) 1033 { 1034 struct kvm_ioeventfd kick = { 1035 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 1036 .addr = addr, 1037 .flags = KVM_IOEVENTFD_FLAG_PIO, 1038 .len = size, 1039 .fd = fd, 1040 }; 1041 int r; 1042 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch); 1043 if (!kvm_enabled()) { 1044 return -ENOSYS; 1045 } 1046 if (datamatch) { 1047 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 1048 } 1049 if (!assign) { 1050 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 1051 } 1052 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 1053 if (r < 0) { 1054 return r; 1055 } 1056 return 0; 1057 } 1058 1059 1060 static int kvm_check_many_ioeventfds(void) 1061 { 1062 /* Userspace can use ioeventfd for io notification. This requires a host 1063 * that supports eventfd(2) and an I/O thread; since eventfd does not 1064 * support SIGIO it cannot interrupt the vcpu. 1065 * 1066 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 1067 * can avoid creating too many ioeventfds. 
1068 */ 1069 #if defined(CONFIG_EVENTFD) 1070 int ioeventfds[7]; 1071 int i, ret = 0; 1072 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 1073 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 1074 if (ioeventfds[i] < 0) { 1075 break; 1076 } 1077 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 1078 if (ret < 0) { 1079 close(ioeventfds[i]); 1080 break; 1081 } 1082 } 1083 1084 /* Decide whether many devices are supported or not */ 1085 ret = i == ARRAY_SIZE(ioeventfds); 1086 1087 while (i-- > 0) { 1088 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 1089 close(ioeventfds[i]); 1090 } 1091 return ret; 1092 #else 1093 return 0; 1094 #endif 1095 } 1096 1097 static const KVMCapabilityInfo * 1098 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 1099 { 1100 while (list->name) { 1101 if (!kvm_check_extension(s, list->value)) { 1102 return list; 1103 } 1104 list++; 1105 } 1106 return NULL; 1107 } 1108 1109 void kvm_set_max_memslot_size(hwaddr max_slot_size) 1110 { 1111 g_assert( 1112 ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size 1113 ); 1114 kvm_max_slot_size = max_slot_size; 1115 } 1116 1117 static void kvm_set_phys_mem(KVMMemoryListener *kml, 1118 MemoryRegionSection *section, bool add) 1119 { 1120 KVMSlot *mem; 1121 int err; 1122 MemoryRegion *mr = section->mr; 1123 bool writeable = !mr->readonly && !mr->rom_device; 1124 hwaddr start_addr, size, slot_size; 1125 void *ram; 1126 1127 if (!memory_region_is_ram(mr)) { 1128 if (writeable || !kvm_readonly_mem_allowed) { 1129 return; 1130 } else if (!mr->romd_mode) { 1131 /* If the memory device is not in romd_mode, then we actually want 1132 * to remove the kvm memory slot so all accesses will trap. */ 1133 add = false; 1134 } 1135 } 1136 1137 size = kvm_align_section(section, &start_addr); 1138 if (!size) { 1139 return; 1140 } 1141 1142 /* use aligned delta to align the ram address */ 1143 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 1144 (start_addr - section->offset_within_address_space); 1145 1146 kvm_slots_lock(kml); 1147 1148 if (!add) { 1149 do { 1150 slot_size = MIN(kvm_max_slot_size, size); 1151 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 1152 if (!mem) { 1153 goto out; 1154 } 1155 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 1156 kvm_physical_sync_dirty_bitmap(kml, section); 1157 } 1158 1159 /* unregister the slot */ 1160 g_free(mem->dirty_bmap); 1161 mem->dirty_bmap = NULL; 1162 mem->memory_size = 0; 1163 mem->flags = 0; 1164 err = kvm_set_user_memory_region(kml, mem, false); 1165 if (err) { 1166 fprintf(stderr, "%s: error unregistering slot: %s\n", 1167 __func__, strerror(-err)); 1168 abort(); 1169 } 1170 start_addr += slot_size; 1171 size -= slot_size; 1172 } while (size); 1173 goto out; 1174 } 1175 1176 /* register the new slot */ 1177 do { 1178 slot_size = MIN(kvm_max_slot_size, size); 1179 mem = kvm_alloc_slot(kml); 1180 mem->memory_size = slot_size; 1181 mem->start_addr = start_addr; 1182 mem->ram = ram; 1183 mem->flags = kvm_mem_flags(mr); 1184 1185 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 1186 /* 1187 * Reallocate the bmap; it means it doesn't disappear in 1188 * middle of a migrate. 
1189 */ 1190 kvm_memslot_init_dirty_bitmap(mem); 1191 } 1192 err = kvm_set_user_memory_region(kml, mem, true); 1193 if (err) { 1194 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 1195 strerror(-err)); 1196 abort(); 1197 } 1198 start_addr += slot_size; 1199 ram += slot_size; 1200 size -= slot_size; 1201 } while (size); 1202 1203 out: 1204 kvm_slots_unlock(kml); 1205 } 1206 1207 static void kvm_region_add(MemoryListener *listener, 1208 MemoryRegionSection *section) 1209 { 1210 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1211 1212 memory_region_ref(section->mr); 1213 kvm_set_phys_mem(kml, section, true); 1214 } 1215 1216 static void kvm_region_del(MemoryListener *listener, 1217 MemoryRegionSection *section) 1218 { 1219 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1220 1221 kvm_set_phys_mem(kml, section, false); 1222 memory_region_unref(section->mr); 1223 } 1224 1225 static void kvm_log_sync(MemoryListener *listener, 1226 MemoryRegionSection *section) 1227 { 1228 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1229 int r; 1230 1231 kvm_slots_lock(kml); 1232 r = kvm_physical_sync_dirty_bitmap(kml, section); 1233 kvm_slots_unlock(kml); 1234 if (r < 0) { 1235 abort(); 1236 } 1237 } 1238 1239 static void kvm_log_clear(MemoryListener *listener, 1240 MemoryRegionSection *section) 1241 { 1242 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1243 int r; 1244 1245 r = kvm_physical_log_clear(kml, section); 1246 if (r < 0) { 1247 error_report_once("%s: kvm log clear failed: mr=%s " 1248 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__, 1249 section->mr->name, section->offset_within_region, 1250 int128_get64(section->size)); 1251 abort(); 1252 } 1253 } 1254 1255 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 1256 MemoryRegionSection *section, 1257 bool match_data, uint64_t data, 1258 EventNotifier *e) 1259 { 1260 int fd = event_notifier_get_fd(e); 1261 int r; 1262 1263 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1264 data, true, int128_get64(section->size), 1265 match_data); 1266 if (r < 0) { 1267 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1268 __func__, strerror(-r), -r); 1269 abort(); 1270 } 1271 } 1272 1273 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 1274 MemoryRegionSection *section, 1275 bool match_data, uint64_t data, 1276 EventNotifier *e) 1277 { 1278 int fd = event_notifier_get_fd(e); 1279 int r; 1280 1281 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1282 data, false, int128_get64(section->size), 1283 match_data); 1284 if (r < 0) { 1285 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1286 __func__, strerror(-r), -r); 1287 abort(); 1288 } 1289 } 1290 1291 static void kvm_io_ioeventfd_add(MemoryListener *listener, 1292 MemoryRegionSection *section, 1293 bool match_data, uint64_t data, 1294 EventNotifier *e) 1295 { 1296 int fd = event_notifier_get_fd(e); 1297 int r; 1298 1299 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1300 data, true, int128_get64(section->size), 1301 match_data); 1302 if (r < 0) { 1303 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1304 __func__, strerror(-r), -r); 1305 abort(); 1306 } 1307 } 1308 1309 static void kvm_io_ioeventfd_del(MemoryListener *listener, 1310 MemoryRegionSection *section, 1311 bool match_data, uint64_t data, 1312 EventNotifier *e) 1313 1314 { 1315 int fd = event_notifier_get_fd(e); 1316 
int r; 1317 1318 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1319 data, false, int128_get64(section->size), 1320 match_data); 1321 if (r < 0) { 1322 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1323 __func__, strerror(-r), -r); 1324 abort(); 1325 } 1326 } 1327 1328 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 1329 AddressSpace *as, int as_id) 1330 { 1331 int i; 1332 1333 qemu_mutex_init(&kml->slots_lock); 1334 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 1335 kml->as_id = as_id; 1336 1337 for (i = 0; i < s->nr_slots; i++) { 1338 kml->slots[i].slot = i; 1339 } 1340 1341 kml->listener.region_add = kvm_region_add; 1342 kml->listener.region_del = kvm_region_del; 1343 kml->listener.log_start = kvm_log_start; 1344 kml->listener.log_stop = kvm_log_stop; 1345 kml->listener.log_sync = kvm_log_sync; 1346 kml->listener.log_clear = kvm_log_clear; 1347 kml->listener.priority = 10; 1348 1349 memory_listener_register(&kml->listener, as); 1350 1351 for (i = 0; i < s->nr_as; ++i) { 1352 if (!s->as[i].as) { 1353 s->as[i].as = as; 1354 s->as[i].ml = kml; 1355 break; 1356 } 1357 } 1358 } 1359 1360 static MemoryListener kvm_io_listener = { 1361 .eventfd_add = kvm_io_ioeventfd_add, 1362 .eventfd_del = kvm_io_ioeventfd_del, 1363 .priority = 10, 1364 }; 1365 1366 int kvm_set_irq(KVMState *s, int irq, int level) 1367 { 1368 struct kvm_irq_level event; 1369 int ret; 1370 1371 assert(kvm_async_interrupts_enabled()); 1372 1373 event.level = level; 1374 event.irq = irq; 1375 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 1376 if (ret < 0) { 1377 perror("kvm_set_irq"); 1378 abort(); 1379 } 1380 1381 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status; 1382 } 1383 1384 #ifdef KVM_CAP_IRQ_ROUTING 1385 typedef struct KVMMSIRoute { 1386 struct kvm_irq_routing_entry kroute; 1387 QTAILQ_ENTRY(KVMMSIRoute) entry; 1388 } KVMMSIRoute; 1389 1390 static void set_gsi(KVMState *s, unsigned int gsi) 1391 { 1392 set_bit(gsi, s->used_gsi_bitmap); 1393 } 1394 1395 static void clear_gsi(KVMState *s, unsigned int gsi) 1396 { 1397 clear_bit(gsi, s->used_gsi_bitmap); 1398 } 1399 1400 void kvm_init_irq_routing(KVMState *s) 1401 { 1402 int gsi_count, i; 1403 1404 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 1405 if (gsi_count > 0) { 1406 /* Round up so we can search ints using ffs */ 1407 s->used_gsi_bitmap = bitmap_new(gsi_count); 1408 s->gsi_count = gsi_count; 1409 } 1410 1411 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 1412 s->nr_allocated_irq_routes = 0; 1413 1414 if (!kvm_direct_msi_allowed) { 1415 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 1416 QTAILQ_INIT(&s->msi_hashtab[i]); 1417 } 1418 } 1419 1420 kvm_arch_init_irq_routing(s); 1421 } 1422 1423 void kvm_irqchip_commit_routes(KVMState *s) 1424 { 1425 int ret; 1426 1427 if (kvm_gsi_direct_mapping()) { 1428 return; 1429 } 1430 1431 if (!kvm_gsi_routing_enabled()) { 1432 return; 1433 } 1434 1435 s->irq_routes->flags = 0; 1436 trace_kvm_irqchip_commit_routes(); 1437 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 1438 assert(ret == 0); 1439 } 1440 1441 static void kvm_add_routing_entry(KVMState *s, 1442 struct kvm_irq_routing_entry *entry) 1443 { 1444 struct kvm_irq_routing_entry *new; 1445 int n, size; 1446 1447 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 1448 n = s->nr_allocated_irq_routes * 2; 1449 if (n < 64) { 1450 n = 64; 1451 } 1452 size = sizeof(struct kvm_irq_routing); 1453 size += n * sizeof(*new); 1454 s->irq_routes = g_realloc(s->irq_routes, 
size); 1455 s->nr_allocated_irq_routes = n; 1456 } 1457 n = s->irq_routes->nr++; 1458 new = &s->irq_routes->entries[n]; 1459 1460 *new = *entry; 1461 1462 set_gsi(s, entry->gsi); 1463 } 1464 1465 static int kvm_update_routing_entry(KVMState *s, 1466 struct kvm_irq_routing_entry *new_entry) 1467 { 1468 struct kvm_irq_routing_entry *entry; 1469 int n; 1470 1471 for (n = 0; n < s->irq_routes->nr; n++) { 1472 entry = &s->irq_routes->entries[n]; 1473 if (entry->gsi != new_entry->gsi) { 1474 continue; 1475 } 1476 1477 if(!memcmp(entry, new_entry, sizeof *entry)) { 1478 return 0; 1479 } 1480 1481 *entry = *new_entry; 1482 1483 return 0; 1484 } 1485 1486 return -ESRCH; 1487 } 1488 1489 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1490 { 1491 struct kvm_irq_routing_entry e = {}; 1492 1493 assert(pin < s->gsi_count); 1494 1495 e.gsi = irq; 1496 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1497 e.flags = 0; 1498 e.u.irqchip.irqchip = irqchip; 1499 e.u.irqchip.pin = pin; 1500 kvm_add_routing_entry(s, &e); 1501 } 1502 1503 void kvm_irqchip_release_virq(KVMState *s, int virq) 1504 { 1505 struct kvm_irq_routing_entry *e; 1506 int i; 1507 1508 if (kvm_gsi_direct_mapping()) { 1509 return; 1510 } 1511 1512 for (i = 0; i < s->irq_routes->nr; i++) { 1513 e = &s->irq_routes->entries[i]; 1514 if (e->gsi == virq) { 1515 s->irq_routes->nr--; 1516 *e = s->irq_routes->entries[s->irq_routes->nr]; 1517 } 1518 } 1519 clear_gsi(s, virq); 1520 kvm_arch_release_virq_post(virq); 1521 trace_kvm_irqchip_release_virq(virq); 1522 } 1523 1524 void kvm_irqchip_add_change_notifier(Notifier *n) 1525 { 1526 notifier_list_add(&kvm_irqchip_change_notifiers, n); 1527 } 1528 1529 void kvm_irqchip_remove_change_notifier(Notifier *n) 1530 { 1531 notifier_remove(n); 1532 } 1533 1534 void kvm_irqchip_change_notify(void) 1535 { 1536 notifier_list_notify(&kvm_irqchip_change_notifiers, NULL); 1537 } 1538 1539 static unsigned int kvm_hash_msi(uint32_t data) 1540 { 1541 /* This is optimized for IA32 MSI layout. However, no other arch shall 1542 * repeat the mistake of not providing a direct MSI injection API. */ 1543 return data & 0xff; 1544 } 1545 1546 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1547 { 1548 KVMMSIRoute *route, *next; 1549 unsigned int hash; 1550 1551 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1552 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1553 kvm_irqchip_release_virq(s, route->kroute.gsi); 1554 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1555 g_free(route); 1556 } 1557 } 1558 } 1559 1560 static int kvm_irqchip_get_virq(KVMState *s) 1561 { 1562 int next_virq; 1563 1564 /* 1565 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1566 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1567 * number can succeed even though a new route entry cannot be added. 1568 * When this happens, flush dynamic MSI entries to free IRQ route entries. 
1569 */ 1570 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1571 kvm_flush_dynamic_msi_routes(s); 1572 } 1573 1574 /* Return the lowest unused GSI in the bitmap */ 1575 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1576 if (next_virq >= s->gsi_count) { 1577 return -ENOSPC; 1578 } else { 1579 return next_virq; 1580 } 1581 } 1582 1583 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1584 { 1585 unsigned int hash = kvm_hash_msi(msg.data); 1586 KVMMSIRoute *route; 1587 1588 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1589 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1590 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1591 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1592 return route; 1593 } 1594 } 1595 return NULL; 1596 } 1597 1598 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1599 { 1600 struct kvm_msi msi; 1601 KVMMSIRoute *route; 1602 1603 if (kvm_direct_msi_allowed) { 1604 msi.address_lo = (uint32_t)msg.address; 1605 msi.address_hi = msg.address >> 32; 1606 msi.data = le32_to_cpu(msg.data); 1607 msi.flags = 0; 1608 memset(msi.pad, 0, sizeof(msi.pad)); 1609 1610 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1611 } 1612 1613 route = kvm_lookup_msi_route(s, msg); 1614 if (!route) { 1615 int virq; 1616 1617 virq = kvm_irqchip_get_virq(s); 1618 if (virq < 0) { 1619 return virq; 1620 } 1621 1622 route = g_malloc0(sizeof(KVMMSIRoute)); 1623 route->kroute.gsi = virq; 1624 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1625 route->kroute.flags = 0; 1626 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1627 route->kroute.u.msi.address_hi = msg.address >> 32; 1628 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1629 1630 kvm_add_routing_entry(s, &route->kroute); 1631 kvm_irqchip_commit_routes(s); 1632 1633 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1634 entry); 1635 } 1636 1637 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1638 1639 return kvm_set_irq(s, route->kroute.gsi, 1); 1640 } 1641 1642 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1643 { 1644 struct kvm_irq_routing_entry kroute = {}; 1645 int virq; 1646 MSIMessage msg = {0, 0}; 1647 1648 if (pci_available && dev) { 1649 msg = pci_get_msi_message(dev, vector); 1650 } 1651 1652 if (kvm_gsi_direct_mapping()) { 1653 return kvm_arch_msi_data_to_gsi(msg.data); 1654 } 1655 1656 if (!kvm_gsi_routing_enabled()) { 1657 return -ENOSYS; 1658 } 1659 1660 virq = kvm_irqchip_get_virq(s); 1661 if (virq < 0) { 1662 return virq; 1663 } 1664 1665 kroute.gsi = virq; 1666 kroute.type = KVM_IRQ_ROUTING_MSI; 1667 kroute.flags = 0; 1668 kroute.u.msi.address_lo = (uint32_t)msg.address; 1669 kroute.u.msi.address_hi = msg.address >> 32; 1670 kroute.u.msi.data = le32_to_cpu(msg.data); 1671 if (pci_available && kvm_msi_devid_required()) { 1672 kroute.flags = KVM_MSI_VALID_DEVID; 1673 kroute.u.msi.devid = pci_requester_id(dev); 1674 } 1675 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1676 kvm_irqchip_release_virq(s, virq); 1677 return -EINVAL; 1678 } 1679 1680 trace_kvm_irqchip_add_msi_route(dev ? 
dev->name : (char *)"N/A", 1681 vector, virq); 1682 1683 kvm_add_routing_entry(s, &kroute); 1684 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1685 kvm_irqchip_commit_routes(s); 1686 1687 return virq; 1688 } 1689 1690 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1691 PCIDevice *dev) 1692 { 1693 struct kvm_irq_routing_entry kroute = {}; 1694 1695 if (kvm_gsi_direct_mapping()) { 1696 return 0; 1697 } 1698 1699 if (!kvm_irqchip_in_kernel()) { 1700 return -ENOSYS; 1701 } 1702 1703 kroute.gsi = virq; 1704 kroute.type = KVM_IRQ_ROUTING_MSI; 1705 kroute.flags = 0; 1706 kroute.u.msi.address_lo = (uint32_t)msg.address; 1707 kroute.u.msi.address_hi = msg.address >> 32; 1708 kroute.u.msi.data = le32_to_cpu(msg.data); 1709 if (pci_available && kvm_msi_devid_required()) { 1710 kroute.flags = KVM_MSI_VALID_DEVID; 1711 kroute.u.msi.devid = pci_requester_id(dev); 1712 } 1713 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1714 return -EINVAL; 1715 } 1716 1717 trace_kvm_irqchip_update_msi_route(virq); 1718 1719 return kvm_update_routing_entry(s, &kroute); 1720 } 1721 1722 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, 1723 EventNotifier *resample, int virq, 1724 bool assign) 1725 { 1726 int fd = event_notifier_get_fd(event); 1727 int rfd = resample ? event_notifier_get_fd(resample) : -1; 1728 1729 struct kvm_irqfd irqfd = { 1730 .fd = fd, 1731 .gsi = virq, 1732 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN, 1733 }; 1734 1735 if (rfd != -1) { 1736 assert(assign); 1737 if (kvm_irqchip_is_split()) { 1738 /* 1739 * When the slow irqchip (e.g. IOAPIC) is in the 1740 * userspace, KVM kernel resamplefd will not work because 1741 * the EOI of the interrupt will be delivered to userspace 1742 * instead, so the KVM kernel resamplefd kick will be 1743 * skipped. The userspace here mimics what the kernel 1744 * provides with resamplefd, remember the resamplefd and 1745 * kick it when we receive EOI of this IRQ. 1746 * 1747 * This is hackery because IOAPIC is mostly bypassed 1748 * (except EOI broadcasts) when irqfd is used. However 1749 * this can bring much performance back for split irqchip 1750 * with INTx IRQs (for VFIO, this gives 93% perf of the 1751 * full fast path, which is 46% perf boost comparing to 1752 * the INTx slow path). 
1753 */ 1754 kvm_resample_fd_insert(virq, resample); 1755 } else { 1756 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1757 irqfd.resamplefd = rfd; 1758 } 1759 } else if (!assign) { 1760 if (kvm_irqchip_is_split()) { 1761 kvm_resample_fd_remove(virq); 1762 } 1763 } 1764 1765 if (!kvm_irqfds_enabled()) { 1766 return -ENOSYS; 1767 } 1768 1769 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1770 } 1771 1772 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1773 { 1774 struct kvm_irq_routing_entry kroute = {}; 1775 int virq; 1776 1777 if (!kvm_gsi_routing_enabled()) { 1778 return -ENOSYS; 1779 } 1780 1781 virq = kvm_irqchip_get_virq(s); 1782 if (virq < 0) { 1783 return virq; 1784 } 1785 1786 kroute.gsi = virq; 1787 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1788 kroute.flags = 0; 1789 kroute.u.adapter.summary_addr = adapter->summary_addr; 1790 kroute.u.adapter.ind_addr = adapter->ind_addr; 1791 kroute.u.adapter.summary_offset = adapter->summary_offset; 1792 kroute.u.adapter.ind_offset = adapter->ind_offset; 1793 kroute.u.adapter.adapter_id = adapter->adapter_id; 1794 1795 kvm_add_routing_entry(s, &kroute); 1796 1797 return virq; 1798 } 1799 1800 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1801 { 1802 struct kvm_irq_routing_entry kroute = {}; 1803 int virq; 1804 1805 if (!kvm_gsi_routing_enabled()) { 1806 return -ENOSYS; 1807 } 1808 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1809 return -ENOSYS; 1810 } 1811 virq = kvm_irqchip_get_virq(s); 1812 if (virq < 0) { 1813 return virq; 1814 } 1815 1816 kroute.gsi = virq; 1817 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1818 kroute.flags = 0; 1819 kroute.u.hv_sint.vcpu = vcpu; 1820 kroute.u.hv_sint.sint = sint; 1821 1822 kvm_add_routing_entry(s, &kroute); 1823 kvm_irqchip_commit_routes(s); 1824 1825 return virq; 1826 } 1827 1828 #else /* !KVM_CAP_IRQ_ROUTING */ 1829 1830 void kvm_init_irq_routing(KVMState *s) 1831 { 1832 } 1833 1834 void kvm_irqchip_release_virq(KVMState *s, int virq) 1835 { 1836 } 1837 1838 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1839 { 1840 abort(); 1841 } 1842 1843 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1844 { 1845 return -ENOSYS; 1846 } 1847 1848 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1849 { 1850 return -ENOSYS; 1851 } 1852 1853 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1854 { 1855 return -ENOSYS; 1856 } 1857 1858 static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event, 1859 EventNotifier *resample, int virq, 1860 bool assign) 1861 { 1862 abort(); 1863 } 1864 1865 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1866 { 1867 return -ENOSYS; 1868 } 1869 #endif /* !KVM_CAP_IRQ_ROUTING */ 1870 1871 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1872 EventNotifier *rn, int virq) 1873 { 1874 return kvm_irqchip_assign_irqfd(s, n, rn, virq, true); 1875 } 1876 1877 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1878 int virq) 1879 { 1880 return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false); 1881 } 1882 1883 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 1884 EventNotifier *rn, qemu_irq irq) 1885 { 1886 gpointer key, gsi; 1887 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1888 1889 if (!found) { 1890 return -ENXIO; 1891 } 1892 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 1893 } 1894 1895 int 
kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 1896 qemu_irq irq) 1897 { 1898 gpointer key, gsi; 1899 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1900 1901 if (!found) { 1902 return -ENXIO; 1903 } 1904 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 1905 } 1906 1907 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 1908 { 1909 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 1910 } 1911 1912 static void kvm_irqchip_create(KVMState *s) 1913 { 1914 int ret; 1915 1916 assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO); 1917 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 1918 ; 1919 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 1920 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 1921 if (ret < 0) { 1922 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 1923 exit(1); 1924 } 1925 } else { 1926 return; 1927 } 1928 1929 /* First probe and see if there's a arch-specific hook to create the 1930 * in-kernel irqchip for us */ 1931 ret = kvm_arch_irqchip_create(s); 1932 if (ret == 0) { 1933 if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) { 1934 perror("Split IRQ chip mode not supported."); 1935 exit(1); 1936 } else { 1937 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 1938 } 1939 } 1940 if (ret < 0) { 1941 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 1942 exit(1); 1943 } 1944 1945 kvm_kernel_irqchip = true; 1946 /* If we have an in-kernel IRQ chip then we must have asynchronous 1947 * interrupt delivery (though the reverse is not necessarily true) 1948 */ 1949 kvm_async_interrupts_allowed = true; 1950 kvm_halt_in_kernel_allowed = true; 1951 1952 kvm_init_irq_routing(s); 1953 1954 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 1955 } 1956 1957 /* Find number of supported CPUs using the recommended 1958 * procedure from the kernel API documentation to cope with 1959 * older kernels that may be missing capabilities. 1960 */ 1961 static int kvm_recommended_vcpus(KVMState *s) 1962 { 1963 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 1964 return (ret) ? ret : 4; 1965 } 1966 1967 static int kvm_max_vcpus(KVMState *s) 1968 { 1969 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 1970 return (ret) ? ret : kvm_recommended_vcpus(s); 1971 } 1972 1973 static int kvm_max_vcpu_id(KVMState *s) 1974 { 1975 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 1976 return (ret) ? ret : kvm_max_vcpus(s); 1977 } 1978 1979 bool kvm_vcpu_id_is_valid(int vcpu_id) 1980 { 1981 KVMState *s = KVM_STATE(current_accel()); 1982 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1983 } 1984 1985 static int kvm_init(MachineState *ms) 1986 { 1987 MachineClass *mc = MACHINE_GET_CLASS(ms); 1988 static const char upgrade_note[] = 1989 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1990 "(see http://sourceforge.net/projects/kvm).\n"; 1991 struct { 1992 const char *name; 1993 int num; 1994 } num_cpus[] = { 1995 { "SMP", ms->smp.cpus }, 1996 { "hotpluggable", ms->smp.max_cpus }, 1997 { NULL, } 1998 }, *nc = num_cpus; 1999 int soft_vcpus_limit, hard_vcpus_limit; 2000 KVMState *s; 2001 const KVMCapabilityInfo *missing_cap; 2002 int ret; 2003 int type = 0; 2004 uint64_t dirty_log_manual_caps; 2005 2006 s = KVM_STATE(ms->accelerator); 2007 2008 /* 2009 * On systems where the kernel can support different base page 2010 * sizes, host page size may be different from TARGET_PAGE_SIZE, 2011 * even with KVM. 
TARGET_PAGE_SIZE is assumed to be the minimum 2012 * page size for the system though. 2013 */ 2014 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size); 2015 2016 s->sigmask_len = 8; 2017 2018 #ifdef KVM_CAP_SET_GUEST_DEBUG 2019 QTAILQ_INIT(&s->kvm_sw_breakpoints); 2020 #endif 2021 QLIST_INIT(&s->kvm_parked_vcpus); 2022 s->vmfd = -1; 2023 s->fd = qemu_open_old("/dev/kvm", O_RDWR); 2024 if (s->fd == -1) { 2025 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 2026 ret = -errno; 2027 goto err; 2028 } 2029 2030 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 2031 if (ret < KVM_API_VERSION) { 2032 if (ret >= 0) { 2033 ret = -EINVAL; 2034 } 2035 fprintf(stderr, "kvm version too old\n"); 2036 goto err; 2037 } 2038 2039 if (ret > KVM_API_VERSION) { 2040 ret = -EINVAL; 2041 fprintf(stderr, "kvm version not supported\n"); 2042 goto err; 2043 } 2044 2045 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 2046 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 2047 2048 /* If unspecified, use the default value */ 2049 if (!s->nr_slots) { 2050 s->nr_slots = 32; 2051 } 2052 2053 s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); 2054 if (s->nr_as <= 1) { 2055 s->nr_as = 1; 2056 } 2057 s->as = g_new0(struct KVMAs, s->nr_as); 2058 2059 if (object_property_find(OBJECT(current_machine), "kvm-type")) { 2060 g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine), 2061 "kvm-type", 2062 &error_abort); 2063 type = mc->kvm_type(ms, kvm_type); 2064 } else if (mc->kvm_type) { 2065 type = mc->kvm_type(ms, NULL); 2066 } 2067 2068 do { 2069 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 2070 } while (ret == -EINTR); 2071 2072 if (ret < 0) { 2073 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 2074 strerror(-ret)); 2075 2076 #ifdef TARGET_S390X 2077 if (ret == -EINVAL) { 2078 fprintf(stderr, 2079 "Host kernel setup problem detected. 
static int kvm_init(MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    struct {
        const char *name;
        int num;
    } num_cpus[] = {
        { "SMP",          ms->smp.cpus },
        { "hotpluggable", ms->smp.max_cpus },
        { NULL, }
    }, *nc = num_cpus;
    int soft_vcpus_limit, hard_vcpus_limit;
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int type = 0;
    uint64_t dirty_log_manual_caps;

    s = KVM_STATE(ms->accelerator);

    /*
     * On systems where the kernel can support different base page
     * sizes, host page size may be different from TARGET_PAGE_SIZE,
     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
     * page size for the system though.
     */
    assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);

    s->sigmask_len = 8;

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    QLIST_INIT(&s->kvm_parked_vcpus);
    s->vmfd = -1;
    s->fd = qemu_open_old("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret >= 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);

    /* If unspecified, use the default value */
    if (!s->nr_slots) {
        s->nr_slots = 32;
    }

    s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
    if (s->nr_as <= 1) {
        s->nr_as = 1;
    }
    s->as = g_new0(struct KVMAs, s->nr_as);

    if (object_property_find(OBJECT(current_machine), "kvm-type")) {
        g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
                                                            "kvm-type",
                                                            &error_abort);
        type = mc->kvm_type(ms, kvm_type);
    } else if (mc->kvm_type) {
        type = mc->kvm_type(ms, NULL);
    }

    do {
        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
    } while (ret == -EINTR);

    if (ret < 0) {
        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
                strerror(-ret));

#ifdef TARGET_S390X
        if (ret == -EINVAL) {
            fprintf(stderr,
                    "Host kernel setup problem detected. Please verify:\n");
            fprintf(stderr, "- for kernels supporting the switch_amode or"
                    " user_mode parameters, whether\n");
            fprintf(stderr,
                    "  user space is running in primary address space\n");
            fprintf(stderr,
                    "- for kernels supporting the vm.allocate_pgste sysctl, "
                    "whether it is enabled\n");
        }
#endif
        goto err;
    }

    s->vmfd = ret;

    /* check the vcpu limits */
    soft_vcpus_limit = kvm_recommended_vcpus(s);
    hard_vcpus_limit = kvm_max_vcpus(s);

    while (nc->name) {
        if (nc->num > soft_vcpus_limit) {
            warn_report("Number of %s cpus requested (%d) exceeds "
                        "the recommended cpus supported by KVM (%d)",
                        nc->name, nc->num, soft_vcpus_limit);

            if (nc->num > hard_vcpus_limit) {
                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
                        "the maximum cpus supported by KVM (%d)\n",
                        nc->name, nc->num, hard_vcpus_limit);
                exit(1);
            }
        }
        nc++;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_pio = s->coalesced_mmio &&
                       kvm_check_extension(s, KVM_CAP_COALESCED_PIO);

    dirty_log_manual_caps =
        kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
    dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
                              KVM_DIRTY_LOG_INITIALLY_SET);
    s->manual_dirty_log_protect = dirty_log_manual_caps;
    if (dirty_log_manual_caps) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
                                dirty_log_manual_caps);
        if (ret) {
            warn_report("Trying to enable capability %"PRIu64" of "
                        "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 but failed. "
                        "Falling back to the legacy mode. ",
                        dirty_log_manual_caps);
            s->manual_dirty_log_protect = 0;
        }
    }

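    /*
     * Illustrative note (not part of the original file): with
     * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 enabled, dirty-page tracking becomes
     * a two-step protocol instead of the legacy single ioctl, roughly:
     *
     *     kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &log);     // fetch bitmap
     *     ... process the bitmap ...
     *     kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &clear); // re-protect pages
     *
     * which lets dirty logging re-protect guest pages in smaller chunks,
     * e.g. during live migration.
     */
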
#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);

#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);

#ifdef KVM_CAP_IRQ_ROUTING
    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
#endif

    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);

    s->irq_set_ioctl = KVM_IRQ_LINE;
    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
    }

    kvm_readonly_mem_allowed =
        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);

    kvm_eventfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);

    kvm_irqfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);

    kvm_resamplefds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);

    kvm_vm_attributes_allowed =
        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);

    kvm_ioeventfd_any_length_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);

    kvm_state = s;

    ret = kvm_arch_init(ms, s);
    if (ret < 0) {
        goto err;
    }

    if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
        s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
    }

    qemu_register_reset(kvm_unpoison_all, NULL);

    if (s->kernel_irqchip_allowed) {
        kvm_irqchip_create(s);
    }

    if (kvm_eventfds_allowed) {
        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
    }
    s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
    s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;

    kvm_memory_listener_register(s, &s->memory_listener,
                                 &address_space_memory, 0);
    if (kvm_eventfds_allowed) {
        memory_listener_register(&kvm_io_listener,
                                 &address_space_io);
    }
    memory_listener_register(&kvm_coalesced_pio_listener,
                             &address_space_io);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
    if (!s->sync_mmu) {
        ret = ram_block_discard_disable(true);
        assert(!ret);
    }
    return 0;

err:
    assert(ret < 0);
    if (s->vmfd >= 0) {
        close(s->vmfd);
    }
    if (s->fd != -1) {
        close(s->fd);
    }
    g_free(s->memory_listener.slots);

    return ret;
}

void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
{
    s->sigmask_len = sigmask_len;
}

static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        address_space_rw(&address_space_io, port, attrs,
                         ptr, size,
                         direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}

static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error. Suberror: %d\n",
            run->internal.suberror);

    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(cpu)) {
            cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}

void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;

    if (s->coalesced_flush_in_progress) {
        return;
    }

    s->coalesced_flush_in_progress = true;

    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            if (ent->pio == 1) {
                address_space_write(&address_space_io, ent->phys_addr,
                                    MEMTXATTRS_UNSPECIFIED, ent->data,
                                    ent->len);
            } else {
                cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            }
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }

    s->coalesced_flush_in_progress = false;
}

bool kvm_cpu_check_are_resettable(void)
{
    return kvm_arch_cpu_check_are_resettable();
}

static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty) {
        kvm_arch_get_registers(cpu);
        cpu->vcpu_dirty = true;
    }
}

void kvm_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->vcpu_dirty) {
        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}

static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
    cpu->vcpu_dirty = false;
}

void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}

static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
    cpu->vcpu_dirty = false;
}

void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}

static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}

void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}

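/*
 * Illustrative sketch (not part of this file): callers normally go through
 * the accel-agnostic wrappers in sysemu/hw_accel.h rather than calling the
 * kvm_* variants directly, e.g. before inspecting guest registers:
 *
 *     cpu_synchronize_state(cpu);   // pull registers out of KVM once
 *     // ... arch CPU state (the per-target env fields) is now readable
 *
 * The vcpu_dirty flag above makes the pull lazy: registers are fetched at
 * most once and pushed back before the next KVM_RUN.
 */
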
#ifdef KVM_HAVE_MCE_INJECTION
static __thread void *pending_sigbus_addr;
static __thread int pending_sigbus_code;
static __thread bool have_sigbus_pending;
#endif

static void kvm_cpu_kick(CPUState *cpu)
{
    qatomic_set(&cpu->kvm_run->immediate_exit, 1);
}

static void kvm_cpu_kick_self(void)
{
    if (kvm_immediate_exit) {
        kvm_cpu_kick(current_cpu);
    } else {
        qemu_cpu_kick_self();
    }
}

static void kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    if (kvm_immediate_exit) {
        qatomic_set(&cpu->kvm_run->immediate_exit, 0);
        /* Write kvm_run->immediate_exit before the cpu->exit_request
         * write in kvm_cpu_exec.
         */
        smp_wmb();
        return;
    }

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI));
}

int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        qatomic_set(&cpu->exit_request, 0);
        return EXCP_HLT;
    }

    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);

    do {
        MemTxAttrs attrs;

        if (cpu->vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (qatomic_read(&cpu->exit_request)) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            kvm_cpu_kick_self();
        }

        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
         * Matching barrier in kvm_eat_signals.
         */
        smp_rmb();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);

#ifdef KVM_HAVE_MCE_INJECTION
        if (unlikely(have_sigbus_pending)) {
            qemu_mutex_lock_iothread();
            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
                                    pending_sigbus_addr);
            have_sigbus_pending = false;
            qemu_mutex_unlock_iothread();
        }
#endif

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                kvm_eat_signals(cpu);
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
#ifdef TARGET_PPC
            if (run_ret == -EBUSY) {
                fprintf(stderr,
                        "This is probably because your SMT is enabled.\n"
                        "VCPU can only run on primary threads with all "
                        "secondary threads offline.\n");
            }
#endif
            ret = -1;
            break;
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        case KVM_EXIT_SYSTEM_EVENT:
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                qemu_mutex_lock_iothread();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                qemu_mutex_unlock_iothread();
                ret = 0;
                break;
            default:
                DPRINTF("kvm_arch_handle_exit\n");
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    qatomic_set(&cpu->exit_request, 0);
    return ret;
}

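/*
 * Illustrative sketch (not part of this file): the per-vCPU thread in
 * kvm-cpus.c drives kvm_cpu_exec() in a loop along the lines of
 *
 *     do {
 *         if (cpu_can_run(cpu)) {
 *             r = kvm_cpu_exec(cpu);
 *             if (r == EXCP_DEBUG) {
 *                 cpu_handle_guest_debug(cpu);
 *             }
 *         }
 *         qemu_wait_io_event(cpu);
 *     } while (!cpu->unplug || cpu_can_run(cpu));
 *
 * so EXCP_INTERRUPT/EXCP_HLT simply bounce back to the thread loop, while a
 * negative return value stops the VM (see vm_stop() above).
 */
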
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_ioctl(type, arg);
    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vm_ioctl(type, arg);
    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
    ret = ioctl(cpu->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_device_ioctl(int fd, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_device_ioctl(fd, type, arg);
    ret = ioctl(fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    int ret;
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return ret ? 0 : 1;
}

int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
        .flags = 0,
    };

    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
}

int kvm_device_access(int fd, int group, uint64_t attr,
                      void *val, bool write, Error **errp)
{
    struct kvm_device_attr kvmattr;
    int err;

    kvmattr.flags = 0;
    kvmattr.group = group;
    kvmattr.attr = attr;
    kvmattr.addr = (uintptr_t)val;

    err = kvm_device_ioctl(fd,
                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                           &kvmattr);
    if (err < 0) {
        error_setg_errno(errp, -err,
                         "KVM_%s_DEVICE_ATTR failed: Group %d "
                         "attr 0x%016" PRIx64,
                         write ? "SET" : "GET", group, attr);
    }
    return err;
}

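/*
 * Illustrative sketch (not part of this file): a device model holding an
 * in-kernel device fd typically probes an attribute before accessing it.
 * SOME_GROUP, SOME_ATTR and SOME_FLAG below are placeholders.
 *
 *     uint32_t val;
 *
 *     if (kvm_device_check_attr(dev_fd, SOME_GROUP, SOME_ATTR)) {
 *         kvm_device_access(dev_fd, SOME_GROUP, SOME_ATTR,
 *                           &val, false, &error_abort);   // read
 *         val |= SOME_FLAG;
 *         kvm_device_access(dev_fd, SOME_GROUP, SOME_ATTR,
 *                           &val, true, &error_abort);    // write
 *     }
 */
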
bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_max_nested_state_length(void)
{
    return kvm_state->max_nested_state_len;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    int err;
};

static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *dbg_data =
        (struct kvm_set_guest_debug_data *) data.host_ptr;

    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
               RUN_ON_CPU_HOST_PTR(&data));
    return data.err;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

/* Called asynchronously in VCPU thread. */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    qatomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
                                 hwaddr start_addr, hwaddr size)
{
    KVMState *kvm = KVM_STATE(ms->accelerator);
    int i;

    for (i = 0; i < kvm->nr_as; ++i) {
        if (kvm->as[i].as == as && kvm->as[i].ml) {
            size = MIN(kvm_max_slot_size, size);
            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
                                                    start_addr, size);
        }
    }

    return false;
}

static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    int64_t value = s->kvm_shadow_mem;

    visit_type_int(v, name, &value, errp);
}

static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    int64_t value;

    if (!visit_type_int(v, name, &value, errp)) {
        return;
    }

    s->kvm_shadow_mem = value;
}

static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    KVMState *s = KVM_STATE(obj);
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }
    switch (mode) {
    case ON_OFF_SPLIT_ON:
        s->kernel_irqchip_allowed = true;
        s->kernel_irqchip_required = true;
        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
        break;
    case ON_OFF_SPLIT_OFF:
        s->kernel_irqchip_allowed = false;
        s->kernel_irqchip_required = false;
        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
        break;
    case ON_OFF_SPLIT_SPLIT:
        s->kernel_irqchip_allowed = true;
        s->kernel_irqchip_required = true;
        s->kernel_irqchip_split = ON_OFF_AUTO_ON;
        break;
    default:
        /* The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}

bool kvm_kernel_irqchip_allowed(void)
{
    return kvm_state->kernel_irqchip_allowed;
}

bool kvm_kernel_irqchip_required(void)
{
    return kvm_state->kernel_irqchip_required;
}

bool kvm_kernel_irqchip_split(void)
{
    return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
}

static void kvm_accel_instance_init(Object *obj)
{
    KVMState *s = KVM_STATE(obj);

    s->kvm_shadow_mem = -1;
    s->kernel_irqchip_allowed = true;
    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;

    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
        NULL, kvm_set_kernel_irqchip,
        NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure KVM in-kernel irqchip");

    object_class_property_add(oc, "kvm-shadow-mem", "int",
        kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
        NULL, NULL);
    object_class_property_set_description(oc, "kvm-shadow-mem",
        "KVM shadow MMU size");
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .instance_init = kvm_accel_instance_init,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
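/*
 * Illustrative usage (not part of the original file): the two accelerator
 * properties registered above are normally set on the command line, e.g.
 *
 *     qemu-system-x86_64 -accel kvm,kernel-irqchip=split,kvm-shadow-mem=16777216 ...
 *
 * Depending on the QEMU version, "kernel-irqchip" may also be accepted via
 * the legacy -machine kernel_irqchip=... syntax.
 */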