/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/hw.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
#include "sysemu/cpus.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "exec/address-spaces.h"
#include "qemu/event_notifier.h"
#include "trace.h"
#include "hw/irq.h"
#include "sysemu/sev.h"
#include "sysemu/balloon.h"

#include "hw/boards.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#define PAGE_SIZE getpagesize()

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define KVM_MSI_HASHTAB_SIZE    256

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

struct KVMState
{
    AccelState parent_obj;

    int nr_slots;
    int fd;
    int vmfd;
    int coalesced_mmio;
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
    int many_ioeventfds;
    int intx_set_mask;
    bool sync_mmu;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things */
    unsigned irq_set_ioctl;
    unsigned int sigmask_len;
    GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    unsigned long *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
    KVMMemoryListener memory_listener;
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* memory encryption */
    void *memcrypt_handle;
    int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_eventfds_allowed;
bool kvm_irqfds_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
static bool kvm_immediate_exit;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_LAST_INFO
};

int kvm_get_max_memslots(void)
{
    KVMState *s = KVM_STATE(current_machine->accelerator);

    return s->nr_slots;
}

bool kvm_memcrypt_enabled(void)
{
    if (kvm_state && kvm_state->memcrypt_handle) {
        return true;
    }

    return false;
}

int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
{
    if (kvm_state->memcrypt_handle &&
        kvm_state->memcrypt_encrypt_data) {
        return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle,
                                                ptr, len);
    }

    return 1;
}

static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    return NULL;
}

bool kvm_has_free_slot(MachineState *ms)
{
    KVMState *s = KVM_STATE(ms->accelerator);

    return kvm_get_free_slot(&s->memory_listener);
}

static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                         hwaddr start_addr,
                                         hwaddr size)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (start_addr == mem->start_addr && size == mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to the next page boundary and truncate the size to the
       previous one. */
    aligned = ROUND_UP(section->offset_within_address_space,
                       qemu_real_host_page_size);
    delta = aligned - section->offset_within_address_space;
    *start = aligned;
    if (delta > size) {
        return 0;
    }

    return (size - delta) & qemu_real_host_page_mask;
}

int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            return 1;
        }
    }

    return 0;
}

static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;

    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;
        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    }
    mem.memory_size = slot->memory_size;
    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    slot->old_flags = mem.flags;
    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
                              mem.memory_size, mem.userspace_addr, ret);
    return ret;
}

int kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    struct KVMParkedVcpu *vcpu = NULL;
    int ret = 0;

    DPRINTF("kvm_destroy_vcpu\n");

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    vcpu = g_malloc0(sizeof(*vcpu));
    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
    vcpu->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
err:
    return ret;
}

static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *cpu;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        if (cpu->vcpu_id == vcpu_id) {
            int kvm_fd;

            QLIST_REMOVE(cpu, node);
            kvm_fd = cpu->kvm_fd;
            g_free(cpu);
            return kvm_fd;
        }
    }

    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}

int kvm_init_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    cpu->kvm_fd = ret;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }
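
    /* KVM_CAP_COALESCED_MMIO reports the page offset of the coalesced MMIO
     * ring within the vcpu mmap area; map it once, on the first vcpu. */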
    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(cpu);
err:
    return ret;
}

/*
 * dirty pages logging control
 */

static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    return flags;
}

static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(kml, mem, false);
}

static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size;
    KVMSlot *mem;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    mem = kvm_lookup_matching_slot(kml, start_addr, size);
    if (!mem) {
        /* We don't have a slot if we want to trap every access. */
        return 0;
    }

    return kvm_slot_update_flags(kml, mem, section->mr);
}

static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
                                         unsigned long *bitmap)
{
    ram_addr_t start = section->offset_within_region +
                       memory_region_get_ram_addr(section->mr);
    ram_addr_t pages = int128_get64(section->size) / getpagesize();

    cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 *
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty_lebitmap(): all pages that KVM reports
 * as dirty are marked dirty in qemu's bitmap.
 *
 * @kml: the KVM memory listener
 * @section: the memory section to sync the dirty bitmap of
 */
static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                          MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    struct kvm_dirty_log d = {};
    KVMSlot *mem;
    hwaddr start_addr, size;

    size = kvm_align_section(section, &start_addr);
    if (size) {
        mem = kvm_lookup_matching_slot(kml, start_addr, size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            return 0;
        }

        /* XXX bad kernel interface alert
         * For dirty bitmap, kernel allocates array of size aligned to
         * bits-per-long.  But when the kernel is 64-bit and userspace is
         * 32-bit, userspace can't align to the same bits-per-long, since
         * sizeof(long) differs between kernel and user space.  As a result,
         * userspace may provide a buffer that is 4 bytes smaller than what
         * the kernel will use, causing userspace memory corruption (which
         * is not detectable by valgrind in most cases, either).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
         * the hope that sizeof(long) won't become >8 any time soon.
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
        d.dirty_bitmap = g_malloc0(size);

        d.slot = mem->slot | (kml->as_id << 16);
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            g_free(d.dirty_bitmap);
            return -1;
        }

        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
        g_free(d.dirty_bitmap);
    }

    return 0;
}

static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *section,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *section,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static MemoryListener kvm_coalesced_pio_listener = {
    .coalesced_io_add = kvm_coalesce_pio_add,
    .coalesced_io_del = kvm_coalesce_pio_del,
};

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}

static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    switch (size) {
    case 2:
        val = bswap16(val);
        break;
    case 4:
        val = bswap32(val);
        break;
    }
#endif
    return val;
}
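
/* Register (or, with assign == false, deregister) an eventfd with KVM so
 * that a guest write of the given size (optionally matching @val) to the
 * MMIO address @addr signals @fd directly in the kernel, without a return
 * to userspace. */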
static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    int ret;
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = 0,
        .fd = fd,
    };

    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
                                 datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (datamatch) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
}

static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;
    trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}


static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD)
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    int err;
    MemoryRegion *mr = section->mr;
    bool writeable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size;
    void *ram;

    if (!memory_region_is_ram(mr)) {
        if (writeable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
    }

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return;
    }

    /* use aligned delta to align the ram address */
    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
          (start_addr - section->offset_within_address_space);

    if (!add) {
        mem = kvm_lookup_matching_slot(kml, start_addr, size);
        if (!mem) {
            return;
        }
        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
            kvm_physical_sync_dirty_bitmap(kml, section);
        }

        /* unregister the slot */
        mem->memory_size = 0;
        mem->flags = 0;
        err = kvm_set_user_memory_region(kml, mem, false);
        if (err) {
            fprintf(stderr, "%s: error unregistering slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }
        return;
    }

    /* register the new slot */
    mem = kvm_alloc_slot(kml);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->ram = ram;
    mem->flags = kvm_mem_flags(mr);

    err = kvm_set_user_memory_region(kml, mem, true);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    memory_region_ref(section->mr);
    kvm_set_phys_mem(kml, section, true);
}

static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    kvm_set_phys_mem(kml, section, false);
    memory_region_unref(section->mr);
}

static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    r = kvm_physical_sync_dirty_bitmap(kml, section);
    if (r < 0) {
        abort();
    }
}
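
/* MemoryListener callbacks that bind/unbind eventfds to guest MMIO and PIO
 * addresses via KVM_IOEVENTFD, so device notifications (e.g. virtio queue
 * kicks) are handled without a heavyweight exit to userspace. */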
static void kvm_mem_ioeventfd_add(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, true, int128_get64(section->size),
                               match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
        abort();
    }
}

static void kvm_mem_ioeventfd_del(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, false, int128_get64(section->size),
                               match_data);
    if (r < 0) {
        abort();
    }
}

static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, true, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
        abort();
    }
}

static void kvm_io_ioeventfd_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)

{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, false, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        abort();
    }
}

void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id)
{
    int i;

    kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
    kml->as_id = as_id;

    for (i = 0; i < s->nr_slots; i++) {
        kml->slots[i].slot = i;
    }

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.log_sync = kvm_log_sync;
    kml->listener.priority = 10;

    memory_listener_register(&kml->listener, as);
}

static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};
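
/* Raise or lower an interrupt line of the in-kernel irqchip.  Returns the
 * injection status when KVM_IRQ_LINE_STATUS is available, or 1 when only
 * the older KVM_IRQ_LINE ioctl (no status reporting) is supported. */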
int kvm_set_irq(KVMState *s, int irq, int level)
{
    struct kvm_irq_level event;
    int ret;

    assert(kvm_async_interrupts_enabled());

    event.level = level;
    event.irq = irq;
    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
    if (ret < 0) {
        perror("kvm_set_irq");
        abort();
    }

    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
}

#ifdef KVM_CAP_IRQ_ROUTING
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;
    QTAILQ_ENTRY(KVMMSIRoute) entry;
} KVMMSIRoute;

static void set_gsi(KVMState *s, unsigned int gsi)
{
    set_bit(gsi, s->used_gsi_bitmap);
}

static void clear_gsi(KVMState *s, unsigned int gsi)
{
    clear_bit(gsi, s->used_gsi_bitmap);
}

void kvm_init_irq_routing(KVMState *s)
{
    int gsi_count, i;

    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
    if (gsi_count > 0) {
        /* Round up so we can search ints using ffs */
        s->used_gsi_bitmap = bitmap_new(gsi_count);
        s->gsi_count = gsi_count;
    }

    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
    s->nr_allocated_irq_routes = 0;

    if (!kvm_direct_msi_allowed) {
        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
            QTAILQ_INIT(&s->msi_hashtab[i]);
        }
    }

    kvm_arch_init_irq_routing(s);
}

void kvm_irqchip_commit_routes(KVMState *s)
{
    int ret;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    if (!kvm_gsi_routing_enabled()) {
        return;
    }

    s->irq_routes->flags = 0;
    trace_kvm_irqchip_commit_routes();
    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
    assert(ret == 0);
}

static void kvm_add_routing_entry(KVMState *s,
                                  struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        n = s->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        s->irq_routes = g_realloc(s->irq_routes, size);
        s->nr_allocated_irq_routes = n;
    }
    n = s->irq_routes->nr++;
    new = &s->irq_routes->entries[n];

    *new = *entry;

    set_gsi(s, entry->gsi);
}

static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    struct kvm_irq_routing_entry *entry;
    int n;

    for (n = 0; n < s->irq_routes->nr; n++) {
        entry = &s->irq_routes->entries[n];
        if (entry->gsi != new_entry->gsi) {
            continue;
        }

        if (!memcmp(entry, new_entry, sizeof *entry)) {
            return 0;
        }

        *entry = *new_entry;

        return 0;
    }

    return -ESRCH;
}

void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
{
    struct kvm_irq_routing_entry e = {};

    assert(pin < s->gsi_count);

    e.gsi = irq;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    kvm_add_routing_entry(s, &e);
}

void kvm_irqchip_release_virq(KVMState *s, int virq)
{
    struct kvm_irq_routing_entry *e;
    int i;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    for (i = 0; i < s->irq_routes->nr; i++) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
        }
    }
    clear_gsi(s, virq);
    kvm_arch_release_virq_post(virq);
    trace_kvm_irqchip_release_virq(virq);
}

static unsigned int kvm_hash_msi(uint32_t data)
{
    /* This is optimized for IA32 MSI layout. However, no other arch shall
     * repeat the mistake of not providing a direct MSI injection API.
     */
    return data & 0xff;
}

static void kvm_flush_dynamic_msi_routes(KVMState *s)
{
    KVMMSIRoute *route, *next;
    unsigned int hash;

    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
            kvm_irqchip_release_virq(s, route->kroute.gsi);
            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
            g_free(route);
        }
    }
}

static int kvm_irqchip_get_virq(KVMState *s)
{
    int next_virq;

    /*
     * PIC and IOAPIC share the first 16 GSI numbers, thus the available
     * GSI numbers are more than the number of IRQ routes. Allocating a GSI
     * number can succeed even though a new route entry cannot be added.
     * When this happens, flush dynamic MSI entries to free IRQ route entries.
     */
    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
        kvm_flush_dynamic_msi_routes(s);
    }

    /* Return the lowest unused GSI in the bitmap */
    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
    if (next_virq >= s->gsi_count) {
        return -ENOSPC;
    } else {
        return next_virq;
    }
}

static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
{
    unsigned int hash = kvm_hash_msi(msg.data);
    KVMMSIRoute *route;

    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
            return route;
        }
    }
    return NULL;
}
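
/* Inject an MSI.  With KVM_CAP_SIGNAL_MSI the message is delivered directly
 * via KVM_SIGNAL_MSI; otherwise a dynamic MSI route is allocated (and cached
 * in msi_hashtab) and the interrupt is raised through kvm_set_irq(). */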
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    struct kvm_msi msi;
    KVMMSIRoute *route;

    if (kvm_direct_msi_allowed) {
        msi.address_lo = (uint32_t)msg.address;
        msi.address_hi = msg.address >> 32;
        msi.data = le32_to_cpu(msg.data);
        msi.flags = 0;
        memset(msi.pad, 0, sizeof(msi.pad));

        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
    }

    route = kvm_lookup_msi_route(s, msg);
    if (!route) {
        int virq;

        virq = kvm_irqchip_get_virq(s);
        if (virq < 0) {
            return virq;
        }

        route = g_malloc0(sizeof(KVMMSIRoute));
        route->kroute.gsi = virq;
        route->kroute.type = KVM_IRQ_ROUTING_MSI;
        route->kroute.flags = 0;
        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
        route->kroute.u.msi.address_hi = msg.address >> 32;
        route->kroute.u.msi.data = le32_to_cpu(msg.data);

        kvm_add_routing_entry(s, &route->kroute);
        kvm_irqchip_commit_routes(s);

        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
                           entry);
    }

    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);

    return kvm_set_irq(s, route->kroute.gsi, 1);
}

int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;
    MSIMessage msg = {0, 0};

    if (pci_available && dev) {
        msg = pci_get_msi_message(dev, vector);
    }

    if (kvm_gsi_direct_mapping()) {
        return kvm_arch_msi_data_to_gsi(msg.data);
    }

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
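    /* The 64-bit MSI address is split into its low and high 32-bit halves
     * and the payload is stored little-endian; hosts that need it (see
     * kvm_msi_devid_required()) also get the PCI requester ID as devid. */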
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        kvm_irqchip_release_virq(s, virq);
        return -EINVAL;
    }

    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
                                    vector, virq);

    kvm_add_routing_entry(s, &kroute);
    kvm_arch_add_msi_route_post(&kroute, vector, dev);
    kvm_irqchip_commit_routes(s);

    return virq;
}

int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};

    if (kvm_gsi_direct_mapping()) {
        return 0;
    }

    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        return -EINVAL;
    }

    trace_kvm_irqchip_update_msi_route(virq);

    return kvm_update_routing_entry(s, &kroute);
}

static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
                                    bool assign)
{
    struct kvm_irqfd irqfd = {
        .fd = fd,
        .gsi = virq,
        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (rfd != -1) {
        irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = rfd;
    }

    if (!kvm_irqfds_enabled()) {
        return -ENOSYS;
    }

    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
}

int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
    kroute.flags = 0;
    kroute.u.adapter.summary_addr = adapter->summary_addr;
    kroute.u.adapter.ind_addr = adapter->ind_addr;
    kroute.u.adapter.summary_offset = adapter->summary_offset;
    kroute.u.adapter.ind_offset = adapter->ind_offset;
    kroute.u.adapter.adapter_id = adapter->adapter_id;

    kvm_add_routing_entry(s, &kroute);

    return virq;
}

int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }
    if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
        return -ENOSYS;
    }
    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_HV_SINT;
    kroute.flags = 0;
    kroute.u.hv_sint.vcpu = vcpu;
    kroute.u.hv_sint.sint = sint;

    kvm_add_routing_entry(s, &kroute);
    kvm_irqchip_commit_routes(s);

    return virq;
}

#else /* !KVM_CAP_IRQ_ROUTING */
void kvm_init_irq_routing(KVMState *s)
{
}

void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}

int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    abort();
}

int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
    return -ENOSYS;
}

int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    return -ENOSYS;
}

int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    return -ENOSYS;
}

static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
{
    abort();
}

int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
    return -ENOSYS;
}
#endif /* !KVM_CAP_IRQ_ROUTING */

int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                       EventNotifier *rn, int virq)
{
    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
                                    rn ? event_notifier_get_fd(rn) : -1, virq, true);
}

int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                          int virq)
{
    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
                                    false);
}

int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
                                   EventNotifier *rn, qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
}

int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
                                      qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
}

void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
{
    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}

static void kvm_irqchip_create(MachineState *machine, KVMState *s)
{
    int ret;

    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        ;
    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
        if (ret < 0) {
            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
            exit(1);
        }
    } else {
        return;
    }

    /* First probe and see if there's an arch-specific hook to create the
     * in-kernel irqchip for us */
    ret = kvm_arch_irqchip_create(machine, s);
    if (ret == 0) {
        if (machine_kernel_irqchip_split(machine)) {
            perror("Split IRQ chip mode not supported.");
            exit(1);
        } else {
            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
        }
    }
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
        exit(1);
    }

    kvm_kernel_irqchip = true;
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;
    kvm_halt_in_kernel_allowed = true;

    kvm_init_irq_routing(s);

    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
}

/* Find number of supported CPUs using the recommended
 * procedure from the kernel API documentation to cope with
 * older kernels that may be missing capabilities.
 */
static int kvm_recommended_vcpus(KVMState *s)
{
    int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
    return (ret) ? ret : 4;
}

static int kvm_max_vcpus(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
    return (ret) ? ret : kvm_recommended_vcpus(s);
}

static int kvm_max_vcpu_id(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
    return (ret) ? ret : kvm_max_vcpus(s);
}

bool kvm_vcpu_id_is_valid(int vcpu_id)
{
    KVMState *s = KVM_STATE(current_machine->accelerator);
    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
}
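
/* Accelerator initialization: open /dev/kvm, check the API version, create
 * the VM, probe capabilities and vcpu limits, then set up the in-kernel
 * irqchip and register the memory listeners. */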
static int kvm_init(MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    struct {
        const char *name;
        int num;
    } num_cpus[] = {
        { "SMP",          smp_cpus },
        { "hotpluggable", max_cpus },
        { NULL, }
    }, *nc = num_cpus;
    int soft_vcpus_limit, hard_vcpus_limit;
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int type = 0;
    const char *kvm_type;

    s = KVM_STATE(ms->accelerator);

    /*
     * On systems where the kernel can support different base page
     * sizes, host page size may be different from TARGET_PAGE_SIZE,
     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
     * page size for the system though.
     */
    assert(TARGET_PAGE_SIZE <= getpagesize());

    s->sigmask_len = 8;

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    QLIST_INIT(&s->kvm_parked_vcpus);
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret >= 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);

    /* If unspecified, use the default value */
    if (!s->nr_slots) {
        s->nr_slots = 32;
    }

    kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
    if (mc->kvm_type) {
        type = mc->kvm_type(ms, kvm_type);
    } else if (kvm_type) {
        ret = -EINVAL;
        fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
        goto err;
    }

    do {
        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
    } while (ret == -EINTR);

    if (ret < 0) {
        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
                strerror(-ret));

#ifdef TARGET_S390X
        if (ret == -EINVAL) {
            fprintf(stderr,
                    "Host kernel setup problem detected. Please verify:\n");
            fprintf(stderr, "- for kernels supporting the switch_amode or"
                    " user_mode parameters, whether\n");
            fprintf(stderr,
                    "  user space is running in primary address space\n");
            fprintf(stderr,
                    "- for kernels supporting the vm.allocate_pgste sysctl, "
                    "whether it is enabled\n");
        }
#endif
        goto err;
    }

    s->vmfd = ret;

    /* check the vcpu limits */
    soft_vcpus_limit = kvm_recommended_vcpus(s);
    hard_vcpus_limit = kvm_max_vcpus(s);

    while (nc->name) {
        if (nc->num > soft_vcpus_limit) {
            warn_report("Number of %s cpus requested (%d) exceeds "
                        "the recommended cpus supported by KVM (%d)",
                        nc->name, nc->num, soft_vcpus_limit);

            if (nc->num > hard_vcpus_limit) {
                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
                        "the maximum cpus supported by KVM (%d)\n",
                        nc->name, nc->num, hard_vcpus_limit);
                exit(1);
            }
        }
        nc++;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
    s->coalesced_pio = s->coalesced_mmio &&
                       kvm_check_extension(s, KVM_CAP_COALESCED_PIO);

#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);

#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

#ifdef KVM_CAP_IRQ_ROUTING
    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
#endif

    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);

    s->irq_set_ioctl = KVM_IRQ_LINE;
    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
    }

    kvm_readonly_mem_allowed =
        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);

    kvm_eventfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);

    kvm_irqfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);

    kvm_resamplefds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);

    kvm_vm_attributes_allowed =
        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);

    kvm_ioeventfd_any_length_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);

    kvm_state = s;

    /*
     * if memory encryption object is specified then initialize the memory
     * encryption context.
     */
    if (ms->memory_encryption) {
        kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption);
        if (!kvm_state->memcrypt_handle) {
            ret = -1;
            goto err;
        }

        kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
    }

    ret = kvm_arch_init(ms, s);
    if (ret < 0) {
        goto err;
    }

    if (machine_kernel_irqchip_allowed(ms)) {
        kvm_irqchip_create(ms, s);
    }

    if (kvm_eventfds_allowed) {
        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
    }
    s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
    s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;

    kvm_memory_listener_register(s, &s->memory_listener,
                                 &address_space_memory, 0);
    memory_listener_register(&kvm_io_listener,
                             &address_space_io);
    memory_listener_register(&kvm_coalesced_pio_listener,
                             &address_space_io);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
    if (!s->sync_mmu) {
        qemu_balloon_inhibit(true);
    }

    return 0;

err:
    assert(ret < 0);
    if (s->vmfd >= 0) {
        close(s->vmfd);
    }
    if (s->fd != -1) {
        close(s->fd);
    }
    g_free(s->memory_listener.slots);

    return ret;
}

void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
{
    s->sigmask_len = sigmask_len;
}

static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        address_space_rw(&address_space_io, port, attrs,
                         ptr, size,
                         direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}

static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error. Suberror: %d\n",
            run->internal.suberror);

    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(cpu)) {
            cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}
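
/* Drain the ring of MMIO/PIO writes that the kernel has coalesced (see
 * KVM_REGISTER_COALESCED_MMIO) and replay them against the QEMU address
 * spaces.  The ring is shared with the kernel via the vcpu mmap area. */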
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;

    if (s->coalesced_flush_in_progress) {
        return;
    }

    s->coalesced_flush_in_progress = true;

    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            if (ent->pio == 1) {
                address_space_rw(&address_space_io, ent->phys_addr,
                                 MEMTXATTRS_UNSPECIFIED, ent->data,
                                 ent->len, true);
            } else {
                cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            }
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }

    s->coalesced_flush_in_progress = false;
}

static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (!cpu->vcpu_dirty) {
        kvm_arch_get_registers(cpu);
        cpu->vcpu_dirty = true;
    }
}

void kvm_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->vcpu_dirty) {
        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}

static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
    cpu->vcpu_dirty = false;
}

void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}

static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
{
    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
    cpu->vcpu_dirty = false;
}

void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}

static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}

void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}

#ifdef KVM_HAVE_MCE_INJECTION
static __thread void *pending_sigbus_addr;
static __thread int pending_sigbus_code;
static __thread bool have_sigbus_pending;
#endif

static void kvm_cpu_kick(CPUState *cpu)
{
    atomic_set(&cpu->kvm_run->immediate_exit, 1);
}

static void kvm_cpu_kick_self(void)
{
    if (kvm_immediate_exit) {
        kvm_cpu_kick(current_cpu);
    } else {
        qemu_cpu_kick_self();
    }
}

static void kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    if (kvm_immediate_exit) {
        atomic_set(&cpu->kvm_run->immediate_exit, 0);
        /* Write kvm_run->immediate_exit before the cpu->exit_request
         * write in kvm_cpu_exec.
         */
        smp_wmb();
        return;
    }

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI));
}
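
/* Per-vcpu execution loop: flush dirty registers into the kernel, run the
 * guest via KVM_RUN outside the BQL, and dispatch exit reasons until one of
 * them requests leaving the loop (EXCP_*) or fails (negative return). */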
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        atomic_set(&cpu->exit_request, 0);
        return EXCP_HLT;
    }

    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);

    do {
        MemTxAttrs attrs;

        if (cpu->vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (atomic_read(&cpu->exit_request)) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            kvm_cpu_kick_self();
        }

        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
         * Matching barrier in kvm_eat_signals.
         */
        smp_rmb();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);

#ifdef KVM_HAVE_MCE_INJECTION
        if (unlikely(have_sigbus_pending)) {
            qemu_mutex_lock_iothread();
            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
                                    pending_sigbus_addr);
            have_sigbus_pending = false;
            qemu_mutex_unlock_iothread();
        }
#endif

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                kvm_eat_signals(cpu);
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
#ifdef TARGET_PPC
            if (run_ret == -EBUSY) {
                fprintf(stderr,
                        "This is probably because your SMT is enabled.\n"
                        "VCPU can only run on primary threads with all "
                        "secondary threads offline.\n");
            }
#endif
            ret = -1;
            break;
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        case KVM_EXIT_SYSTEM_EVENT:
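            /* System-level events (shutdown, reset, guest crash) reported by
             * the kernel; anything unrecognized is left to the
             * architecture-specific handler. */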
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                qemu_mutex_lock_iothread();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                qemu_mutex_unlock_iothread();
                ret = 0;
                break;
            default:
                DPRINTF("kvm_arch_handle_exit\n");
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    atomic_set(&cpu->exit_request, 0);
    return ret;
}

int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_ioctl(type, arg);
    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vm_ioctl(type, arg);
    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
    ret = ioctl(cpu->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_device_ioctl(int fd, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_device_ioctl(fd, type, arg);
    ret = ioctl(fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    int ret;
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return ret ? 0 : 1;
}

int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
        .flags = 0,
    };

    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
}
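
/* Read or write one device attribute via KVM_GET/SET_DEVICE_ATTR on the
 * device fd; a failure is reported through errp as well as the return
 * value. */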
bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    int err;
};

static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *dbg_data =
        (struct kvm_set_guest_debug_data *) data.host_ptr;

    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
               RUN_ON_CPU_HOST_PTR(&data));
    return data.err;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
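/*
 * Removal mirrors insertion: software breakpoints are reference-counted per
 * guest address, so the arch hook and the list entry only go away when the
 * last user (typically the gdbstub) drops its reference.  After any change,
 * KVM_SET_GUEST_DEBUG is re-issued on every vCPU via kvm_update_guest_debug().
 */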
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}
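/*
 * Note on the two kick mechanisms configured above (descriptive only): with
 * KVM_CAP_IMMEDIATE_EXIT the SIG_IPI handler runs in the vCPU thread and
 * kvm_cpu_kick() asks KVM_RUN to return immediately; without it, SIG_IPI
 * stays blocked in the thread's normal mask and is only unblocked inside
 * KVM_RUN via KVM_SET_SIGNAL_MASK, so a pending signal forces an -EINTR exit
 * instead.
 */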
/* Called asynchronously in VCPU thread.  */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
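/*
 * The type registered above makes KVM selectable as an accelerator; for
 * reference (not part of this file's logic), it is typically requested on
 * the command line with "-accel kvm" or the legacy "-enable-kvm" option,
 * at which point kvm_init() runs as the accelerator's init_machine hook.
 */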