/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include <linux/kvm.h>

#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/hw.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
#include "sysemu/cpus.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "exec/address-spaces.h"
#include "qemu/event_notifier.h"
#include "trace.h"
#include "hw/irq.h"
#include "sysemu/sev.h"
#include "sysemu/balloon.h"

#include "hw/boards.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#define PAGE_SIZE getpagesize()

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define KVM_MSI_HASHTAB_SIZE 256

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

struct KVMState
{
    AccelState parent_obj;

    int nr_slots;
    int fd;
    int vmfd;
    int coalesced_mmio;
    int coalesced_pio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
    int max_nested_state_len;
    int many_ioeventfds;
    int intx_set_mask;
    bool sync_mmu;
    /* The man page (and posix) say ioctl numbers are signed int, but
     * they're not.
Linux, glibc and *BSD all treat ioctl numbers as 96 * unsigned, and treating them as signed here can break things */ 97 unsigned irq_set_ioctl; 98 unsigned int sigmask_len; 99 GHashTable *gsimap; 100 #ifdef KVM_CAP_IRQ_ROUTING 101 struct kvm_irq_routing *irq_routes; 102 int nr_allocated_irq_routes; 103 unsigned long *used_gsi_bitmap; 104 unsigned int gsi_count; 105 QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; 106 #endif 107 KVMMemoryListener memory_listener; 108 QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; 109 110 /* memory encryption */ 111 void *memcrypt_handle; 112 int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); 113 }; 114 115 KVMState *kvm_state; 116 bool kvm_kernel_irqchip; 117 bool kvm_split_irqchip; 118 bool kvm_async_interrupts_allowed; 119 bool kvm_halt_in_kernel_allowed; 120 bool kvm_eventfds_allowed; 121 bool kvm_irqfds_allowed; 122 bool kvm_resamplefds_allowed; 123 bool kvm_msi_via_irqfd_allowed; 124 bool kvm_gsi_routing_allowed; 125 bool kvm_gsi_direct_mapping; 126 bool kvm_allowed; 127 bool kvm_readonly_mem_allowed; 128 bool kvm_vm_attributes_allowed; 129 bool kvm_direct_msi_allowed; 130 bool kvm_ioeventfd_any_length_allowed; 131 bool kvm_msi_use_devid; 132 static bool kvm_immediate_exit; 133 134 static const KVMCapabilityInfo kvm_required_capabilites[] = { 135 KVM_CAP_INFO(USER_MEMORY), 136 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), 137 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), 138 KVM_CAP_LAST_INFO 139 }; 140 141 int kvm_get_max_memslots(void) 142 { 143 KVMState *s = KVM_STATE(current_machine->accelerator); 144 145 return s->nr_slots; 146 } 147 148 bool kvm_memcrypt_enabled(void) 149 { 150 if (kvm_state && kvm_state->memcrypt_handle) { 151 return true; 152 } 153 154 return false; 155 } 156 157 int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) 158 { 159 if (kvm_state->memcrypt_handle && 160 kvm_state->memcrypt_encrypt_data) { 161 return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, 162 ptr, len); 163 } 164 165 return 1; 166 } 167 168 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 169 { 170 KVMState *s = kvm_state; 171 int i; 172 173 for (i = 0; i < s->nr_slots; i++) { 174 if (kml->slots[i].memory_size == 0) { 175 return &kml->slots[i]; 176 } 177 } 178 179 return NULL; 180 } 181 182 bool kvm_has_free_slot(MachineState *ms) 183 { 184 KVMState *s = KVM_STATE(ms->accelerator); 185 186 return kvm_get_free_slot(&s->memory_listener); 187 } 188 189 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 190 { 191 KVMSlot *slot = kvm_get_free_slot(kml); 192 193 if (slot) { 194 return slot; 195 } 196 197 fprintf(stderr, "%s: no free slot available\n", __func__); 198 abort(); 199 } 200 201 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 202 hwaddr start_addr, 203 hwaddr size) 204 { 205 KVMState *s = kvm_state; 206 int i; 207 208 for (i = 0; i < s->nr_slots; i++) { 209 KVMSlot *mem = &kml->slots[i]; 210 211 if (start_addr == mem->start_addr && size == mem->memory_size) { 212 return mem; 213 } 214 } 215 216 return NULL; 217 } 218 219 /* 220 * Calculate and align the start address and the size of the section. 221 * Return the size. If the size is 0, the aligned section is empty. 222 */ 223 static hwaddr kvm_align_section(MemoryRegionSection *section, 224 hwaddr *start) 225 { 226 hwaddr size = int128_get64(section->size); 227 hwaddr delta, aligned; 228 229 /* kvm works in page size chunks, but the function may be called 230 with sub-page size and unaligned start address. 
Pad the start 231 address to next and truncate size to previous page boundary. */ 232 aligned = ROUND_UP(section->offset_within_address_space, 233 qemu_real_host_page_size); 234 delta = aligned - section->offset_within_address_space; 235 *start = aligned; 236 if (delta > size) { 237 return 0; 238 } 239 240 return (size - delta) & qemu_real_host_page_mask; 241 } 242 243 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 244 hwaddr *phys_addr) 245 { 246 KVMMemoryListener *kml = &s->memory_listener; 247 int i; 248 249 for (i = 0; i < s->nr_slots; i++) { 250 KVMSlot *mem = &kml->slots[i]; 251 252 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 253 *phys_addr = mem->start_addr + (ram - mem->ram); 254 return 1; 255 } 256 } 257 258 return 0; 259 } 260 261 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) 262 { 263 KVMState *s = kvm_state; 264 struct kvm_userspace_memory_region mem; 265 int ret; 266 267 mem.slot = slot->slot | (kml->as_id << 16); 268 mem.guest_phys_addr = slot->start_addr; 269 mem.userspace_addr = (unsigned long)slot->ram; 270 mem.flags = slot->flags; 271 272 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { 273 /* Set the slot size to 0 before setting the slot to the desired 274 * value. This is needed based on KVM commit 75d61fbc. */ 275 mem.memory_size = 0; 276 kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 277 } 278 mem.memory_size = slot->memory_size; 279 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 280 slot->old_flags = mem.flags; 281 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 282 mem.memory_size, mem.userspace_addr, ret); 283 return ret; 284 } 285 286 int kvm_destroy_vcpu(CPUState *cpu) 287 { 288 KVMState *s = kvm_state; 289 long mmap_size; 290 struct KVMParkedVcpu *vcpu = NULL; 291 int ret = 0; 292 293 DPRINTF("kvm_destroy_vcpu\n"); 294 295 ret = kvm_arch_destroy_vcpu(cpu); 296 if (ret < 0) { 297 goto err; 298 } 299 300 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 301 if (mmap_size < 0) { 302 ret = mmap_size; 303 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 304 goto err; 305 } 306 307 ret = munmap(cpu->kvm_run, mmap_size); 308 if (ret < 0) { 309 goto err; 310 } 311 312 vcpu = g_malloc0(sizeof(*vcpu)); 313 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 314 vcpu->kvm_fd = cpu->kvm_fd; 315 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 316 err: 317 return ret; 318 } 319 320 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 321 { 322 struct KVMParkedVcpu *cpu; 323 324 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 325 if (cpu->vcpu_id == vcpu_id) { 326 int kvm_fd; 327 328 QLIST_REMOVE(cpu, node); 329 kvm_fd = cpu->kvm_fd; 330 g_free(cpu); 331 return kvm_fd; 332 } 333 } 334 335 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 336 } 337 338 int kvm_init_vcpu(CPUState *cpu) 339 { 340 KVMState *s = kvm_state; 341 long mmap_size; 342 int ret; 343 344 DPRINTF("kvm_init_vcpu\n"); 345 346 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 347 if (ret < 0) { 348 DPRINTF("kvm_create_vcpu failed\n"); 349 goto err; 350 } 351 352 cpu->kvm_fd = ret; 353 cpu->kvm_state = s; 354 cpu->vcpu_dirty = true; 355 356 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 357 if (mmap_size < 0) { 358 ret = mmap_size; 359 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 360 goto err; 361 } 362 363 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 364 cpu->kvm_fd, 0); 365 if (cpu->kvm_run == MAP_FAILED) { 366 ret = -errno; 
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(cpu);
err:
    return ret;
}

/*
 * dirty pages logging control
 */

static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    return flags;
}

static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(kml, mem, false);
}

static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size;
    KVMSlot *mem;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    mem = kvm_lookup_matching_slot(kml, start_addr, size);
    if (!mem) {
        /* We don't have a slot if we want to trap every access. */
        return 0;
    }

    return kvm_slot_update_flags(kml, mem, section->mr);
}

static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
                                         unsigned long *bitmap)
{
    ram_addr_t start = section->offset_within_region +
                       memory_region_get_ram_addr(section->mr);
    ram_addr_t pages = int128_get64(section->size) / getpagesize();

    cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
    return 0;
}

#define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 *
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty_lebitmap(): every page that KVM reports
 * as dirty is marked dirty in QEMU's bitmap as well.
 *
 * @kml: the KVM memory listener for the address space being synced
 * @section: the memory section whose dirty log is fetched
 */
static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                          MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    struct kvm_dirty_log d = {};
    KVMSlot *mem;
    hwaddr start_addr, size;

    size = kvm_align_section(section, &start_addr);
    if (size) {
        mem = kvm_lookup_matching_slot(kml, start_addr, size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            return 0;
        }

        /* XXX bad kernel interface alert
         * For dirty bitmap, kernel allocates array of size aligned to
         * bits-per-long. But for case when the kernel is 64bits and
         * the userspace is 32bits, userspace can't align to the same
         * bits-per-long, since sizeof(long) is different between kernel
         * and user space. This way, userspace will provide buffer which
         * may be 4 bytes less than the kernel will use, resulting in
         * userspace memory corruption (which is not detectable by valgrind
         * too, in most cases).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
         * a hope that sizeof(long) won't become >8 any time soon.
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
        d.dirty_bitmap = g_malloc0(size);

        d.slot = mem->slot | (kml->as_id << 16);
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            g_free(d.dirty_bitmap);
            return -1;
        }

        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
        g_free(d.dirty_bitmap);
    }

    return 0;
}

static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *section,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *section,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_coalesce_pio_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_pio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pio = 1;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

static MemoryListener kvm_coalesced_pio_listener = {
    .coalesced_io_add = kvm_coalesce_pio_add,
    .coalesced_io_del = kvm_coalesce_pio_del,
};

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}

static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
/* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN 637 * endianness, but the memory core hands them in target endianness. 638 * For example, PPC is always treated as big-endian even if running 639 * on KVM and on PPC64LE. Correct here. 640 */ 641 switch (size) { 642 case 2: 643 val = bswap16(val); 644 break; 645 case 4: 646 val = bswap32(val); 647 break; 648 } 649 #endif 650 return val; 651 } 652 653 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 654 bool assign, uint32_t size, bool datamatch) 655 { 656 int ret; 657 struct kvm_ioeventfd iofd = { 658 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 659 .addr = addr, 660 .len = size, 661 .flags = 0, 662 .fd = fd, 663 }; 664 665 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size, 666 datamatch); 667 if (!kvm_enabled()) { 668 return -ENOSYS; 669 } 670 671 if (datamatch) { 672 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 673 } 674 if (!assign) { 675 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 676 } 677 678 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 679 680 if (ret < 0) { 681 return -errno; 682 } 683 684 return 0; 685 } 686 687 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 688 bool assign, uint32_t size, bool datamatch) 689 { 690 struct kvm_ioeventfd kick = { 691 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 692 .addr = addr, 693 .flags = KVM_IOEVENTFD_FLAG_PIO, 694 .len = size, 695 .fd = fd, 696 }; 697 int r; 698 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch); 699 if (!kvm_enabled()) { 700 return -ENOSYS; 701 } 702 if (datamatch) { 703 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 704 } 705 if (!assign) { 706 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 707 } 708 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 709 if (r < 0) { 710 return r; 711 } 712 return 0; 713 } 714 715 716 static int kvm_check_many_ioeventfds(void) 717 { 718 /* Userspace can use ioeventfd for io notification. This requires a host 719 * that supports eventfd(2) and an I/O thread; since eventfd does not 720 * support SIGIO it cannot interrupt the vcpu. 721 * 722 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 723 * can avoid creating too many ioeventfds. 
724 */ 725 #if defined(CONFIG_EVENTFD) 726 int ioeventfds[7]; 727 int i, ret = 0; 728 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 729 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 730 if (ioeventfds[i] < 0) { 731 break; 732 } 733 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 734 if (ret < 0) { 735 close(ioeventfds[i]); 736 break; 737 } 738 } 739 740 /* Decide whether many devices are supported or not */ 741 ret = i == ARRAY_SIZE(ioeventfds); 742 743 while (i-- > 0) { 744 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 745 close(ioeventfds[i]); 746 } 747 return ret; 748 #else 749 return 0; 750 #endif 751 } 752 753 static const KVMCapabilityInfo * 754 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 755 { 756 while (list->name) { 757 if (!kvm_check_extension(s, list->value)) { 758 return list; 759 } 760 list++; 761 } 762 return NULL; 763 } 764 765 static void kvm_set_phys_mem(KVMMemoryListener *kml, 766 MemoryRegionSection *section, bool add) 767 { 768 KVMSlot *mem; 769 int err; 770 MemoryRegion *mr = section->mr; 771 bool writeable = !mr->readonly && !mr->rom_device; 772 hwaddr start_addr, size; 773 void *ram; 774 775 if (!memory_region_is_ram(mr)) { 776 if (writeable || !kvm_readonly_mem_allowed) { 777 return; 778 } else if (!mr->romd_mode) { 779 /* If the memory device is not in romd_mode, then we actually want 780 * to remove the kvm memory slot so all accesses will trap. */ 781 add = false; 782 } 783 } 784 785 size = kvm_align_section(section, &start_addr); 786 if (!size) { 787 return; 788 } 789 790 /* use aligned delta to align the ram address */ 791 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 792 (start_addr - section->offset_within_address_space); 793 794 if (!add) { 795 mem = kvm_lookup_matching_slot(kml, start_addr, size); 796 if (!mem) { 797 return; 798 } 799 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 800 kvm_physical_sync_dirty_bitmap(kml, section); 801 } 802 803 /* unregister the slot */ 804 mem->memory_size = 0; 805 mem->flags = 0; 806 err = kvm_set_user_memory_region(kml, mem, false); 807 if (err) { 808 fprintf(stderr, "%s: error unregistering slot: %s\n", 809 __func__, strerror(-err)); 810 abort(); 811 } 812 return; 813 } 814 815 /* register the new slot */ 816 mem = kvm_alloc_slot(kml); 817 mem->memory_size = size; 818 mem->start_addr = start_addr; 819 mem->ram = ram; 820 mem->flags = kvm_mem_flags(mr); 821 822 err = kvm_set_user_memory_region(kml, mem, true); 823 if (err) { 824 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 825 strerror(-err)); 826 abort(); 827 } 828 } 829 830 static void kvm_region_add(MemoryListener *listener, 831 MemoryRegionSection *section) 832 { 833 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 834 835 memory_region_ref(section->mr); 836 kvm_set_phys_mem(kml, section, true); 837 } 838 839 static void kvm_region_del(MemoryListener *listener, 840 MemoryRegionSection *section) 841 { 842 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 843 844 kvm_set_phys_mem(kml, section, false); 845 memory_region_unref(section->mr); 846 } 847 848 static void kvm_log_sync(MemoryListener *listener, 849 MemoryRegionSection *section) 850 { 851 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 852 int r; 853 854 r = kvm_physical_sync_dirty_bitmap(kml, section); 855 if (r < 0) { 856 abort(); 857 } 858 } 859 860 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 861 MemoryRegionSection 
*section, 862 bool match_data, uint64_t data, 863 EventNotifier *e) 864 { 865 int fd = event_notifier_get_fd(e); 866 int r; 867 868 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 869 data, true, int128_get64(section->size), 870 match_data); 871 if (r < 0) { 872 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 873 __func__, strerror(-r), -r); 874 abort(); 875 } 876 } 877 878 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 879 MemoryRegionSection *section, 880 bool match_data, uint64_t data, 881 EventNotifier *e) 882 { 883 int fd = event_notifier_get_fd(e); 884 int r; 885 886 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 887 data, false, int128_get64(section->size), 888 match_data); 889 if (r < 0) { 890 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 891 __func__, strerror(-r), -r); 892 abort(); 893 } 894 } 895 896 static void kvm_io_ioeventfd_add(MemoryListener *listener, 897 MemoryRegionSection *section, 898 bool match_data, uint64_t data, 899 EventNotifier *e) 900 { 901 int fd = event_notifier_get_fd(e); 902 int r; 903 904 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 905 data, true, int128_get64(section->size), 906 match_data); 907 if (r < 0) { 908 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 909 __func__, strerror(-r), -r); 910 abort(); 911 } 912 } 913 914 static void kvm_io_ioeventfd_del(MemoryListener *listener, 915 MemoryRegionSection *section, 916 bool match_data, uint64_t data, 917 EventNotifier *e) 918 919 { 920 int fd = event_notifier_get_fd(e); 921 int r; 922 923 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 924 data, false, int128_get64(section->size), 925 match_data); 926 if (r < 0) { 927 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 928 __func__, strerror(-r), -r); 929 abort(); 930 } 931 } 932 933 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 934 AddressSpace *as, int as_id) 935 { 936 int i; 937 938 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 939 kml->as_id = as_id; 940 941 for (i = 0; i < s->nr_slots; i++) { 942 kml->slots[i].slot = i; 943 } 944 945 kml->listener.region_add = kvm_region_add; 946 kml->listener.region_del = kvm_region_del; 947 kml->listener.log_start = kvm_log_start; 948 kml->listener.log_stop = kvm_log_stop; 949 kml->listener.log_sync = kvm_log_sync; 950 kml->listener.priority = 10; 951 952 memory_listener_register(&kml->listener, as); 953 } 954 955 static MemoryListener kvm_io_listener = { 956 .eventfd_add = kvm_io_ioeventfd_add, 957 .eventfd_del = kvm_io_ioeventfd_del, 958 .priority = 10, 959 }; 960 961 int kvm_set_irq(KVMState *s, int irq, int level) 962 { 963 struct kvm_irq_level event; 964 int ret; 965 966 assert(kvm_async_interrupts_enabled()); 967 968 event.level = level; 969 event.irq = irq; 970 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 971 if (ret < 0) { 972 perror("kvm_set_irq"); 973 abort(); 974 } 975 976 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 
1 : event.status; 977 } 978 979 #ifdef KVM_CAP_IRQ_ROUTING 980 typedef struct KVMMSIRoute { 981 struct kvm_irq_routing_entry kroute; 982 QTAILQ_ENTRY(KVMMSIRoute) entry; 983 } KVMMSIRoute; 984 985 static void set_gsi(KVMState *s, unsigned int gsi) 986 { 987 set_bit(gsi, s->used_gsi_bitmap); 988 } 989 990 static void clear_gsi(KVMState *s, unsigned int gsi) 991 { 992 clear_bit(gsi, s->used_gsi_bitmap); 993 } 994 995 void kvm_init_irq_routing(KVMState *s) 996 { 997 int gsi_count, i; 998 999 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 1000 if (gsi_count > 0) { 1001 /* Round up so we can search ints using ffs */ 1002 s->used_gsi_bitmap = bitmap_new(gsi_count); 1003 s->gsi_count = gsi_count; 1004 } 1005 1006 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 1007 s->nr_allocated_irq_routes = 0; 1008 1009 if (!kvm_direct_msi_allowed) { 1010 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 1011 QTAILQ_INIT(&s->msi_hashtab[i]); 1012 } 1013 } 1014 1015 kvm_arch_init_irq_routing(s); 1016 } 1017 1018 void kvm_irqchip_commit_routes(KVMState *s) 1019 { 1020 int ret; 1021 1022 if (kvm_gsi_direct_mapping()) { 1023 return; 1024 } 1025 1026 if (!kvm_gsi_routing_enabled()) { 1027 return; 1028 } 1029 1030 s->irq_routes->flags = 0; 1031 trace_kvm_irqchip_commit_routes(); 1032 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 1033 assert(ret == 0); 1034 } 1035 1036 static void kvm_add_routing_entry(KVMState *s, 1037 struct kvm_irq_routing_entry *entry) 1038 { 1039 struct kvm_irq_routing_entry *new; 1040 int n, size; 1041 1042 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 1043 n = s->nr_allocated_irq_routes * 2; 1044 if (n < 64) { 1045 n = 64; 1046 } 1047 size = sizeof(struct kvm_irq_routing); 1048 size += n * sizeof(*new); 1049 s->irq_routes = g_realloc(s->irq_routes, size); 1050 s->nr_allocated_irq_routes = n; 1051 } 1052 n = s->irq_routes->nr++; 1053 new = &s->irq_routes->entries[n]; 1054 1055 *new = *entry; 1056 1057 set_gsi(s, entry->gsi); 1058 } 1059 1060 static int kvm_update_routing_entry(KVMState *s, 1061 struct kvm_irq_routing_entry *new_entry) 1062 { 1063 struct kvm_irq_routing_entry *entry; 1064 int n; 1065 1066 for (n = 0; n < s->irq_routes->nr; n++) { 1067 entry = &s->irq_routes->entries[n]; 1068 if (entry->gsi != new_entry->gsi) { 1069 continue; 1070 } 1071 1072 if(!memcmp(entry, new_entry, sizeof *entry)) { 1073 return 0; 1074 } 1075 1076 *entry = *new_entry; 1077 1078 return 0; 1079 } 1080 1081 return -ESRCH; 1082 } 1083 1084 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1085 { 1086 struct kvm_irq_routing_entry e = {}; 1087 1088 assert(pin < s->gsi_count); 1089 1090 e.gsi = irq; 1091 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1092 e.flags = 0; 1093 e.u.irqchip.irqchip = irqchip; 1094 e.u.irqchip.pin = pin; 1095 kvm_add_routing_entry(s, &e); 1096 } 1097 1098 void kvm_irqchip_release_virq(KVMState *s, int virq) 1099 { 1100 struct kvm_irq_routing_entry *e; 1101 int i; 1102 1103 if (kvm_gsi_direct_mapping()) { 1104 return; 1105 } 1106 1107 for (i = 0; i < s->irq_routes->nr; i++) { 1108 e = &s->irq_routes->entries[i]; 1109 if (e->gsi == virq) { 1110 s->irq_routes->nr--; 1111 *e = s->irq_routes->entries[s->irq_routes->nr]; 1112 } 1113 } 1114 clear_gsi(s, virq); 1115 kvm_arch_release_virq_post(virq); 1116 trace_kvm_irqchip_release_virq(virq); 1117 } 1118 1119 static unsigned int kvm_hash_msi(uint32_t data) 1120 { 1121 /* This is optimized for IA32 MSI layout. 
However, no other arch shall 1122 * repeat the mistake of not providing a direct MSI injection API. */ 1123 return data & 0xff; 1124 } 1125 1126 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1127 { 1128 KVMMSIRoute *route, *next; 1129 unsigned int hash; 1130 1131 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1132 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1133 kvm_irqchip_release_virq(s, route->kroute.gsi); 1134 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1135 g_free(route); 1136 } 1137 } 1138 } 1139 1140 static int kvm_irqchip_get_virq(KVMState *s) 1141 { 1142 int next_virq; 1143 1144 /* 1145 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1146 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1147 * number can succeed even though a new route entry cannot be added. 1148 * When this happens, flush dynamic MSI entries to free IRQ route entries. 1149 */ 1150 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1151 kvm_flush_dynamic_msi_routes(s); 1152 } 1153 1154 /* Return the lowest unused GSI in the bitmap */ 1155 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1156 if (next_virq >= s->gsi_count) { 1157 return -ENOSPC; 1158 } else { 1159 return next_virq; 1160 } 1161 } 1162 1163 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1164 { 1165 unsigned int hash = kvm_hash_msi(msg.data); 1166 KVMMSIRoute *route; 1167 1168 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1169 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1170 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1171 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1172 return route; 1173 } 1174 } 1175 return NULL; 1176 } 1177 1178 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1179 { 1180 struct kvm_msi msi; 1181 KVMMSIRoute *route; 1182 1183 if (kvm_direct_msi_allowed) { 1184 msi.address_lo = (uint32_t)msg.address; 1185 msi.address_hi = msg.address >> 32; 1186 msi.data = le32_to_cpu(msg.data); 1187 msi.flags = 0; 1188 memset(msi.pad, 0, sizeof(msi.pad)); 1189 1190 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1191 } 1192 1193 route = kvm_lookup_msi_route(s, msg); 1194 if (!route) { 1195 int virq; 1196 1197 virq = kvm_irqchip_get_virq(s); 1198 if (virq < 0) { 1199 return virq; 1200 } 1201 1202 route = g_malloc0(sizeof(KVMMSIRoute)); 1203 route->kroute.gsi = virq; 1204 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1205 route->kroute.flags = 0; 1206 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1207 route->kroute.u.msi.address_hi = msg.address >> 32; 1208 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1209 1210 kvm_add_routing_entry(s, &route->kroute); 1211 kvm_irqchip_commit_routes(s); 1212 1213 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1214 entry); 1215 } 1216 1217 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1218 1219 return kvm_set_irq(s, route->kroute.gsi, 1); 1220 } 1221 1222 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1223 { 1224 struct kvm_irq_routing_entry kroute = {}; 1225 int virq; 1226 MSIMessage msg = {0, 0}; 1227 1228 if (pci_available && dev) { 1229 msg = pci_get_msi_message(dev, vector); 1230 } 1231 1232 if (kvm_gsi_direct_mapping()) { 1233 return kvm_arch_msi_data_to_gsi(msg.data); 1234 } 1235 1236 if (!kvm_gsi_routing_enabled()) { 1237 return -ENOSYS; 1238 } 1239 1240 virq = kvm_irqchip_get_virq(s); 1241 if (virq < 0) { 1242 return virq; 1243 } 1244 1245 kroute.gsi = virq; 
1246 kroute.type = KVM_IRQ_ROUTING_MSI; 1247 kroute.flags = 0; 1248 kroute.u.msi.address_lo = (uint32_t)msg.address; 1249 kroute.u.msi.address_hi = msg.address >> 32; 1250 kroute.u.msi.data = le32_to_cpu(msg.data); 1251 if (pci_available && kvm_msi_devid_required()) { 1252 kroute.flags = KVM_MSI_VALID_DEVID; 1253 kroute.u.msi.devid = pci_requester_id(dev); 1254 } 1255 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1256 kvm_irqchip_release_virq(s, virq); 1257 return -EINVAL; 1258 } 1259 1260 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A", 1261 vector, virq); 1262 1263 kvm_add_routing_entry(s, &kroute); 1264 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1265 kvm_irqchip_commit_routes(s); 1266 1267 return virq; 1268 } 1269 1270 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1271 PCIDevice *dev) 1272 { 1273 struct kvm_irq_routing_entry kroute = {}; 1274 1275 if (kvm_gsi_direct_mapping()) { 1276 return 0; 1277 } 1278 1279 if (!kvm_irqchip_in_kernel()) { 1280 return -ENOSYS; 1281 } 1282 1283 kroute.gsi = virq; 1284 kroute.type = KVM_IRQ_ROUTING_MSI; 1285 kroute.flags = 0; 1286 kroute.u.msi.address_lo = (uint32_t)msg.address; 1287 kroute.u.msi.address_hi = msg.address >> 32; 1288 kroute.u.msi.data = le32_to_cpu(msg.data); 1289 if (pci_available && kvm_msi_devid_required()) { 1290 kroute.flags = KVM_MSI_VALID_DEVID; 1291 kroute.u.msi.devid = pci_requester_id(dev); 1292 } 1293 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1294 return -EINVAL; 1295 } 1296 1297 trace_kvm_irqchip_update_msi_route(virq); 1298 1299 return kvm_update_routing_entry(s, &kroute); 1300 } 1301 1302 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, 1303 bool assign) 1304 { 1305 struct kvm_irqfd irqfd = { 1306 .fd = fd, 1307 .gsi = virq, 1308 .flags = assign ? 
0 : KVM_IRQFD_FLAG_DEASSIGN, 1309 }; 1310 1311 if (rfd != -1) { 1312 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1313 irqfd.resamplefd = rfd; 1314 } 1315 1316 if (!kvm_irqfds_enabled()) { 1317 return -ENOSYS; 1318 } 1319 1320 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1321 } 1322 1323 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1324 { 1325 struct kvm_irq_routing_entry kroute = {}; 1326 int virq; 1327 1328 if (!kvm_gsi_routing_enabled()) { 1329 return -ENOSYS; 1330 } 1331 1332 virq = kvm_irqchip_get_virq(s); 1333 if (virq < 0) { 1334 return virq; 1335 } 1336 1337 kroute.gsi = virq; 1338 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1339 kroute.flags = 0; 1340 kroute.u.adapter.summary_addr = adapter->summary_addr; 1341 kroute.u.adapter.ind_addr = adapter->ind_addr; 1342 kroute.u.adapter.summary_offset = adapter->summary_offset; 1343 kroute.u.adapter.ind_offset = adapter->ind_offset; 1344 kroute.u.adapter.adapter_id = adapter->adapter_id; 1345 1346 kvm_add_routing_entry(s, &kroute); 1347 1348 return virq; 1349 } 1350 1351 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1352 { 1353 struct kvm_irq_routing_entry kroute = {}; 1354 int virq; 1355 1356 if (!kvm_gsi_routing_enabled()) { 1357 return -ENOSYS; 1358 } 1359 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1360 return -ENOSYS; 1361 } 1362 virq = kvm_irqchip_get_virq(s); 1363 if (virq < 0) { 1364 return virq; 1365 } 1366 1367 kroute.gsi = virq; 1368 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1369 kroute.flags = 0; 1370 kroute.u.hv_sint.vcpu = vcpu; 1371 kroute.u.hv_sint.sint = sint; 1372 1373 kvm_add_routing_entry(s, &kroute); 1374 kvm_irqchip_commit_routes(s); 1375 1376 return virq; 1377 } 1378 1379 #else /* !KVM_CAP_IRQ_ROUTING */ 1380 1381 void kvm_init_irq_routing(KVMState *s) 1382 { 1383 } 1384 1385 void kvm_irqchip_release_virq(KVMState *s, int virq) 1386 { 1387 } 1388 1389 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1390 { 1391 abort(); 1392 } 1393 1394 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1395 { 1396 return -ENOSYS; 1397 } 1398 1399 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1400 { 1401 return -ENOSYS; 1402 } 1403 1404 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1405 { 1406 return -ENOSYS; 1407 } 1408 1409 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) 1410 { 1411 abort(); 1412 } 1413 1414 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1415 { 1416 return -ENOSYS; 1417 } 1418 #endif /* !KVM_CAP_IRQ_ROUTING */ 1419 1420 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1421 EventNotifier *rn, int virq) 1422 { 1423 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), 1424 rn ? 
                                    event_notifier_get_fd(rn) : -1, virq, true);
}

int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                          int virq)
{
    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
                                    false);
}

int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
                                   EventNotifier *rn, qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
}

int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
                                      qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
}

void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
{
    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}

static void kvm_irqchip_create(MachineState *machine, KVMState *s)
{
    int ret;

    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        ;
    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
        if (ret < 0) {
            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
            exit(1);
        }
    } else {
        return;
    }

    /* First probe and see if there's an arch-specific hook to create the
     * in-kernel irqchip for us */
    ret = kvm_arch_irqchip_create(machine, s);
    if (ret == 0) {
        if (machine_kernel_irqchip_split(machine)) {
            perror("Split IRQ chip mode not supported.");
            exit(1);
        } else {
            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
        }
    }
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
        exit(1);
    }

    kvm_kernel_irqchip = true;
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;
    kvm_halt_in_kernel_allowed = true;

    kvm_init_irq_routing(s);

    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
}

/* Find number of supported CPUs using the recommended
 * procedure from the kernel API documentation to cope with
 * older kernels that may be missing capabilities.
 */
static int kvm_recommended_vcpus(KVMState *s)
{
    int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
    return (ret) ? ret : 4;
}

static int kvm_max_vcpus(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
    return (ret) ? ret : kvm_recommended_vcpus(s);
}

static int kvm_max_vcpu_id(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
    return (ret) ?
ret : kvm_max_vcpus(s); 1527 } 1528 1529 bool kvm_vcpu_id_is_valid(int vcpu_id) 1530 { 1531 KVMState *s = KVM_STATE(current_machine->accelerator); 1532 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1533 } 1534 1535 static int kvm_init(MachineState *ms) 1536 { 1537 MachineClass *mc = MACHINE_GET_CLASS(ms); 1538 static const char upgrade_note[] = 1539 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1540 "(see http://sourceforge.net/projects/kvm).\n"; 1541 struct { 1542 const char *name; 1543 int num; 1544 } num_cpus[] = { 1545 { "SMP", smp_cpus }, 1546 { "hotpluggable", max_cpus }, 1547 { NULL, } 1548 }, *nc = num_cpus; 1549 int soft_vcpus_limit, hard_vcpus_limit; 1550 KVMState *s; 1551 const KVMCapabilityInfo *missing_cap; 1552 int ret; 1553 int type = 0; 1554 const char *kvm_type; 1555 1556 s = KVM_STATE(ms->accelerator); 1557 1558 /* 1559 * On systems where the kernel can support different base page 1560 * sizes, host page size may be different from TARGET_PAGE_SIZE, 1561 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 1562 * page size for the system though. 1563 */ 1564 assert(TARGET_PAGE_SIZE <= getpagesize()); 1565 1566 s->sigmask_len = 8; 1567 1568 #ifdef KVM_CAP_SET_GUEST_DEBUG 1569 QTAILQ_INIT(&s->kvm_sw_breakpoints); 1570 #endif 1571 QLIST_INIT(&s->kvm_parked_vcpus); 1572 s->vmfd = -1; 1573 s->fd = qemu_open("/dev/kvm", O_RDWR); 1574 if (s->fd == -1) { 1575 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 1576 ret = -errno; 1577 goto err; 1578 } 1579 1580 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 1581 if (ret < KVM_API_VERSION) { 1582 if (ret >= 0) { 1583 ret = -EINVAL; 1584 } 1585 fprintf(stderr, "kvm version too old\n"); 1586 goto err; 1587 } 1588 1589 if (ret > KVM_API_VERSION) { 1590 ret = -EINVAL; 1591 fprintf(stderr, "kvm version not supported\n"); 1592 goto err; 1593 } 1594 1595 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 1596 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 1597 1598 /* If unspecified, use the default value */ 1599 if (!s->nr_slots) { 1600 s->nr_slots = 32; 1601 } 1602 1603 kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type"); 1604 if (mc->kvm_type) { 1605 type = mc->kvm_type(ms, kvm_type); 1606 } else if (kvm_type) { 1607 ret = -EINVAL; 1608 fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type); 1609 goto err; 1610 } 1611 1612 do { 1613 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 1614 } while (ret == -EINTR); 1615 1616 if (ret < 0) { 1617 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 1618 strerror(-ret)); 1619 1620 #ifdef TARGET_S390X 1621 if (ret == -EINVAL) { 1622 fprintf(stderr, 1623 "Host kernel setup problem detected. 
Please verify:\n"); 1624 fprintf(stderr, "- for kernels supporting the switch_amode or" 1625 " user_mode parameters, whether\n"); 1626 fprintf(stderr, 1627 " user space is running in primary address space\n"); 1628 fprintf(stderr, 1629 "- for kernels supporting the vm.allocate_pgste sysctl, " 1630 "whether it is enabled\n"); 1631 } 1632 #endif 1633 goto err; 1634 } 1635 1636 s->vmfd = ret; 1637 1638 /* check the vcpu limits */ 1639 soft_vcpus_limit = kvm_recommended_vcpus(s); 1640 hard_vcpus_limit = kvm_max_vcpus(s); 1641 1642 while (nc->name) { 1643 if (nc->num > soft_vcpus_limit) { 1644 warn_report("Number of %s cpus requested (%d) exceeds " 1645 "the recommended cpus supported by KVM (%d)", 1646 nc->name, nc->num, soft_vcpus_limit); 1647 1648 if (nc->num > hard_vcpus_limit) { 1649 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 1650 "the maximum cpus supported by KVM (%d)\n", 1651 nc->name, nc->num, hard_vcpus_limit); 1652 exit(1); 1653 } 1654 } 1655 nc++; 1656 } 1657 1658 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 1659 if (!missing_cap) { 1660 missing_cap = 1661 kvm_check_extension_list(s, kvm_arch_required_capabilities); 1662 } 1663 if (missing_cap) { 1664 ret = -EINVAL; 1665 fprintf(stderr, "kvm does not support %s\n%s", 1666 missing_cap->name, upgrade_note); 1667 goto err; 1668 } 1669 1670 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 1671 s->coalesced_pio = s->coalesced_mmio && 1672 kvm_check_extension(s, KVM_CAP_COALESCED_PIO); 1673 1674 #ifdef KVM_CAP_VCPU_EVENTS 1675 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 1676 #endif 1677 1678 s->robust_singlestep = 1679 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 1680 1681 #ifdef KVM_CAP_DEBUGREGS 1682 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 1683 #endif 1684 1685 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); 1686 1687 #ifdef KVM_CAP_IRQ_ROUTING 1688 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 1689 #endif 1690 1691 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 1692 1693 s->irq_set_ioctl = KVM_IRQ_LINE; 1694 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 1695 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 1696 } 1697 1698 kvm_readonly_mem_allowed = 1699 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 1700 1701 kvm_eventfds_allowed = 1702 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 1703 1704 kvm_irqfds_allowed = 1705 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 1706 1707 kvm_resamplefds_allowed = 1708 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 1709 1710 kvm_vm_attributes_allowed = 1711 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 1712 1713 kvm_ioeventfd_any_length_allowed = 1714 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 1715 1716 kvm_state = s; 1717 1718 /* 1719 * if memory encryption object is specified then initialize the memory 1720 * encryption context. 
1721 */ 1722 if (ms->memory_encryption) { 1723 kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); 1724 if (!kvm_state->memcrypt_handle) { 1725 ret = -1; 1726 goto err; 1727 } 1728 1729 kvm_state->memcrypt_encrypt_data = sev_encrypt_data; 1730 } 1731 1732 ret = kvm_arch_init(ms, s); 1733 if (ret < 0) { 1734 goto err; 1735 } 1736 1737 if (machine_kernel_irqchip_allowed(ms)) { 1738 kvm_irqchip_create(ms, s); 1739 } 1740 1741 if (kvm_eventfds_allowed) { 1742 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 1743 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 1744 } 1745 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; 1746 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; 1747 1748 kvm_memory_listener_register(s, &s->memory_listener, 1749 &address_space_memory, 0); 1750 memory_listener_register(&kvm_io_listener, 1751 &address_space_io); 1752 memory_listener_register(&kvm_coalesced_pio_listener, 1753 &address_space_io); 1754 1755 s->many_ioeventfds = kvm_check_many_ioeventfds(); 1756 1757 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 1758 if (!s->sync_mmu) { 1759 qemu_balloon_inhibit(true); 1760 } 1761 1762 return 0; 1763 1764 err: 1765 assert(ret < 0); 1766 if (s->vmfd >= 0) { 1767 close(s->vmfd); 1768 } 1769 if (s->fd != -1) { 1770 close(s->fd); 1771 } 1772 g_free(s->memory_listener.slots); 1773 1774 return ret; 1775 } 1776 1777 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 1778 { 1779 s->sigmask_len = sigmask_len; 1780 } 1781 1782 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 1783 int size, uint32_t count) 1784 { 1785 int i; 1786 uint8_t *ptr = data; 1787 1788 for (i = 0; i < count; i++) { 1789 address_space_rw(&address_space_io, port, attrs, 1790 ptr, size, 1791 direction == KVM_EXIT_IO_OUT); 1792 ptr += size; 1793 } 1794 } 1795 1796 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 1797 { 1798 fprintf(stderr, "KVM internal error. Suberror: %d\n", 1799 run->internal.suberror); 1800 1801 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 1802 int i; 1803 1804 for (i = 0; i < run->internal.ndata; ++i) { 1805 fprintf(stderr, "extra data[%d]: %"PRIx64"\n", 1806 i, (uint64_t)run->internal.data[i]); 1807 } 1808 } 1809 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 1810 fprintf(stderr, "emulation failure\n"); 1811 if (!kvm_arch_stop_on_emulation_error(cpu)) { 1812 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 1813 return EXCP_INTERRUPT; 1814 } 1815 } 1816 /* FIXME: Should trigger a qmp message to let management know 1817 * something went wrong. 
1818 */ 1819 return -1; 1820 } 1821 1822 void kvm_flush_coalesced_mmio_buffer(void) 1823 { 1824 KVMState *s = kvm_state; 1825 1826 if (s->coalesced_flush_in_progress) { 1827 return; 1828 } 1829 1830 s->coalesced_flush_in_progress = true; 1831 1832 if (s->coalesced_mmio_ring) { 1833 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 1834 while (ring->first != ring->last) { 1835 struct kvm_coalesced_mmio *ent; 1836 1837 ent = &ring->coalesced_mmio[ring->first]; 1838 1839 if (ent->pio == 1) { 1840 address_space_rw(&address_space_io, ent->phys_addr, 1841 MEMTXATTRS_UNSPECIFIED, ent->data, 1842 ent->len, true); 1843 } else { 1844 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 1845 } 1846 smp_wmb(); 1847 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 1848 } 1849 } 1850 1851 s->coalesced_flush_in_progress = false; 1852 } 1853 1854 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 1855 { 1856 if (!cpu->vcpu_dirty) { 1857 kvm_arch_get_registers(cpu); 1858 cpu->vcpu_dirty = true; 1859 } 1860 } 1861 1862 void kvm_cpu_synchronize_state(CPUState *cpu) 1863 { 1864 if (!cpu->vcpu_dirty) { 1865 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 1866 } 1867 } 1868 1869 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 1870 { 1871 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 1872 cpu->vcpu_dirty = false; 1873 } 1874 1875 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 1876 { 1877 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 1878 } 1879 1880 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 1881 { 1882 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 1883 cpu->vcpu_dirty = false; 1884 } 1885 1886 void kvm_cpu_synchronize_post_init(CPUState *cpu) 1887 { 1888 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 1889 } 1890 1891 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 1892 { 1893 cpu->vcpu_dirty = true; 1894 } 1895 1896 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 1897 { 1898 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 1899 } 1900 1901 #ifdef KVM_HAVE_MCE_INJECTION 1902 static __thread void *pending_sigbus_addr; 1903 static __thread int pending_sigbus_code; 1904 static __thread bool have_sigbus_pending; 1905 #endif 1906 1907 static void kvm_cpu_kick(CPUState *cpu) 1908 { 1909 atomic_set(&cpu->kvm_run->immediate_exit, 1); 1910 } 1911 1912 static void kvm_cpu_kick_self(void) 1913 { 1914 if (kvm_immediate_exit) { 1915 kvm_cpu_kick(current_cpu); 1916 } else { 1917 qemu_cpu_kick_self(); 1918 } 1919 } 1920 1921 static void kvm_eat_signals(CPUState *cpu) 1922 { 1923 struct timespec ts = { 0, 0 }; 1924 siginfo_t siginfo; 1925 sigset_t waitset; 1926 sigset_t chkset; 1927 int r; 1928 1929 if (kvm_immediate_exit) { 1930 atomic_set(&cpu->kvm_run->immediate_exit, 0); 1931 /* Write kvm_run->immediate_exit before the cpu->exit_request 1932 * write in kvm_cpu_exec. 
1933 */ 1934 smp_wmb(); 1935 return; 1936 } 1937 1938 sigemptyset(&waitset); 1939 sigaddset(&waitset, SIG_IPI); 1940 1941 do { 1942 r = sigtimedwait(&waitset, &siginfo, &ts); 1943 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 1944 perror("sigtimedwait"); 1945 exit(1); 1946 } 1947 1948 r = sigpending(&chkset); 1949 if (r == -1) { 1950 perror("sigpending"); 1951 exit(1); 1952 } 1953 } while (sigismember(&chkset, SIG_IPI)); 1954 } 1955 1956 int kvm_cpu_exec(CPUState *cpu) 1957 { 1958 struct kvm_run *run = cpu->kvm_run; 1959 int ret, run_ret; 1960 1961 DPRINTF("kvm_cpu_exec()\n"); 1962 1963 if (kvm_arch_process_async_events(cpu)) { 1964 atomic_set(&cpu->exit_request, 0); 1965 return EXCP_HLT; 1966 } 1967 1968 qemu_mutex_unlock_iothread(); 1969 cpu_exec_start(cpu); 1970 1971 do { 1972 MemTxAttrs attrs; 1973 1974 if (cpu->vcpu_dirty) { 1975 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 1976 cpu->vcpu_dirty = false; 1977 } 1978 1979 kvm_arch_pre_run(cpu, run); 1980 if (atomic_read(&cpu->exit_request)) { 1981 DPRINTF("interrupt exit requested\n"); 1982 /* 1983 * KVM requires us to reenter the kernel after IO exits to complete 1984 * instruction emulation. This self-signal will ensure that we 1985 * leave ASAP again. 1986 */ 1987 kvm_cpu_kick_self(); 1988 } 1989 1990 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 1991 * Matching barrier in kvm_eat_signals. 1992 */ 1993 smp_rmb(); 1994 1995 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 1996 1997 attrs = kvm_arch_post_run(cpu, run); 1998 1999 #ifdef KVM_HAVE_MCE_INJECTION 2000 if (unlikely(have_sigbus_pending)) { 2001 qemu_mutex_lock_iothread(); 2002 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 2003 pending_sigbus_addr); 2004 have_sigbus_pending = false; 2005 qemu_mutex_unlock_iothread(); 2006 } 2007 #endif 2008 2009 if (run_ret < 0) { 2010 if (run_ret == -EINTR || run_ret == -EAGAIN) { 2011 DPRINTF("io window exit\n"); 2012 kvm_eat_signals(cpu); 2013 ret = EXCP_INTERRUPT; 2014 break; 2015 } 2016 fprintf(stderr, "error: kvm run failed %s\n", 2017 strerror(-run_ret)); 2018 #ifdef TARGET_PPC 2019 if (run_ret == -EBUSY) { 2020 fprintf(stderr, 2021 "This is probably because your SMT is enabled.\n" 2022 "VCPU can only run on primary threads with all " 2023 "secondary threads offline.\n"); 2024 } 2025 #endif 2026 ret = -1; 2027 break; 2028 } 2029 2030 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 2031 switch (run->exit_reason) { 2032 case KVM_EXIT_IO: 2033 DPRINTF("handle_io\n"); 2034 /* Called outside BQL */ 2035 kvm_handle_io(run->io.port, attrs, 2036 (uint8_t *)run + run->io.data_offset, 2037 run->io.direction, 2038 run->io.size, 2039 run->io.count); 2040 ret = 0; 2041 break; 2042 case KVM_EXIT_MMIO: 2043 DPRINTF("handle_mmio\n"); 2044 /* Called outside BQL */ 2045 address_space_rw(&address_space_memory, 2046 run->mmio.phys_addr, attrs, 2047 run->mmio.data, 2048 run->mmio.len, 2049 run->mmio.is_write); 2050 ret = 0; 2051 break; 2052 case KVM_EXIT_IRQ_WINDOW_OPEN: 2053 DPRINTF("irq_window_open\n"); 2054 ret = EXCP_INTERRUPT; 2055 break; 2056 case KVM_EXIT_SHUTDOWN: 2057 DPRINTF("shutdown\n"); 2058 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2059 ret = EXCP_INTERRUPT; 2060 break; 2061 case KVM_EXIT_UNKNOWN: 2062 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 2063 (uint64_t)run->hw.hardware_exit_reason); 2064 ret = -1; 2065 break; 2066 case KVM_EXIT_INTERNAL_ERROR: 2067 ret = kvm_handle_internal_error(cpu, run); 2068 break; 2069 case KVM_EXIT_SYSTEM_EVENT: 2070 switch 
(run->system_event.type) { 2071 case KVM_SYSTEM_EVENT_SHUTDOWN: 2072 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 2073 ret = EXCP_INTERRUPT; 2074 break; 2075 case KVM_SYSTEM_EVENT_RESET: 2076 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2077 ret = EXCP_INTERRUPT; 2078 break; 2079 case KVM_SYSTEM_EVENT_CRASH: 2080 kvm_cpu_synchronize_state(cpu); 2081 qemu_mutex_lock_iothread(); 2082 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2083 qemu_mutex_unlock_iothread(); 2084 ret = 0; 2085 break; 2086 default: 2087 DPRINTF("kvm_arch_handle_exit\n"); 2088 ret = kvm_arch_handle_exit(cpu, run); 2089 break; 2090 } 2091 break; 2092 default: 2093 DPRINTF("kvm_arch_handle_exit\n"); 2094 ret = kvm_arch_handle_exit(cpu, run); 2095 break; 2096 } 2097 } while (ret == 0); 2098 2099 cpu_exec_end(cpu); 2100 qemu_mutex_lock_iothread(); 2101 2102 if (ret < 0) { 2103 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2104 vm_stop(RUN_STATE_INTERNAL_ERROR); 2105 } 2106 2107 atomic_set(&cpu->exit_request, 0); 2108 return ret; 2109 } 2110 2111 int kvm_ioctl(KVMState *s, int type, ...) 2112 { 2113 int ret; 2114 void *arg; 2115 va_list ap; 2116 2117 va_start(ap, type); 2118 arg = va_arg(ap, void *); 2119 va_end(ap); 2120 2121 trace_kvm_ioctl(type, arg); 2122 ret = ioctl(s->fd, type, arg); 2123 if (ret == -1) { 2124 ret = -errno; 2125 } 2126 return ret; 2127 } 2128 2129 int kvm_vm_ioctl(KVMState *s, int type, ...) 2130 { 2131 int ret; 2132 void *arg; 2133 va_list ap; 2134 2135 va_start(ap, type); 2136 arg = va_arg(ap, void *); 2137 va_end(ap); 2138 2139 trace_kvm_vm_ioctl(type, arg); 2140 ret = ioctl(s->vmfd, type, arg); 2141 if (ret == -1) { 2142 ret = -errno; 2143 } 2144 return ret; 2145 } 2146 2147 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 2148 { 2149 int ret; 2150 void *arg; 2151 va_list ap; 2152 2153 va_start(ap, type); 2154 arg = va_arg(ap, void *); 2155 va_end(ap); 2156 2157 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 2158 ret = ioctl(cpu->kvm_fd, type, arg); 2159 if (ret == -1) { 2160 ret = -errno; 2161 } 2162 return ret; 2163 } 2164 2165 int kvm_device_ioctl(int fd, int type, ...) 2166 { 2167 int ret; 2168 void *arg; 2169 va_list ap; 2170 2171 va_start(ap, type); 2172 arg = va_arg(ap, void *); 2173 va_end(ap); 2174 2175 trace_kvm_device_ioctl(fd, type, arg); 2176 ret = ioctl(fd, type, arg); 2177 if (ret == -1) { 2178 ret = -errno; 2179 } 2180 return ret; 2181 } 2182 2183 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 2184 { 2185 int ret; 2186 struct kvm_device_attr attribute = { 2187 .group = group, 2188 .attr = attr, 2189 }; 2190 2191 if (!kvm_vm_attributes_allowed) { 2192 return 0; 2193 } 2194 2195 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 2196 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 2197 return ret ? 0 : 1; 2198 } 2199 2200 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 2201 { 2202 struct kvm_device_attr attribute = { 2203 .group = group, 2204 .attr = attr, 2205 .flags = 0, 2206 }; 2207 2208 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1; 2209 } 2210 2211 int kvm_device_access(int fd, int group, uint64_t attr, 2212 void *val, bool write, Error **errp) 2213 { 2214 struct kvm_device_attr kvmattr; 2215 int err; 2216 2217 kvmattr.flags = 0; 2218 kvmattr.group = group; 2219 kvmattr.attr = attr; 2220 kvmattr.addr = (uintptr_t)val; 2221 2222 err = kvm_device_ioctl(fd, 2223 write ? 
bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_max_nested_state_length(void)
{
    return kvm_state->max_nested_state_len;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    int err;
};

static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *dbg_data =
        (struct kvm_set_guest_debug_data *) data.host_ptr;

    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
               RUN_ON_CPU_HOST_PTR(&data));
    return data.err;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
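/*
 * Software breakpoints are reference counted: inserting the same guest PC
 * again (for instance when the gdbstub re-inserts it) only bumps use_count
 * instead of patching the instruction twice, and only the final removal in
 * kvm_remove_breakpoint() below restores the original bytes via
 * kvm_arch_remove_sw_breakpoint().  After any change, the per-vCPU debug
 * state is pushed with kvm_update_guest_debug(), which issues
 * KVM_SET_GUEST_DEBUG on the target vCPU's own thread via run_on_cpu().
 */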
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}
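/*
 * Note on the kick mechanism set up above: when KVM_CAP_IMMEDIATE_EXIT is
 * available (kvm_immediate_exit != 0), SIG_IPI stays unblocked in the vCPU
 * thread and its handler simply kicks the vCPU via kvm_cpu_kick(), which
 * flags run->immediate_exit so the next KVM_RUN returns right away.
 * Without the capability, SIG_IPI remains blocked in user space and is only
 * delivered inside KVM_RUN through the mask installed with
 * KVM_SET_SIGNAL_MASK, forcing an -EINTR exit that kvm_eat_signals()
 * (further up in this file) later drains.
 */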
/* Called asynchronously in VCPU thread.  */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread.  */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
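
/*
 * The QOM boilerplate above registers KVM as an accelerator class; selecting
 * it at runtime is then a matter of command-line configuration, e.g.
 * (illustrative invocation, exact flags depend on the machine type):
 *
 *     qemu-system-x86_64 -accel kvm -cpu host -m 2G ...
 *
 * kvm_init(), referenced via ac->init_machine, is what actually opens
 * /dev/kvm and creates the VM file descriptor stored in KVMState.
 */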