1 /* 2 * QEMU KVM support 3 * 4 * Copyright IBM, Corp. 2008 5 * Red Hat, Inc. 2008 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Glauber Costa <gcosta@redhat.com> 10 * 11 * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 * See the COPYING file in the top-level directory. 13 * 14 */ 15 16 #include "qemu/osdep.h" 17 #include <sys/ioctl.h> 18 19 #include <linux/kvm.h> 20 21 #include "qemu/atomic.h" 22 #include "qemu/option.h" 23 #include "qemu/config-file.h" 24 #include "qemu/error-report.h" 25 #include "qapi/error.h" 26 #include "hw/hw.h" 27 #include "hw/pci/msi.h" 28 #include "hw/pci/msix.h" 29 #include "hw/s390x/adapter.h" 30 #include "exec/gdbstub.h" 31 #include "sysemu/kvm_int.h" 32 #include "sysemu/cpus.h" 33 #include "qemu/bswap.h" 34 #include "exec/memory.h" 35 #include "exec/ram_addr.h" 36 #include "exec/address-spaces.h" 37 #include "qemu/event_notifier.h" 38 #include "trace.h" 39 #include "hw/irq.h" 40 #include "sysemu/sev.h" 41 #include "sysemu/balloon.h" 42 43 #include "hw/boards.h" 44 45 /* This check must be after config-host.h is included */ 46 #ifdef CONFIG_EVENTFD 47 #include <sys/eventfd.h> 48 #endif 49 50 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We 51 * need to use the real host PAGE_SIZE, as that's what KVM will use. 52 */ 53 #define PAGE_SIZE getpagesize() 54 55 //#define DEBUG_KVM 56 57 #ifdef DEBUG_KVM 58 #define DPRINTF(fmt, ...) \ 59 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) 60 #else 61 #define DPRINTF(fmt, ...) \ 62 do { } while (0) 63 #endif 64 65 #define KVM_MSI_HASHTAB_SIZE 256 66 67 struct KVMParkedVcpu { 68 unsigned long vcpu_id; 69 int kvm_fd; 70 QLIST_ENTRY(KVMParkedVcpu) node; 71 }; 72 73 struct KVMState 74 { 75 AccelState parent_obj; 76 77 int nr_slots; 78 int fd; 79 int vmfd; 80 int coalesced_mmio; 81 int coalesced_pio; 82 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 83 bool coalesced_flush_in_progress; 84 int vcpu_events; 85 int robust_singlestep; 86 int debugregs; 87 #ifdef KVM_CAP_SET_GUEST_DEBUG 88 QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; 89 #endif 90 int max_nested_state_len; 91 int many_ioeventfds; 92 int intx_set_mask; 93 bool sync_mmu; 94 /* The man page (and posix) say ioctl numbers are signed int, but 95 * they're not. 
Linux, glibc and *BSD all treat ioctl numbers as 96 * unsigned, and treating them as signed here can break things */ 97 unsigned irq_set_ioctl; 98 unsigned int sigmask_len; 99 GHashTable *gsimap; 100 #ifdef KVM_CAP_IRQ_ROUTING 101 struct kvm_irq_routing *irq_routes; 102 int nr_allocated_irq_routes; 103 unsigned long *used_gsi_bitmap; 104 unsigned int gsi_count; 105 QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; 106 #endif 107 KVMMemoryListener memory_listener; 108 QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; 109 110 /* memory encryption */ 111 void *memcrypt_handle; 112 int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); 113 }; 114 115 KVMState *kvm_state; 116 bool kvm_kernel_irqchip; 117 bool kvm_split_irqchip; 118 bool kvm_async_interrupts_allowed; 119 bool kvm_halt_in_kernel_allowed; 120 bool kvm_eventfds_allowed; 121 bool kvm_irqfds_allowed; 122 bool kvm_resamplefds_allowed; 123 bool kvm_msi_via_irqfd_allowed; 124 bool kvm_gsi_routing_allowed; 125 bool kvm_gsi_direct_mapping; 126 bool kvm_allowed; 127 bool kvm_readonly_mem_allowed; 128 bool kvm_vm_attributes_allowed; 129 bool kvm_direct_msi_allowed; 130 bool kvm_ioeventfd_any_length_allowed; 131 bool kvm_msi_use_devid; 132 static bool kvm_immediate_exit; 133 134 static const KVMCapabilityInfo kvm_required_capabilites[] = { 135 KVM_CAP_INFO(USER_MEMORY), 136 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), 137 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), 138 KVM_CAP_LAST_INFO 139 }; 140 141 int kvm_get_max_memslots(void) 142 { 143 KVMState *s = KVM_STATE(current_machine->accelerator); 144 145 return s->nr_slots; 146 } 147 148 bool kvm_memcrypt_enabled(void) 149 { 150 if (kvm_state && kvm_state->memcrypt_handle) { 151 return true; 152 } 153 154 return false; 155 } 156 157 int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) 158 { 159 if (kvm_state->memcrypt_handle && 160 kvm_state->memcrypt_encrypt_data) { 161 return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, 162 ptr, len); 163 } 164 165 return 1; 166 } 167 168 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 169 { 170 KVMState *s = kvm_state; 171 int i; 172 173 for (i = 0; i < s->nr_slots; i++) { 174 if (kml->slots[i].memory_size == 0) { 175 return &kml->slots[i]; 176 } 177 } 178 179 return NULL; 180 } 181 182 bool kvm_has_free_slot(MachineState *ms) 183 { 184 KVMState *s = KVM_STATE(ms->accelerator); 185 186 return kvm_get_free_slot(&s->memory_listener); 187 } 188 189 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 190 { 191 KVMSlot *slot = kvm_get_free_slot(kml); 192 193 if (slot) { 194 return slot; 195 } 196 197 fprintf(stderr, "%s: no free slot available\n", __func__); 198 abort(); 199 } 200 201 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 202 hwaddr start_addr, 203 hwaddr size) 204 { 205 KVMState *s = kvm_state; 206 int i; 207 208 for (i = 0; i < s->nr_slots; i++) { 209 KVMSlot *mem = &kml->slots[i]; 210 211 if (start_addr == mem->start_addr && size == mem->memory_size) { 212 return mem; 213 } 214 } 215 216 return NULL; 217 } 218 219 /* 220 * Calculate and align the start address and the size of the section. 221 * Return the size. If the size is 0, the aligned section is empty. 222 */ 223 static hwaddr kvm_align_section(MemoryRegionSection *section, 224 hwaddr *start) 225 { 226 hwaddr size = int128_get64(section->size); 227 hwaddr delta, aligned; 228 229 /* kvm works in page size chunks, but the function may be called 230 with sub-page size and unaligned start address. 
Pad the start 231 address to next and truncate size to previous page boundary. */ 232 aligned = ROUND_UP(section->offset_within_address_space, 233 qemu_real_host_page_size); 234 delta = aligned - section->offset_within_address_space; 235 *start = aligned; 236 if (delta > size) { 237 return 0; 238 } 239 240 return (size - delta) & qemu_real_host_page_mask; 241 } 242 243 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 244 hwaddr *phys_addr) 245 { 246 KVMMemoryListener *kml = &s->memory_listener; 247 int i; 248 249 for (i = 0; i < s->nr_slots; i++) { 250 KVMSlot *mem = &kml->slots[i]; 251 252 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 253 *phys_addr = mem->start_addr + (ram - mem->ram); 254 return 1; 255 } 256 } 257 258 return 0; 259 } 260 261 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) 262 { 263 KVMState *s = kvm_state; 264 struct kvm_userspace_memory_region mem; 265 int ret; 266 267 mem.slot = slot->slot | (kml->as_id << 16); 268 mem.guest_phys_addr = slot->start_addr; 269 mem.userspace_addr = (unsigned long)slot->ram; 270 mem.flags = slot->flags; 271 272 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { 273 /* Set the slot size to 0 before setting the slot to the desired 274 * value. This is needed based on KVM commit 75d61fbc. */ 275 mem.memory_size = 0; 276 kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 277 } 278 mem.memory_size = slot->memory_size; 279 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 280 slot->old_flags = mem.flags; 281 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 282 mem.memory_size, mem.userspace_addr, ret); 283 return ret; 284 } 285 286 int kvm_destroy_vcpu(CPUState *cpu) 287 { 288 KVMState *s = kvm_state; 289 long mmap_size; 290 struct KVMParkedVcpu *vcpu = NULL; 291 int ret = 0; 292 293 DPRINTF("kvm_destroy_vcpu\n"); 294 295 ret = kvm_arch_destroy_vcpu(cpu); 296 if (ret < 0) { 297 goto err; 298 } 299 300 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 301 if (mmap_size < 0) { 302 ret = mmap_size; 303 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 304 goto err; 305 } 306 307 ret = munmap(cpu->kvm_run, mmap_size); 308 if (ret < 0) { 309 goto err; 310 } 311 312 vcpu = g_malloc0(sizeof(*vcpu)); 313 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 314 vcpu->kvm_fd = cpu->kvm_fd; 315 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 316 err: 317 return ret; 318 } 319 320 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 321 { 322 struct KVMParkedVcpu *cpu; 323 324 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 325 if (cpu->vcpu_id == vcpu_id) { 326 int kvm_fd; 327 328 QLIST_REMOVE(cpu, node); 329 kvm_fd = cpu->kvm_fd; 330 g_free(cpu); 331 return kvm_fd; 332 } 333 } 334 335 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 336 } 337 338 int kvm_init_vcpu(CPUState *cpu) 339 { 340 KVMState *s = kvm_state; 341 long mmap_size; 342 int ret; 343 344 DPRINTF("kvm_init_vcpu\n"); 345 346 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 347 if (ret < 0) { 348 DPRINTF("kvm_create_vcpu failed\n"); 349 goto err; 350 } 351 352 cpu->kvm_fd = ret; 353 cpu->kvm_state = s; 354 cpu->vcpu_dirty = true; 355 356 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 357 if (mmap_size < 0) { 358 ret = mmap_size; 359 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 360 goto err; 361 } 362 363 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 364 cpu->kvm_fd, 0); 365 if (cpu->kvm_run == MAP_FAILED) { 366 ret = -errno; 
367 DPRINTF("mmap'ing vcpu state failed\n"); 368 goto err; 369 } 370 371 if (s->coalesced_mmio && !s->coalesced_mmio_ring) { 372 s->coalesced_mmio_ring = 373 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; 374 } 375 376 ret = kvm_arch_init_vcpu(cpu); 377 err: 378 return ret; 379 } 380 381 /* 382 * dirty pages logging control 383 */ 384 385 static int kvm_mem_flags(MemoryRegion *mr) 386 { 387 bool readonly = mr->readonly || memory_region_is_romd(mr); 388 int flags = 0; 389 390 if (memory_region_get_dirty_log_mask(mr) != 0) { 391 flags |= KVM_MEM_LOG_DIRTY_PAGES; 392 } 393 if (readonly && kvm_readonly_mem_allowed) { 394 flags |= KVM_MEM_READONLY; 395 } 396 return flags; 397 } 398 399 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, 400 MemoryRegion *mr) 401 { 402 mem->flags = kvm_mem_flags(mr); 403 404 /* If nothing changed effectively, no need to issue ioctl */ 405 if (mem->flags == mem->old_flags) { 406 return 0; 407 } 408 409 return kvm_set_user_memory_region(kml, mem, false); 410 } 411 412 static int kvm_section_update_flags(KVMMemoryListener *kml, 413 MemoryRegionSection *section) 414 { 415 hwaddr start_addr, size; 416 KVMSlot *mem; 417 418 size = kvm_align_section(section, &start_addr); 419 if (!size) { 420 return 0; 421 } 422 423 mem = kvm_lookup_matching_slot(kml, start_addr, size); 424 if (!mem) { 425 /* We don't have a slot if we want to trap every access. */ 426 return 0; 427 } 428 429 return kvm_slot_update_flags(kml, mem, section->mr); 430 } 431 432 static void kvm_log_start(MemoryListener *listener, 433 MemoryRegionSection *section, 434 int old, int new) 435 { 436 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 437 int r; 438 439 if (old != 0) { 440 return; 441 } 442 443 r = kvm_section_update_flags(kml, section); 444 if (r < 0) { 445 abort(); 446 } 447 } 448 449 static void kvm_log_stop(MemoryListener *listener, 450 MemoryRegionSection *section, 451 int old, int new) 452 { 453 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 454 int r; 455 456 if (new != 0) { 457 return; 458 } 459 460 r = kvm_section_update_flags(kml, section); 461 if (r < 0) { 462 abort(); 463 } 464 } 465 466 /* get kvm's dirty pages bitmap and update qemu's */ 467 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, 468 unsigned long *bitmap) 469 { 470 ram_addr_t start = section->offset_within_region + 471 memory_region_get_ram_addr(section->mr); 472 ram_addr_t pages = int128_get64(section->size) / getpagesize(); 473 474 cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); 475 return 0; 476 } 477 478 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) 479 480 /** 481 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space 482 * 483 * This function will first try to fetch dirty bitmap from the kernel, 484 * and then updates qemu's dirty bitmap. 485 * 486 * @kml: the KVM memory listener object 487 * @section: the memory section to sync the dirty bitmap with 488 */ 489 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, 490 MemoryRegionSection *section) 491 { 492 KVMState *s = kvm_state; 493 struct kvm_dirty_log d = {}; 494 KVMSlot *mem; 495 hwaddr start_addr, size; 496 497 size = kvm_align_section(section, &start_addr); 498 if (size) { 499 mem = kvm_lookup_matching_slot(kml, start_addr, size); 500 if (!mem) { 501 /* We don't have a slot if we want to trap every access. 
*/ 502 return 0; 503 } 504 505 /* XXX bad kernel interface alert 506 * For dirty bitmap, kernel allocates array of size aligned to 507 * bits-per-long. But for case when the kernel is 64bits and 508 * the userspace is 32bits, userspace can't align to the same 509 * bits-per-long, since sizeof(long) is different between kernel 510 * and user space. This way, userspace will provide buffer which 511 * may be 4 bytes less than the kernel will use, resulting in 512 * userspace memory corruption (which is not detectable by valgrind 513 * too, in most cases). 514 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in 515 * a hope that sizeof(long) won't become >8 any time soon. 516 */ 517 size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), 518 /*HOST_LONG_BITS*/ 64) / 8; 519 if (!mem->dirty_bmap) { 520 /* Allocate on the first log_sync, once and for all */ 521 mem->dirty_bmap = g_malloc0(size); 522 } 523 524 d.dirty_bitmap = mem->dirty_bmap; 525 d.slot = mem->slot | (kml->as_id << 16); 526 if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { 527 DPRINTF("ioctl failed %d\n", errno); 528 return -1; 529 } 530 531 kvm_get_dirty_pages_log_range(section, d.dirty_bitmap); 532 } 533 534 return 0; 535 } 536 537 static void kvm_coalesce_mmio_region(MemoryListener *listener, 538 MemoryRegionSection *secion, 539 hwaddr start, hwaddr size) 540 { 541 KVMState *s = kvm_state; 542 543 if (s->coalesced_mmio) { 544 struct kvm_coalesced_mmio_zone zone; 545 546 zone.addr = start; 547 zone.size = size; 548 zone.pad = 0; 549 550 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 551 } 552 } 553 554 static void kvm_uncoalesce_mmio_region(MemoryListener *listener, 555 MemoryRegionSection *secion, 556 hwaddr start, hwaddr size) 557 { 558 KVMState *s = kvm_state; 559 560 if (s->coalesced_mmio) { 561 struct kvm_coalesced_mmio_zone zone; 562 563 zone.addr = start; 564 zone.size = size; 565 zone.pad = 0; 566 567 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 568 } 569 } 570 571 static void kvm_coalesce_pio_add(MemoryListener *listener, 572 MemoryRegionSection *section, 573 hwaddr start, hwaddr size) 574 { 575 KVMState *s = kvm_state; 576 577 if (s->coalesced_pio) { 578 struct kvm_coalesced_mmio_zone zone; 579 580 zone.addr = start; 581 zone.size = size; 582 zone.pio = 1; 583 584 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 585 } 586 } 587 588 static void kvm_coalesce_pio_del(MemoryListener *listener, 589 MemoryRegionSection *section, 590 hwaddr start, hwaddr size) 591 { 592 KVMState *s = kvm_state; 593 594 if (s->coalesced_pio) { 595 struct kvm_coalesced_mmio_zone zone; 596 597 zone.addr = start; 598 zone.size = size; 599 zone.pio = 1; 600 601 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 602 } 603 } 604 605 static MemoryListener kvm_coalesced_pio_listener = { 606 .coalesced_io_add = kvm_coalesce_pio_add, 607 .coalesced_io_del = kvm_coalesce_pio_del, 608 }; 609 610 int kvm_check_extension(KVMState *s, unsigned int extension) 611 { 612 int ret; 613 614 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); 615 if (ret < 0) { 616 ret = 0; 617 } 618 619 return ret; 620 } 621 622 int kvm_vm_check_extension(KVMState *s, unsigned int extension) 623 { 624 int ret; 625 626 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); 627 if (ret < 0) { 628 /* VM wide version not implemented, use global one instead */ 629 ret = kvm_check_extension(s, extension); 630 } 631 632 return ret; 633 } 634 635 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) 636 { 
637 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) 638 /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN 639 * endianness, but the memory core hands them in target endianness. 640 * For example, PPC is always treated as big-endian even if running 641 * on KVM and on PPC64LE. Correct here. 642 */ 643 switch (size) { 644 case 2: 645 val = bswap16(val); 646 break; 647 case 4: 648 val = bswap32(val); 649 break; 650 } 651 #endif 652 return val; 653 } 654 655 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 656 bool assign, uint32_t size, bool datamatch) 657 { 658 int ret; 659 struct kvm_ioeventfd iofd = { 660 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 661 .addr = addr, 662 .len = size, 663 .flags = 0, 664 .fd = fd, 665 }; 666 667 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size, 668 datamatch); 669 if (!kvm_enabled()) { 670 return -ENOSYS; 671 } 672 673 if (datamatch) { 674 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 675 } 676 if (!assign) { 677 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 678 } 679 680 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 681 682 if (ret < 0) { 683 return -errno; 684 } 685 686 return 0; 687 } 688 689 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 690 bool assign, uint32_t size, bool datamatch) 691 { 692 struct kvm_ioeventfd kick = { 693 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 694 .addr = addr, 695 .flags = KVM_IOEVENTFD_FLAG_PIO, 696 .len = size, 697 .fd = fd, 698 }; 699 int r; 700 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch); 701 if (!kvm_enabled()) { 702 return -ENOSYS; 703 } 704 if (datamatch) { 705 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 706 } 707 if (!assign) { 708 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 709 } 710 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 711 if (r < 0) { 712 return r; 713 } 714 return 0; 715 } 716 717 718 static int kvm_check_many_ioeventfds(void) 719 { 720 /* Userspace can use ioeventfd for io notification. This requires a host 721 * that supports eventfd(2) and an I/O thread; since eventfd does not 722 * support SIGIO it cannot interrupt the vcpu. 723 * 724 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 725 * can avoid creating too many ioeventfds. 
726 */ 727 #if defined(CONFIG_EVENTFD) 728 int ioeventfds[7]; 729 int i, ret = 0; 730 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 731 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 732 if (ioeventfds[i] < 0) { 733 break; 734 } 735 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 736 if (ret < 0) { 737 close(ioeventfds[i]); 738 break; 739 } 740 } 741 742 /* Decide whether many devices are supported or not */ 743 ret = i == ARRAY_SIZE(ioeventfds); 744 745 while (i-- > 0) { 746 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 747 close(ioeventfds[i]); 748 } 749 return ret; 750 #else 751 return 0; 752 #endif 753 } 754 755 static const KVMCapabilityInfo * 756 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 757 { 758 while (list->name) { 759 if (!kvm_check_extension(s, list->value)) { 760 return list; 761 } 762 list++; 763 } 764 return NULL; 765 } 766 767 static void kvm_set_phys_mem(KVMMemoryListener *kml, 768 MemoryRegionSection *section, bool add) 769 { 770 KVMSlot *mem; 771 int err; 772 MemoryRegion *mr = section->mr; 773 bool writeable = !mr->readonly && !mr->rom_device; 774 hwaddr start_addr, size; 775 void *ram; 776 777 if (!memory_region_is_ram(mr)) { 778 if (writeable || !kvm_readonly_mem_allowed) { 779 return; 780 } else if (!mr->romd_mode) { 781 /* If the memory device is not in romd_mode, then we actually want 782 * to remove the kvm memory slot so all accesses will trap. */ 783 add = false; 784 } 785 } 786 787 size = kvm_align_section(section, &start_addr); 788 if (!size) { 789 return; 790 } 791 792 /* use aligned delta to align the ram address */ 793 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 794 (start_addr - section->offset_within_address_space); 795 796 if (!add) { 797 mem = kvm_lookup_matching_slot(kml, start_addr, size); 798 if (!mem) { 799 return; 800 } 801 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 802 kvm_physical_sync_dirty_bitmap(kml, section); 803 } 804 805 /* unregister the slot */ 806 g_free(mem->dirty_bmap); 807 mem->dirty_bmap = NULL; 808 mem->memory_size = 0; 809 mem->flags = 0; 810 err = kvm_set_user_memory_region(kml, mem, false); 811 if (err) { 812 fprintf(stderr, "%s: error unregistering slot: %s\n", 813 __func__, strerror(-err)); 814 abort(); 815 } 816 return; 817 } 818 819 /* register the new slot */ 820 mem = kvm_alloc_slot(kml); 821 mem->memory_size = size; 822 mem->start_addr = start_addr; 823 mem->ram = ram; 824 mem->flags = kvm_mem_flags(mr); 825 826 err = kvm_set_user_memory_region(kml, mem, true); 827 if (err) { 828 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 829 strerror(-err)); 830 abort(); 831 } 832 } 833 834 static void kvm_region_add(MemoryListener *listener, 835 MemoryRegionSection *section) 836 { 837 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 838 839 memory_region_ref(section->mr); 840 kvm_set_phys_mem(kml, section, true); 841 } 842 843 static void kvm_region_del(MemoryListener *listener, 844 MemoryRegionSection *section) 845 { 846 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 847 848 kvm_set_phys_mem(kml, section, false); 849 memory_region_unref(section->mr); 850 } 851 852 static void kvm_log_sync(MemoryListener *listener, 853 MemoryRegionSection *section) 854 { 855 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 856 int r; 857 858 r = kvm_physical_sync_dirty_bitmap(kml, section); 859 if (r < 0) { 860 abort(); 861 } 862 } 863 864 static void 
kvm_mem_ioeventfd_add(MemoryListener *listener, 865 MemoryRegionSection *section, 866 bool match_data, uint64_t data, 867 EventNotifier *e) 868 { 869 int fd = event_notifier_get_fd(e); 870 int r; 871 872 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 873 data, true, int128_get64(section->size), 874 match_data); 875 if (r < 0) { 876 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 877 __func__, strerror(-r), -r); 878 abort(); 879 } 880 } 881 882 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 883 MemoryRegionSection *section, 884 bool match_data, uint64_t data, 885 EventNotifier *e) 886 { 887 int fd = event_notifier_get_fd(e); 888 int r; 889 890 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 891 data, false, int128_get64(section->size), 892 match_data); 893 if (r < 0) { 894 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 895 __func__, strerror(-r), -r); 896 abort(); 897 } 898 } 899 900 static void kvm_io_ioeventfd_add(MemoryListener *listener, 901 MemoryRegionSection *section, 902 bool match_data, uint64_t data, 903 EventNotifier *e) 904 { 905 int fd = event_notifier_get_fd(e); 906 int r; 907 908 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 909 data, true, int128_get64(section->size), 910 match_data); 911 if (r < 0) { 912 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 913 __func__, strerror(-r), -r); 914 abort(); 915 } 916 } 917 918 static void kvm_io_ioeventfd_del(MemoryListener *listener, 919 MemoryRegionSection *section, 920 bool match_data, uint64_t data, 921 EventNotifier *e) 922 923 { 924 int fd = event_notifier_get_fd(e); 925 int r; 926 927 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 928 data, false, int128_get64(section->size), 929 match_data); 930 if (r < 0) { 931 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 932 __func__, strerror(-r), -r); 933 abort(); 934 } 935 } 936 937 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 938 AddressSpace *as, int as_id) 939 { 940 int i; 941 942 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 943 kml->as_id = as_id; 944 945 for (i = 0; i < s->nr_slots; i++) { 946 kml->slots[i].slot = i; 947 } 948 949 kml->listener.region_add = kvm_region_add; 950 kml->listener.region_del = kvm_region_del; 951 kml->listener.log_start = kvm_log_start; 952 kml->listener.log_stop = kvm_log_stop; 953 kml->listener.log_sync = kvm_log_sync; 954 kml->listener.priority = 10; 955 956 memory_listener_register(&kml->listener, as); 957 } 958 959 static MemoryListener kvm_io_listener = { 960 .eventfd_add = kvm_io_ioeventfd_add, 961 .eventfd_del = kvm_io_ioeventfd_del, 962 .priority = 10, 963 }; 964 965 int kvm_set_irq(KVMState *s, int irq, int level) 966 { 967 struct kvm_irq_level event; 968 int ret; 969 970 assert(kvm_async_interrupts_enabled()); 971 972 event.level = level; 973 event.irq = irq; 974 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 975 if (ret < 0) { 976 perror("kvm_set_irq"); 977 abort(); 978 } 979 980 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 
1 : event.status; 981 } 982 983 #ifdef KVM_CAP_IRQ_ROUTING 984 typedef struct KVMMSIRoute { 985 struct kvm_irq_routing_entry kroute; 986 QTAILQ_ENTRY(KVMMSIRoute) entry; 987 } KVMMSIRoute; 988 989 static void set_gsi(KVMState *s, unsigned int gsi) 990 { 991 set_bit(gsi, s->used_gsi_bitmap); 992 } 993 994 static void clear_gsi(KVMState *s, unsigned int gsi) 995 { 996 clear_bit(gsi, s->used_gsi_bitmap); 997 } 998 999 void kvm_init_irq_routing(KVMState *s) 1000 { 1001 int gsi_count, i; 1002 1003 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 1004 if (gsi_count > 0) { 1005 /* Round up so we can search ints using ffs */ 1006 s->used_gsi_bitmap = bitmap_new(gsi_count); 1007 s->gsi_count = gsi_count; 1008 } 1009 1010 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 1011 s->nr_allocated_irq_routes = 0; 1012 1013 if (!kvm_direct_msi_allowed) { 1014 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 1015 QTAILQ_INIT(&s->msi_hashtab[i]); 1016 } 1017 } 1018 1019 kvm_arch_init_irq_routing(s); 1020 } 1021 1022 void kvm_irqchip_commit_routes(KVMState *s) 1023 { 1024 int ret; 1025 1026 if (kvm_gsi_direct_mapping()) { 1027 return; 1028 } 1029 1030 if (!kvm_gsi_routing_enabled()) { 1031 return; 1032 } 1033 1034 s->irq_routes->flags = 0; 1035 trace_kvm_irqchip_commit_routes(); 1036 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 1037 assert(ret == 0); 1038 } 1039 1040 static void kvm_add_routing_entry(KVMState *s, 1041 struct kvm_irq_routing_entry *entry) 1042 { 1043 struct kvm_irq_routing_entry *new; 1044 int n, size; 1045 1046 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 1047 n = s->nr_allocated_irq_routes * 2; 1048 if (n < 64) { 1049 n = 64; 1050 } 1051 size = sizeof(struct kvm_irq_routing); 1052 size += n * sizeof(*new); 1053 s->irq_routes = g_realloc(s->irq_routes, size); 1054 s->nr_allocated_irq_routes = n; 1055 } 1056 n = s->irq_routes->nr++; 1057 new = &s->irq_routes->entries[n]; 1058 1059 *new = *entry; 1060 1061 set_gsi(s, entry->gsi); 1062 } 1063 1064 static int kvm_update_routing_entry(KVMState *s, 1065 struct kvm_irq_routing_entry *new_entry) 1066 { 1067 struct kvm_irq_routing_entry *entry; 1068 int n; 1069 1070 for (n = 0; n < s->irq_routes->nr; n++) { 1071 entry = &s->irq_routes->entries[n]; 1072 if (entry->gsi != new_entry->gsi) { 1073 continue; 1074 } 1075 1076 if(!memcmp(entry, new_entry, sizeof *entry)) { 1077 return 0; 1078 } 1079 1080 *entry = *new_entry; 1081 1082 return 0; 1083 } 1084 1085 return -ESRCH; 1086 } 1087 1088 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1089 { 1090 struct kvm_irq_routing_entry e = {}; 1091 1092 assert(pin < s->gsi_count); 1093 1094 e.gsi = irq; 1095 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1096 e.flags = 0; 1097 e.u.irqchip.irqchip = irqchip; 1098 e.u.irqchip.pin = pin; 1099 kvm_add_routing_entry(s, &e); 1100 } 1101 1102 void kvm_irqchip_release_virq(KVMState *s, int virq) 1103 { 1104 struct kvm_irq_routing_entry *e; 1105 int i; 1106 1107 if (kvm_gsi_direct_mapping()) { 1108 return; 1109 } 1110 1111 for (i = 0; i < s->irq_routes->nr; i++) { 1112 e = &s->irq_routes->entries[i]; 1113 if (e->gsi == virq) { 1114 s->irq_routes->nr--; 1115 *e = s->irq_routes->entries[s->irq_routes->nr]; 1116 } 1117 } 1118 clear_gsi(s, virq); 1119 kvm_arch_release_virq_post(virq); 1120 trace_kvm_irqchip_release_virq(virq); 1121 } 1122 1123 static unsigned int kvm_hash_msi(uint32_t data) 1124 { 1125 /* This is optimized for IA32 MSI layout. 
However, no other arch shall 1126 * repeat the mistake of not providing a direct MSI injection API. */ 1127 return data & 0xff; 1128 } 1129 1130 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1131 { 1132 KVMMSIRoute *route, *next; 1133 unsigned int hash; 1134 1135 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1136 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1137 kvm_irqchip_release_virq(s, route->kroute.gsi); 1138 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1139 g_free(route); 1140 } 1141 } 1142 } 1143 1144 static int kvm_irqchip_get_virq(KVMState *s) 1145 { 1146 int next_virq; 1147 1148 /* 1149 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1150 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1151 * number can succeed even though a new route entry cannot be added. 1152 * When this happens, flush dynamic MSI entries to free IRQ route entries. 1153 */ 1154 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1155 kvm_flush_dynamic_msi_routes(s); 1156 } 1157 1158 /* Return the lowest unused GSI in the bitmap */ 1159 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1160 if (next_virq >= s->gsi_count) { 1161 return -ENOSPC; 1162 } else { 1163 return next_virq; 1164 } 1165 } 1166 1167 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1168 { 1169 unsigned int hash = kvm_hash_msi(msg.data); 1170 KVMMSIRoute *route; 1171 1172 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1173 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1174 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1175 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1176 return route; 1177 } 1178 } 1179 return NULL; 1180 } 1181 1182 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1183 { 1184 struct kvm_msi msi; 1185 KVMMSIRoute *route; 1186 1187 if (kvm_direct_msi_allowed) { 1188 msi.address_lo = (uint32_t)msg.address; 1189 msi.address_hi = msg.address >> 32; 1190 msi.data = le32_to_cpu(msg.data); 1191 msi.flags = 0; 1192 memset(msi.pad, 0, sizeof(msi.pad)); 1193 1194 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1195 } 1196 1197 route = kvm_lookup_msi_route(s, msg); 1198 if (!route) { 1199 int virq; 1200 1201 virq = kvm_irqchip_get_virq(s); 1202 if (virq < 0) { 1203 return virq; 1204 } 1205 1206 route = g_malloc0(sizeof(KVMMSIRoute)); 1207 route->kroute.gsi = virq; 1208 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1209 route->kroute.flags = 0; 1210 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1211 route->kroute.u.msi.address_hi = msg.address >> 32; 1212 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1213 1214 kvm_add_routing_entry(s, &route->kroute); 1215 kvm_irqchip_commit_routes(s); 1216 1217 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1218 entry); 1219 } 1220 1221 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1222 1223 return kvm_set_irq(s, route->kroute.gsi, 1); 1224 } 1225 1226 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1227 { 1228 struct kvm_irq_routing_entry kroute = {}; 1229 int virq; 1230 MSIMessage msg = {0, 0}; 1231 1232 if (pci_available && dev) { 1233 msg = pci_get_msi_message(dev, vector); 1234 } 1235 1236 if (kvm_gsi_direct_mapping()) { 1237 return kvm_arch_msi_data_to_gsi(msg.data); 1238 } 1239 1240 if (!kvm_gsi_routing_enabled()) { 1241 return -ENOSYS; 1242 } 1243 1244 virq = kvm_irqchip_get_virq(s); 1245 if (virq < 0) { 1246 return virq; 1247 } 1248 1249 kroute.gsi = virq; 
1250 kroute.type = KVM_IRQ_ROUTING_MSI; 1251 kroute.flags = 0; 1252 kroute.u.msi.address_lo = (uint32_t)msg.address; 1253 kroute.u.msi.address_hi = msg.address >> 32; 1254 kroute.u.msi.data = le32_to_cpu(msg.data); 1255 if (pci_available && kvm_msi_devid_required()) { 1256 kroute.flags = KVM_MSI_VALID_DEVID; 1257 kroute.u.msi.devid = pci_requester_id(dev); 1258 } 1259 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1260 kvm_irqchip_release_virq(s, virq); 1261 return -EINVAL; 1262 } 1263 1264 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A", 1265 vector, virq); 1266 1267 kvm_add_routing_entry(s, &kroute); 1268 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1269 kvm_irqchip_commit_routes(s); 1270 1271 return virq; 1272 } 1273 1274 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1275 PCIDevice *dev) 1276 { 1277 struct kvm_irq_routing_entry kroute = {}; 1278 1279 if (kvm_gsi_direct_mapping()) { 1280 return 0; 1281 } 1282 1283 if (!kvm_irqchip_in_kernel()) { 1284 return -ENOSYS; 1285 } 1286 1287 kroute.gsi = virq; 1288 kroute.type = KVM_IRQ_ROUTING_MSI; 1289 kroute.flags = 0; 1290 kroute.u.msi.address_lo = (uint32_t)msg.address; 1291 kroute.u.msi.address_hi = msg.address >> 32; 1292 kroute.u.msi.data = le32_to_cpu(msg.data); 1293 if (pci_available && kvm_msi_devid_required()) { 1294 kroute.flags = KVM_MSI_VALID_DEVID; 1295 kroute.u.msi.devid = pci_requester_id(dev); 1296 } 1297 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1298 return -EINVAL; 1299 } 1300 1301 trace_kvm_irqchip_update_msi_route(virq); 1302 1303 return kvm_update_routing_entry(s, &kroute); 1304 } 1305 1306 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, 1307 bool assign) 1308 { 1309 struct kvm_irqfd irqfd = { 1310 .fd = fd, 1311 .gsi = virq, 1312 .flags = assign ? 
0 : KVM_IRQFD_FLAG_DEASSIGN, 1313 }; 1314 1315 if (rfd != -1) { 1316 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1317 irqfd.resamplefd = rfd; 1318 } 1319 1320 if (!kvm_irqfds_enabled()) { 1321 return -ENOSYS; 1322 } 1323 1324 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1325 } 1326 1327 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1328 { 1329 struct kvm_irq_routing_entry kroute = {}; 1330 int virq; 1331 1332 if (!kvm_gsi_routing_enabled()) { 1333 return -ENOSYS; 1334 } 1335 1336 virq = kvm_irqchip_get_virq(s); 1337 if (virq < 0) { 1338 return virq; 1339 } 1340 1341 kroute.gsi = virq; 1342 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1343 kroute.flags = 0; 1344 kroute.u.adapter.summary_addr = adapter->summary_addr; 1345 kroute.u.adapter.ind_addr = adapter->ind_addr; 1346 kroute.u.adapter.summary_offset = adapter->summary_offset; 1347 kroute.u.adapter.ind_offset = adapter->ind_offset; 1348 kroute.u.adapter.adapter_id = adapter->adapter_id; 1349 1350 kvm_add_routing_entry(s, &kroute); 1351 1352 return virq; 1353 } 1354 1355 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1356 { 1357 struct kvm_irq_routing_entry kroute = {}; 1358 int virq; 1359 1360 if (!kvm_gsi_routing_enabled()) { 1361 return -ENOSYS; 1362 } 1363 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1364 return -ENOSYS; 1365 } 1366 virq = kvm_irqchip_get_virq(s); 1367 if (virq < 0) { 1368 return virq; 1369 } 1370 1371 kroute.gsi = virq; 1372 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1373 kroute.flags = 0; 1374 kroute.u.hv_sint.vcpu = vcpu; 1375 kroute.u.hv_sint.sint = sint; 1376 1377 kvm_add_routing_entry(s, &kroute); 1378 kvm_irqchip_commit_routes(s); 1379 1380 return virq; 1381 } 1382 1383 #else /* !KVM_CAP_IRQ_ROUTING */ 1384 1385 void kvm_init_irq_routing(KVMState *s) 1386 { 1387 } 1388 1389 void kvm_irqchip_release_virq(KVMState *s, int virq) 1390 { 1391 } 1392 1393 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1394 { 1395 abort(); 1396 } 1397 1398 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1399 { 1400 return -ENOSYS; 1401 } 1402 1403 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1404 { 1405 return -ENOSYS; 1406 } 1407 1408 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1409 { 1410 return -ENOSYS; 1411 } 1412 1413 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) 1414 { 1415 abort(); 1416 } 1417 1418 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1419 { 1420 return -ENOSYS; 1421 } 1422 #endif /* !KVM_CAP_IRQ_ROUTING */ 1423 1424 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1425 EventNotifier *rn, int virq) 1426 { 1427 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), 1428 rn ? 
event_notifier_get_fd(rn) : -1, virq, true); 1429 } 1430 1431 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1432 int virq) 1433 { 1434 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, 1435 false); 1436 } 1437 1438 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 1439 EventNotifier *rn, qemu_irq irq) 1440 { 1441 gpointer key, gsi; 1442 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1443 1444 if (!found) { 1445 return -ENXIO; 1446 } 1447 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 1448 } 1449 1450 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 1451 qemu_irq irq) 1452 { 1453 gpointer key, gsi; 1454 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1455 1456 if (!found) { 1457 return -ENXIO; 1458 } 1459 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 1460 } 1461 1462 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 1463 { 1464 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 1465 } 1466 1467 static void kvm_irqchip_create(MachineState *machine, KVMState *s) 1468 { 1469 int ret; 1470 1471 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 1472 ; 1473 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 1474 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 1475 if (ret < 0) { 1476 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 1477 exit(1); 1478 } 1479 } else { 1480 return; 1481 } 1482 1483 /* First probe and see if there's a arch-specific hook to create the 1484 * in-kernel irqchip for us */ 1485 ret = kvm_arch_irqchip_create(machine, s); 1486 if (ret == 0) { 1487 if (machine_kernel_irqchip_split(machine)) { 1488 perror("Split IRQ chip mode not supported."); 1489 exit(1); 1490 } else { 1491 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 1492 } 1493 } 1494 if (ret < 0) { 1495 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 1496 exit(1); 1497 } 1498 1499 kvm_kernel_irqchip = true; 1500 /* If we have an in-kernel IRQ chip then we must have asynchronous 1501 * interrupt delivery (though the reverse is not necessarily true) 1502 */ 1503 kvm_async_interrupts_allowed = true; 1504 kvm_halt_in_kernel_allowed = true; 1505 1506 kvm_init_irq_routing(s); 1507 1508 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 1509 } 1510 1511 /* Find number of supported CPUs using the recommended 1512 * procedure from the kernel API documentation to cope with 1513 * older kernels that may be missing capabilities. 1514 */ 1515 static int kvm_recommended_vcpus(KVMState *s) 1516 { 1517 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 1518 return (ret) ? ret : 4; 1519 } 1520 1521 static int kvm_max_vcpus(KVMState *s) 1522 { 1523 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 1524 return (ret) ? ret : kvm_recommended_vcpus(s); 1525 } 1526 1527 static int kvm_max_vcpu_id(KVMState *s) 1528 { 1529 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 1530 return (ret) ? 
ret : kvm_max_vcpus(s); 1531 } 1532 1533 bool kvm_vcpu_id_is_valid(int vcpu_id) 1534 { 1535 KVMState *s = KVM_STATE(current_machine->accelerator); 1536 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1537 } 1538 1539 static int kvm_init(MachineState *ms) 1540 { 1541 MachineClass *mc = MACHINE_GET_CLASS(ms); 1542 static const char upgrade_note[] = 1543 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1544 "(see http://sourceforge.net/projects/kvm).\n"; 1545 struct { 1546 const char *name; 1547 int num; 1548 } num_cpus[] = { 1549 { "SMP", ms->smp.cpus }, 1550 { "hotpluggable", ms->smp.max_cpus }, 1551 { NULL, } 1552 }, *nc = num_cpus; 1553 int soft_vcpus_limit, hard_vcpus_limit; 1554 KVMState *s; 1555 const KVMCapabilityInfo *missing_cap; 1556 int ret; 1557 int type = 0; 1558 const char *kvm_type; 1559 1560 s = KVM_STATE(ms->accelerator); 1561 1562 /* 1563 * On systems where the kernel can support different base page 1564 * sizes, host page size may be different from TARGET_PAGE_SIZE, 1565 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 1566 * page size for the system though. 1567 */ 1568 assert(TARGET_PAGE_SIZE <= getpagesize()); 1569 1570 s->sigmask_len = 8; 1571 1572 #ifdef KVM_CAP_SET_GUEST_DEBUG 1573 QTAILQ_INIT(&s->kvm_sw_breakpoints); 1574 #endif 1575 QLIST_INIT(&s->kvm_parked_vcpus); 1576 s->vmfd = -1; 1577 s->fd = qemu_open("/dev/kvm", O_RDWR); 1578 if (s->fd == -1) { 1579 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 1580 ret = -errno; 1581 goto err; 1582 } 1583 1584 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 1585 if (ret < KVM_API_VERSION) { 1586 if (ret >= 0) { 1587 ret = -EINVAL; 1588 } 1589 fprintf(stderr, "kvm version too old\n"); 1590 goto err; 1591 } 1592 1593 if (ret > KVM_API_VERSION) { 1594 ret = -EINVAL; 1595 fprintf(stderr, "kvm version not supported\n"); 1596 goto err; 1597 } 1598 1599 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 1600 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 1601 1602 /* If unspecified, use the default value */ 1603 if (!s->nr_slots) { 1604 s->nr_slots = 32; 1605 } 1606 1607 kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type"); 1608 if (mc->kvm_type) { 1609 type = mc->kvm_type(ms, kvm_type); 1610 } else if (kvm_type) { 1611 ret = -EINVAL; 1612 fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type); 1613 goto err; 1614 } 1615 1616 do { 1617 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 1618 } while (ret == -EINTR); 1619 1620 if (ret < 0) { 1621 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 1622 strerror(-ret)); 1623 1624 #ifdef TARGET_S390X 1625 if (ret == -EINVAL) { 1626 fprintf(stderr, 1627 "Host kernel setup problem detected. 
Please verify:\n"); 1628 fprintf(stderr, "- for kernels supporting the switch_amode or" 1629 " user_mode parameters, whether\n"); 1630 fprintf(stderr, 1631 " user space is running in primary address space\n"); 1632 fprintf(stderr, 1633 "- for kernels supporting the vm.allocate_pgste sysctl, " 1634 "whether it is enabled\n"); 1635 } 1636 #endif 1637 goto err; 1638 } 1639 1640 s->vmfd = ret; 1641 1642 /* check the vcpu limits */ 1643 soft_vcpus_limit = kvm_recommended_vcpus(s); 1644 hard_vcpus_limit = kvm_max_vcpus(s); 1645 1646 while (nc->name) { 1647 if (nc->num > soft_vcpus_limit) { 1648 warn_report("Number of %s cpus requested (%d) exceeds " 1649 "the recommended cpus supported by KVM (%d)", 1650 nc->name, nc->num, soft_vcpus_limit); 1651 1652 if (nc->num > hard_vcpus_limit) { 1653 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 1654 "the maximum cpus supported by KVM (%d)\n", 1655 nc->name, nc->num, hard_vcpus_limit); 1656 exit(1); 1657 } 1658 } 1659 nc++; 1660 } 1661 1662 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 1663 if (!missing_cap) { 1664 missing_cap = 1665 kvm_check_extension_list(s, kvm_arch_required_capabilities); 1666 } 1667 if (missing_cap) { 1668 ret = -EINVAL; 1669 fprintf(stderr, "kvm does not support %s\n%s", 1670 missing_cap->name, upgrade_note); 1671 goto err; 1672 } 1673 1674 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 1675 s->coalesced_pio = s->coalesced_mmio && 1676 kvm_check_extension(s, KVM_CAP_COALESCED_PIO); 1677 1678 #ifdef KVM_CAP_VCPU_EVENTS 1679 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 1680 #endif 1681 1682 s->robust_singlestep = 1683 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 1684 1685 #ifdef KVM_CAP_DEBUGREGS 1686 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 1687 #endif 1688 1689 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); 1690 1691 #ifdef KVM_CAP_IRQ_ROUTING 1692 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 1693 #endif 1694 1695 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 1696 1697 s->irq_set_ioctl = KVM_IRQ_LINE; 1698 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 1699 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 1700 } 1701 1702 kvm_readonly_mem_allowed = 1703 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 1704 1705 kvm_eventfds_allowed = 1706 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 1707 1708 kvm_irqfds_allowed = 1709 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 1710 1711 kvm_resamplefds_allowed = 1712 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 1713 1714 kvm_vm_attributes_allowed = 1715 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 1716 1717 kvm_ioeventfd_any_length_allowed = 1718 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 1719 1720 kvm_state = s; 1721 1722 /* 1723 * if memory encryption object is specified then initialize the memory 1724 * encryption context. 
1725 */ 1726 if (ms->memory_encryption) { 1727 kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); 1728 if (!kvm_state->memcrypt_handle) { 1729 ret = -1; 1730 goto err; 1731 } 1732 1733 kvm_state->memcrypt_encrypt_data = sev_encrypt_data; 1734 } 1735 1736 ret = kvm_arch_init(ms, s); 1737 if (ret < 0) { 1738 goto err; 1739 } 1740 1741 if (machine_kernel_irqchip_allowed(ms)) { 1742 kvm_irqchip_create(ms, s); 1743 } 1744 1745 if (kvm_eventfds_allowed) { 1746 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 1747 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 1748 } 1749 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; 1750 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; 1751 1752 kvm_memory_listener_register(s, &s->memory_listener, 1753 &address_space_memory, 0); 1754 memory_listener_register(&kvm_io_listener, 1755 &address_space_io); 1756 memory_listener_register(&kvm_coalesced_pio_listener, 1757 &address_space_io); 1758 1759 s->many_ioeventfds = kvm_check_many_ioeventfds(); 1760 1761 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 1762 if (!s->sync_mmu) { 1763 qemu_balloon_inhibit(true); 1764 } 1765 1766 return 0; 1767 1768 err: 1769 assert(ret < 0); 1770 if (s->vmfd >= 0) { 1771 close(s->vmfd); 1772 } 1773 if (s->fd != -1) { 1774 close(s->fd); 1775 } 1776 g_free(s->memory_listener.slots); 1777 1778 return ret; 1779 } 1780 1781 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 1782 { 1783 s->sigmask_len = sigmask_len; 1784 } 1785 1786 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 1787 int size, uint32_t count) 1788 { 1789 int i; 1790 uint8_t *ptr = data; 1791 1792 for (i = 0; i < count; i++) { 1793 address_space_rw(&address_space_io, port, attrs, 1794 ptr, size, 1795 direction == KVM_EXIT_IO_OUT); 1796 ptr += size; 1797 } 1798 } 1799 1800 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 1801 { 1802 fprintf(stderr, "KVM internal error. Suberror: %d\n", 1803 run->internal.suberror); 1804 1805 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 1806 int i; 1807 1808 for (i = 0; i < run->internal.ndata; ++i) { 1809 fprintf(stderr, "extra data[%d]: %"PRIx64"\n", 1810 i, (uint64_t)run->internal.data[i]); 1811 } 1812 } 1813 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 1814 fprintf(stderr, "emulation failure\n"); 1815 if (!kvm_arch_stop_on_emulation_error(cpu)) { 1816 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 1817 return EXCP_INTERRUPT; 1818 } 1819 } 1820 /* FIXME: Should trigger a qmp message to let management know 1821 * something went wrong. 
1822 */ 1823 return -1; 1824 } 1825 1826 void kvm_flush_coalesced_mmio_buffer(void) 1827 { 1828 KVMState *s = kvm_state; 1829 1830 if (s->coalesced_flush_in_progress) { 1831 return; 1832 } 1833 1834 s->coalesced_flush_in_progress = true; 1835 1836 if (s->coalesced_mmio_ring) { 1837 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 1838 while (ring->first != ring->last) { 1839 struct kvm_coalesced_mmio *ent; 1840 1841 ent = &ring->coalesced_mmio[ring->first]; 1842 1843 if (ent->pio == 1) { 1844 address_space_rw(&address_space_io, ent->phys_addr, 1845 MEMTXATTRS_UNSPECIFIED, ent->data, 1846 ent->len, true); 1847 } else { 1848 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 1849 } 1850 smp_wmb(); 1851 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 1852 } 1853 } 1854 1855 s->coalesced_flush_in_progress = false; 1856 } 1857 1858 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 1859 { 1860 if (!cpu->vcpu_dirty) { 1861 kvm_arch_get_registers(cpu); 1862 cpu->vcpu_dirty = true; 1863 } 1864 } 1865 1866 void kvm_cpu_synchronize_state(CPUState *cpu) 1867 { 1868 if (!cpu->vcpu_dirty) { 1869 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 1870 } 1871 } 1872 1873 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 1874 { 1875 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 1876 cpu->vcpu_dirty = false; 1877 } 1878 1879 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 1880 { 1881 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 1882 } 1883 1884 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 1885 { 1886 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 1887 cpu->vcpu_dirty = false; 1888 } 1889 1890 void kvm_cpu_synchronize_post_init(CPUState *cpu) 1891 { 1892 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 1893 } 1894 1895 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 1896 { 1897 cpu->vcpu_dirty = true; 1898 } 1899 1900 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 1901 { 1902 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 1903 } 1904 1905 #ifdef KVM_HAVE_MCE_INJECTION 1906 static __thread void *pending_sigbus_addr; 1907 static __thread int pending_sigbus_code; 1908 static __thread bool have_sigbus_pending; 1909 #endif 1910 1911 static void kvm_cpu_kick(CPUState *cpu) 1912 { 1913 atomic_set(&cpu->kvm_run->immediate_exit, 1); 1914 } 1915 1916 static void kvm_cpu_kick_self(void) 1917 { 1918 if (kvm_immediate_exit) { 1919 kvm_cpu_kick(current_cpu); 1920 } else { 1921 qemu_cpu_kick_self(); 1922 } 1923 } 1924 1925 static void kvm_eat_signals(CPUState *cpu) 1926 { 1927 struct timespec ts = { 0, 0 }; 1928 siginfo_t siginfo; 1929 sigset_t waitset; 1930 sigset_t chkset; 1931 int r; 1932 1933 if (kvm_immediate_exit) { 1934 atomic_set(&cpu->kvm_run->immediate_exit, 0); 1935 /* Write kvm_run->immediate_exit before the cpu->exit_request 1936 * write in kvm_cpu_exec. 
1937 */ 1938 smp_wmb(); 1939 return; 1940 } 1941 1942 sigemptyset(&waitset); 1943 sigaddset(&waitset, SIG_IPI); 1944 1945 do { 1946 r = sigtimedwait(&waitset, &siginfo, &ts); 1947 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 1948 perror("sigtimedwait"); 1949 exit(1); 1950 } 1951 1952 r = sigpending(&chkset); 1953 if (r == -1) { 1954 perror("sigpending"); 1955 exit(1); 1956 } 1957 } while (sigismember(&chkset, SIG_IPI)); 1958 } 1959 1960 int kvm_cpu_exec(CPUState *cpu) 1961 { 1962 struct kvm_run *run = cpu->kvm_run; 1963 int ret, run_ret; 1964 1965 DPRINTF("kvm_cpu_exec()\n"); 1966 1967 if (kvm_arch_process_async_events(cpu)) { 1968 atomic_set(&cpu->exit_request, 0); 1969 return EXCP_HLT; 1970 } 1971 1972 qemu_mutex_unlock_iothread(); 1973 cpu_exec_start(cpu); 1974 1975 do { 1976 MemTxAttrs attrs; 1977 1978 if (cpu->vcpu_dirty) { 1979 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 1980 cpu->vcpu_dirty = false; 1981 } 1982 1983 kvm_arch_pre_run(cpu, run); 1984 if (atomic_read(&cpu->exit_request)) { 1985 DPRINTF("interrupt exit requested\n"); 1986 /* 1987 * KVM requires us to reenter the kernel after IO exits to complete 1988 * instruction emulation. This self-signal will ensure that we 1989 * leave ASAP again. 1990 */ 1991 kvm_cpu_kick_self(); 1992 } 1993 1994 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 1995 * Matching barrier in kvm_eat_signals. 1996 */ 1997 smp_rmb(); 1998 1999 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 2000 2001 attrs = kvm_arch_post_run(cpu, run); 2002 2003 #ifdef KVM_HAVE_MCE_INJECTION 2004 if (unlikely(have_sigbus_pending)) { 2005 qemu_mutex_lock_iothread(); 2006 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 2007 pending_sigbus_addr); 2008 have_sigbus_pending = false; 2009 qemu_mutex_unlock_iothread(); 2010 } 2011 #endif 2012 2013 if (run_ret < 0) { 2014 if (run_ret == -EINTR || run_ret == -EAGAIN) { 2015 DPRINTF("io window exit\n"); 2016 kvm_eat_signals(cpu); 2017 ret = EXCP_INTERRUPT; 2018 break; 2019 } 2020 fprintf(stderr, "error: kvm run failed %s\n", 2021 strerror(-run_ret)); 2022 #ifdef TARGET_PPC 2023 if (run_ret == -EBUSY) { 2024 fprintf(stderr, 2025 "This is probably because your SMT is enabled.\n" 2026 "VCPU can only run on primary threads with all " 2027 "secondary threads offline.\n"); 2028 } 2029 #endif 2030 ret = -1; 2031 break; 2032 } 2033 2034 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 2035 switch (run->exit_reason) { 2036 case KVM_EXIT_IO: 2037 DPRINTF("handle_io\n"); 2038 /* Called outside BQL */ 2039 kvm_handle_io(run->io.port, attrs, 2040 (uint8_t *)run + run->io.data_offset, 2041 run->io.direction, 2042 run->io.size, 2043 run->io.count); 2044 ret = 0; 2045 break; 2046 case KVM_EXIT_MMIO: 2047 DPRINTF("handle_mmio\n"); 2048 /* Called outside BQL */ 2049 address_space_rw(&address_space_memory, 2050 run->mmio.phys_addr, attrs, 2051 run->mmio.data, 2052 run->mmio.len, 2053 run->mmio.is_write); 2054 ret = 0; 2055 break; 2056 case KVM_EXIT_IRQ_WINDOW_OPEN: 2057 DPRINTF("irq_window_open\n"); 2058 ret = EXCP_INTERRUPT; 2059 break; 2060 case KVM_EXIT_SHUTDOWN: 2061 DPRINTF("shutdown\n"); 2062 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2063 ret = EXCP_INTERRUPT; 2064 break; 2065 case KVM_EXIT_UNKNOWN: 2066 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 2067 (uint64_t)run->hw.hardware_exit_reason); 2068 ret = -1; 2069 break; 2070 case KVM_EXIT_INTERNAL_ERROR: 2071 ret = kvm_handle_internal_error(cpu, run); 2072 break; 2073 case KVM_EXIT_SYSTEM_EVENT: 2074 switch 
(run->system_event.type) { 2075 case KVM_SYSTEM_EVENT_SHUTDOWN: 2076 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 2077 ret = EXCP_INTERRUPT; 2078 break; 2079 case KVM_SYSTEM_EVENT_RESET: 2080 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2081 ret = EXCP_INTERRUPT; 2082 break; 2083 case KVM_SYSTEM_EVENT_CRASH: 2084 kvm_cpu_synchronize_state(cpu); 2085 qemu_mutex_lock_iothread(); 2086 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2087 qemu_mutex_unlock_iothread(); 2088 ret = 0; 2089 break; 2090 default: 2091 DPRINTF("kvm_arch_handle_exit\n"); 2092 ret = kvm_arch_handle_exit(cpu, run); 2093 break; 2094 } 2095 break; 2096 default: 2097 DPRINTF("kvm_arch_handle_exit\n"); 2098 ret = kvm_arch_handle_exit(cpu, run); 2099 break; 2100 } 2101 } while (ret == 0); 2102 2103 cpu_exec_end(cpu); 2104 qemu_mutex_lock_iothread(); 2105 2106 if (ret < 0) { 2107 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2108 vm_stop(RUN_STATE_INTERNAL_ERROR); 2109 } 2110 2111 atomic_set(&cpu->exit_request, 0); 2112 return ret; 2113 } 2114 2115 int kvm_ioctl(KVMState *s, int type, ...) 2116 { 2117 int ret; 2118 void *arg; 2119 va_list ap; 2120 2121 va_start(ap, type); 2122 arg = va_arg(ap, void *); 2123 va_end(ap); 2124 2125 trace_kvm_ioctl(type, arg); 2126 ret = ioctl(s->fd, type, arg); 2127 if (ret == -1) { 2128 ret = -errno; 2129 } 2130 return ret; 2131 } 2132 2133 int kvm_vm_ioctl(KVMState *s, int type, ...) 2134 { 2135 int ret; 2136 void *arg; 2137 va_list ap; 2138 2139 va_start(ap, type); 2140 arg = va_arg(ap, void *); 2141 va_end(ap); 2142 2143 trace_kvm_vm_ioctl(type, arg); 2144 ret = ioctl(s->vmfd, type, arg); 2145 if (ret == -1) { 2146 ret = -errno; 2147 } 2148 return ret; 2149 } 2150 2151 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 2152 { 2153 int ret; 2154 void *arg; 2155 va_list ap; 2156 2157 va_start(ap, type); 2158 arg = va_arg(ap, void *); 2159 va_end(ap); 2160 2161 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 2162 ret = ioctl(cpu->kvm_fd, type, arg); 2163 if (ret == -1) { 2164 ret = -errno; 2165 } 2166 return ret; 2167 } 2168 2169 int kvm_device_ioctl(int fd, int type, ...) 2170 { 2171 int ret; 2172 void *arg; 2173 va_list ap; 2174 2175 va_start(ap, type); 2176 arg = va_arg(ap, void *); 2177 va_end(ap); 2178 2179 trace_kvm_device_ioctl(fd, type, arg); 2180 ret = ioctl(fd, type, arg); 2181 if (ret == -1) { 2182 ret = -errno; 2183 } 2184 return ret; 2185 } 2186 2187 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 2188 { 2189 int ret; 2190 struct kvm_device_attr attribute = { 2191 .group = group, 2192 .attr = attr, 2193 }; 2194 2195 if (!kvm_vm_attributes_allowed) { 2196 return 0; 2197 } 2198 2199 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 2200 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 2201 return ret ? 0 : 1; 2202 } 2203 2204 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 2205 { 2206 struct kvm_device_attr attribute = { 2207 .group = group, 2208 .attr = attr, 2209 .flags = 0, 2210 }; 2211 2212 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1; 2213 } 2214 2215 int kvm_device_access(int fd, int group, uint64_t attr, 2216 void *val, bool write, Error **errp) 2217 { 2218 struct kvm_device_attr kvmattr; 2219 int err; 2220 2221 kvmattr.flags = 0; 2222 kvmattr.group = group; 2223 kvmattr.attr = attr; 2224 kvmattr.addr = (uintptr_t)val; 2225 2226 err = kvm_device_ioctl(fd, 2227 write ? 
int kvm_device_access(int fd, int group, uint64_t attr,
                      void *val, bool write, Error **errp)
{
    struct kvm_device_attr kvmattr;
    int err;

    kvmattr.flags = 0;
    kvmattr.group = group;
    kvmattr.attr = attr;
    kvmattr.addr = (uintptr_t)val;

    err = kvm_device_ioctl(fd,
                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                           &kvmattr);
    if (err < 0) {
        error_setg_errno(errp, -err,
                         "KVM_%s_DEVICE_ATTR failed: Group %d "
                         "attr 0x%016" PRIx64,
                         write ? "SET" : "GET", group, attr);
    }
    return err;
}

bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_max_nested_state_length(void)
{
    return kvm_state->max_nested_state_len;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    int err;
};

static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *dbg_data =
        (struct kvm_set_guest_debug_data *) data.host_ptr;

    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
               RUN_ON_CPU_HOST_PTR(&data));
    return data.err;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
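/*
 * Usage sketch (illustrative only, compiled out below): the gdbstub drives
 * these entry points.  Software breakpoints at the same guest PC are
 * reference counted, so a second insert only bumps use_count and the
 * matching remove drops it again.  The helper name and the fixed length
 * argument are assumptions for illustration; the pc value is whatever the
 * debugger asked for.
 */
#ifdef KVM_BREAKPOINT_USAGE_SKETCH /* never defined; example only */
static void kvm_sketch_toggle_sw_breakpoint(CPUState *cpu, target_ulong pc)
{
    if (kvm_insert_breakpoint(cpu, pc, 1, GDB_BREAKPOINT_SW) == 0) {
        /* ... let the guest run and report the breakpoint hit ... */
        kvm_remove_breakpoint(cpu, pc, 1, GDB_BREAKPOINT_SW);
    }
}
#endif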
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}
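/*
 * Usage sketch (illustrative only, compiled out below): the per-vCPU thread
 * (in cpus.c) is the intended caller of kvm_init_cpu_signals() and
 * kvm_cpu_exec().  The loop below is a simplified assumption of that flow,
 * not the real thread function: it installs the signal state once and then
 * re-enters KVM_RUN until the vCPU halts or an error is reported.
 */
#ifdef KVM_VCPU_THREAD_SKETCH /* never defined; example only */
static void kvm_sketch_vcpu_loop(CPUState *cpu)
{
    int r;

    kvm_init_cpu_signals(cpu);

    do {
        r = kvm_cpu_exec(cpu);      /* returns EXCP_* or a negative error */
    } while (r != EXCP_HLT && r >= 0);
}
#endif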
/* Called asynchronously in VCPU thread. */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
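/*
 * Usage sketch (illustrative only, compiled out below): the in-kernel
 * device helpers above are usually combined as "probe, create, configure".
 * The helper name is an assumption for illustration; a real caller passes a
 * KVM_DEV_TYPE_* constant for dev_type and device-specific group/attr
 * values, and would normally report errors through a real errp instead of
 * ignoring them with NULL.
 */
#ifdef KVM_DEVICE_CREATE_USAGE_SKETCH /* never defined; example only */
static int kvm_sketch_create_and_configure(KVMState *s, uint64_t dev_type,
                                           uint32_t group, uint64_t attr,
                                           uint64_t *val)
{
    int fd;

    /* First ask the kernel whether this device type can be created at all. */
    if (!kvm_device_supported(s->vmfd, dev_type)) {
        return -ENOTSUP;
    }

    /* test=false actually creates the device and returns its fd. */
    fd = kvm_create_device(s, dev_type, false);
    if (fd < 0) {
        return fd;
    }

    /* write=true pushes *val into the attribute; errors are ignored here. */
    return kvm_device_access(fd, group, attr, val, true, NULL);
}
#endif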