1 /* 2 * QEMU KVM support 3 * 4 * Copyright IBM, Corp. 2008 5 * Red Hat, Inc. 2008 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Glauber Costa <gcosta@redhat.com> 10 * 11 * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 * See the COPYING file in the top-level directory. 13 * 14 */ 15 16 #include "qemu/osdep.h" 17 #include <sys/ioctl.h> 18 19 #include <linux/kvm.h> 20 21 #include "qemu-common.h" 22 #include "qemu/atomic.h" 23 #include "qemu/option.h" 24 #include "qemu/config-file.h" 25 #include "qemu/error-report.h" 26 #include "qapi/error.h" 27 #include "hw/hw.h" 28 #include "hw/pci/msi.h" 29 #include "hw/pci/msix.h" 30 #include "hw/s390x/adapter.h" 31 #include "exec/gdbstub.h" 32 #include "sysemu/kvm_int.h" 33 #include "sysemu/cpus.h" 34 #include "qemu/bswap.h" 35 #include "exec/memory.h" 36 #include "exec/ram_addr.h" 37 #include "exec/address-spaces.h" 38 #include "qemu/event_notifier.h" 39 #include "trace.h" 40 #include "hw/irq.h" 41 #include "sysemu/sev.h" 42 43 #include "hw/boards.h" 44 45 /* This check must be after config-host.h is included */ 46 #ifdef CONFIG_EVENTFD 47 #include <sys/eventfd.h> 48 #endif 49 50 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We 51 * need to use the real host PAGE_SIZE, as that's what KVM will use. 52 */ 53 #define PAGE_SIZE getpagesize() 54 55 //#define DEBUG_KVM 56 57 #ifdef DEBUG_KVM 58 #define DPRINTF(fmt, ...) \ 59 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) 60 #else 61 #define DPRINTF(fmt, ...) \ 62 do { } while (0) 63 #endif 64 65 #define KVM_MSI_HASHTAB_SIZE 256 66 67 struct KVMParkedVcpu { 68 unsigned long vcpu_id; 69 int kvm_fd; 70 QLIST_ENTRY(KVMParkedVcpu) node; 71 }; 72 73 struct KVMState 74 { 75 AccelState parent_obj; 76 77 int nr_slots; 78 int fd; 79 int vmfd; 80 int coalesced_mmio; 81 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 82 bool coalesced_flush_in_progress; 83 int vcpu_events; 84 int robust_singlestep; 85 int debugregs; 86 #ifdef KVM_CAP_SET_GUEST_DEBUG 87 struct kvm_sw_breakpoint_head kvm_sw_breakpoints; 88 #endif 89 int many_ioeventfds; 90 int intx_set_mask; 91 bool sync_mmu; 92 /* The man page (and posix) say ioctl numbers are signed int, but 93 * they're not. 
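 * (The _IOC() macro packs the direction bits into the top of the 32-bit
 * number, so ioctls that transfer data generally end up above INT_MAX.)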
Linux, glibc and *BSD all treat ioctl numbers as 94 * unsigned, and treating them as signed here can break things */ 95 unsigned irq_set_ioctl; 96 unsigned int sigmask_len; 97 GHashTable *gsimap; 98 #ifdef KVM_CAP_IRQ_ROUTING 99 struct kvm_irq_routing *irq_routes; 100 int nr_allocated_irq_routes; 101 unsigned long *used_gsi_bitmap; 102 unsigned int gsi_count; 103 QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; 104 #endif 105 KVMMemoryListener memory_listener; 106 QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; 107 108 /* memory encryption */ 109 void *memcrypt_handle; 110 int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); 111 }; 112 113 KVMState *kvm_state; 114 bool kvm_kernel_irqchip; 115 bool kvm_split_irqchip; 116 bool kvm_async_interrupts_allowed; 117 bool kvm_halt_in_kernel_allowed; 118 bool kvm_eventfds_allowed; 119 bool kvm_irqfds_allowed; 120 bool kvm_resamplefds_allowed; 121 bool kvm_msi_via_irqfd_allowed; 122 bool kvm_gsi_routing_allowed; 123 bool kvm_gsi_direct_mapping; 124 bool kvm_allowed; 125 bool kvm_readonly_mem_allowed; 126 bool kvm_vm_attributes_allowed; 127 bool kvm_direct_msi_allowed; 128 bool kvm_ioeventfd_any_length_allowed; 129 bool kvm_msi_use_devid; 130 static bool kvm_immediate_exit; 131 132 static const KVMCapabilityInfo kvm_required_capabilites[] = { 133 KVM_CAP_INFO(USER_MEMORY), 134 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), 135 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), 136 KVM_CAP_LAST_INFO 137 }; 138 139 int kvm_get_max_memslots(void) 140 { 141 KVMState *s = KVM_STATE(current_machine->accelerator); 142 143 return s->nr_slots; 144 } 145 146 bool kvm_memcrypt_enabled(void) 147 { 148 if (kvm_state && kvm_state->memcrypt_handle) { 149 return true; 150 } 151 152 return false; 153 } 154 155 int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) 156 { 157 if (kvm_state->memcrypt_handle && 158 kvm_state->memcrypt_encrypt_data) { 159 return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, 160 ptr, len); 161 } 162 163 return 1; 164 } 165 166 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 167 { 168 KVMState *s = kvm_state; 169 int i; 170 171 for (i = 0; i < s->nr_slots; i++) { 172 if (kml->slots[i].memory_size == 0) { 173 return &kml->slots[i]; 174 } 175 } 176 177 return NULL; 178 } 179 180 bool kvm_has_free_slot(MachineState *ms) 181 { 182 KVMState *s = KVM_STATE(ms->accelerator); 183 184 return kvm_get_free_slot(&s->memory_listener); 185 } 186 187 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 188 { 189 KVMSlot *slot = kvm_get_free_slot(kml); 190 191 if (slot) { 192 return slot; 193 } 194 195 fprintf(stderr, "%s: no free slot available\n", __func__); 196 abort(); 197 } 198 199 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 200 hwaddr start_addr, 201 hwaddr size) 202 { 203 KVMState *s = kvm_state; 204 int i; 205 206 for (i = 0; i < s->nr_slots; i++) { 207 KVMSlot *mem = &kml->slots[i]; 208 209 if (start_addr == mem->start_addr && size == mem->memory_size) { 210 return mem; 211 } 212 } 213 214 return NULL; 215 } 216 217 /* 218 * Calculate and align the start address and the size of the section. 219 * Return the size. If the size is 0, the aligned section is empty. 220 */ 221 static hwaddr kvm_align_section(MemoryRegionSection *section, 222 hwaddr *start) 223 { 224 hwaddr size = int128_get64(section->size); 225 hwaddr delta, aligned; 226 227 /* kvm works in page size chunks, but the function may be called 228 with sub-page size and unaligned start address. 
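 * (For example, with 4 KiB host pages a section starting at 0x1001 with
 * size 0x3000 is adjusted to start 0x2000, size 0x2000.)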
Pad the start 229 address to next and truncate size to previous page boundary. */ 230 aligned = ROUND_UP(section->offset_within_address_space, 231 qemu_real_host_page_size); 232 delta = aligned - section->offset_within_address_space; 233 *start = aligned; 234 if (delta > size) { 235 return 0; 236 } 237 238 return (size - delta) & qemu_real_host_page_mask; 239 } 240 241 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 242 hwaddr *phys_addr) 243 { 244 KVMMemoryListener *kml = &s->memory_listener; 245 int i; 246 247 for (i = 0; i < s->nr_slots; i++) { 248 KVMSlot *mem = &kml->slots[i]; 249 250 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 251 *phys_addr = mem->start_addr + (ram - mem->ram); 252 return 1; 253 } 254 } 255 256 return 0; 257 } 258 259 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) 260 { 261 KVMState *s = kvm_state; 262 struct kvm_userspace_memory_region mem; 263 int ret; 264 265 mem.slot = slot->slot | (kml->as_id << 16); 266 mem.guest_phys_addr = slot->start_addr; 267 mem.userspace_addr = (unsigned long)slot->ram; 268 mem.flags = slot->flags; 269 270 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { 271 /* Set the slot size to 0 before setting the slot to the desired 272 * value. This is needed based on KVM commit 75d61fbc. */ 273 mem.memory_size = 0; 274 kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 275 } 276 mem.memory_size = slot->memory_size; 277 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 278 slot->old_flags = mem.flags; 279 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 280 mem.memory_size, mem.userspace_addr, ret); 281 return ret; 282 } 283 284 int kvm_destroy_vcpu(CPUState *cpu) 285 { 286 KVMState *s = kvm_state; 287 long mmap_size; 288 struct KVMParkedVcpu *vcpu = NULL; 289 int ret = 0; 290 291 DPRINTF("kvm_destroy_vcpu\n"); 292 293 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 294 if (mmap_size < 0) { 295 ret = mmap_size; 296 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 297 goto err; 298 } 299 300 ret = munmap(cpu->kvm_run, mmap_size); 301 if (ret < 0) { 302 goto err; 303 } 304 305 vcpu = g_malloc0(sizeof(*vcpu)); 306 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 307 vcpu->kvm_fd = cpu->kvm_fd; 308 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 309 err: 310 return ret; 311 } 312 313 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 314 { 315 struct KVMParkedVcpu *cpu; 316 317 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 318 if (cpu->vcpu_id == vcpu_id) { 319 int kvm_fd; 320 321 QLIST_REMOVE(cpu, node); 322 kvm_fd = cpu->kvm_fd; 323 g_free(cpu); 324 return kvm_fd; 325 } 326 } 327 328 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 329 } 330 331 int kvm_init_vcpu(CPUState *cpu) 332 { 333 KVMState *s = kvm_state; 334 long mmap_size; 335 int ret; 336 337 DPRINTF("kvm_init_vcpu\n"); 338 339 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 340 if (ret < 0) { 341 DPRINTF("kvm_create_vcpu failed\n"); 342 goto err; 343 } 344 345 cpu->kvm_fd = ret; 346 cpu->kvm_state = s; 347 cpu->vcpu_dirty = true; 348 349 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 350 if (mmap_size < 0) { 351 ret = mmap_size; 352 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 353 goto err; 354 } 355 356 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 357 cpu->kvm_fd, 0); 358 if (cpu->kvm_run == MAP_FAILED) { 359 ret = -errno; 360 DPRINTF("mmap'ing vcpu state failed\n"); 361 goto err; 362 } 363 364 if 
(s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(cpu);
err:
    return ret;
}

/*
 * dirty pages logging control
 */

static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    return flags;
}

static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(kml, mem, false);
}

static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size;
    KVMSlot *mem;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    mem = kvm_lookup_matching_slot(kml, start_addr, size);
    if (!mem) {
        /* We don't have a slot if we want to trap every access. */
        return 0;
    }

    return kvm_slot_update_flags(kml, mem, section->mr);
}

static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                         MemoryRegionSection *section,
                         int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
                                         unsigned long *bitmap)
{
    ram_addr_t start = section->offset_within_region +
                       memory_region_get_ram_addr(section->mr);
    ram_addr_t pages = int128_get64(section->size) / getpagesize();

    cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 *
 * This function fetches the dirty bitmap that the kernel maintains for
 * the slot covering @section and folds it into qemu's dirty bitmap via
 * kvm_get_dirty_pages_log_range().
 *
 * @kml: the KVM memory listener that owns the slot
 * @section: the logged memory region section
 */
static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                          MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    struct kvm_dirty_log d = {};
    KVMSlot *mem;
    hwaddr start_addr, size;

    size = kvm_align_section(section, &start_addr);
    if (size) {
        mem = kvm_lookup_matching_slot(kml, start_addr, size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            return 0;
        }

        /* XXX bad kernel interface alert
         * For the dirty bitmap, the kernel allocates an array of a size
         * aligned to bits-per-long.  But when the kernel is 64-bit and
         * userspace is 32-bit, userspace cannot align to the same
         * bits-per-long, since sizeof(long) differs between kernel and
         * user space.  Userspace may therefore provide a buffer that is
         * 4 bytes smaller than what the kernel will use, resulting in
         * userspace memory corruption (which valgrind usually cannot
         * detect either).
         * So for now, align to 64 instead of HOST_LONG_BITS here, in
         * the hope that sizeof(long) won't become >8 any time soon.
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
        d.dirty_bitmap = g_malloc0(size);

        d.slot = mem->slot | (kml->as_id << 16);
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            g_free(d.dirty_bitmap);
            return -1;
        }

        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
        g_free(d.dirty_bitmap);
    }

    return 0;
}

static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *section,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *section,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}

static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even when
     * running on KVM on a ppc64le host.  Correct here.
     */
    switch (size) {
    case 2:
        val = bswap16(val);
        break;
    case 4:
        val = bswap32(val);
        break;
    }
#endif
    return val;
}

static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    int ret;
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ?
adjust_ioeventfd_endianness(val, size) : 0, 613 .addr = addr, 614 .len = size, 615 .flags = 0, 616 .fd = fd, 617 }; 618 619 if (!kvm_enabled()) { 620 return -ENOSYS; 621 } 622 623 if (datamatch) { 624 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 625 } 626 if (!assign) { 627 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 628 } 629 630 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 631 632 if (ret < 0) { 633 return -errno; 634 } 635 636 return 0; 637 } 638 639 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 640 bool assign, uint32_t size, bool datamatch) 641 { 642 struct kvm_ioeventfd kick = { 643 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 644 .addr = addr, 645 .flags = KVM_IOEVENTFD_FLAG_PIO, 646 .len = size, 647 .fd = fd, 648 }; 649 int r; 650 if (!kvm_enabled()) { 651 return -ENOSYS; 652 } 653 if (datamatch) { 654 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 655 } 656 if (!assign) { 657 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 658 } 659 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 660 if (r < 0) { 661 return r; 662 } 663 return 0; 664 } 665 666 667 static int kvm_check_many_ioeventfds(void) 668 { 669 /* Userspace can use ioeventfd for io notification. This requires a host 670 * that supports eventfd(2) and an I/O thread; since eventfd does not 671 * support SIGIO it cannot interrupt the vcpu. 672 * 673 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 674 * can avoid creating too many ioeventfds. 675 */ 676 #if defined(CONFIG_EVENTFD) 677 int ioeventfds[7]; 678 int i, ret = 0; 679 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 680 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 681 if (ioeventfds[i] < 0) { 682 break; 683 } 684 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 685 if (ret < 0) { 686 close(ioeventfds[i]); 687 break; 688 } 689 } 690 691 /* Decide whether many devices are supported or not */ 692 ret = i == ARRAY_SIZE(ioeventfds); 693 694 while (i-- > 0) { 695 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 696 close(ioeventfds[i]); 697 } 698 return ret; 699 #else 700 return 0; 701 #endif 702 } 703 704 static const KVMCapabilityInfo * 705 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 706 { 707 while (list->name) { 708 if (!kvm_check_extension(s, list->value)) { 709 return list; 710 } 711 list++; 712 } 713 return NULL; 714 } 715 716 static void kvm_set_phys_mem(KVMMemoryListener *kml, 717 MemoryRegionSection *section, bool add) 718 { 719 KVMSlot *mem; 720 int err; 721 MemoryRegion *mr = section->mr; 722 bool writeable = !mr->readonly && !mr->rom_device; 723 hwaddr start_addr, size; 724 void *ram; 725 726 if (!memory_region_is_ram(mr)) { 727 if (writeable || !kvm_readonly_mem_allowed) { 728 return; 729 } else if (!mr->romd_mode) { 730 /* If the memory device is not in romd_mode, then we actually want 731 * to remove the kvm memory slot so all accesses will trap. 
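 * Without a backing slot, KVM exits to userspace with KVM_EXIT_MMIO for
 * every guest access to the region, which is what the device emulation
 * needs here.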
*/ 732 add = false; 733 } 734 } 735 736 size = kvm_align_section(section, &start_addr); 737 if (!size) { 738 return; 739 } 740 741 /* use aligned delta to align the ram address */ 742 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 743 (start_addr - section->offset_within_address_space); 744 745 if (!add) { 746 mem = kvm_lookup_matching_slot(kml, start_addr, size); 747 if (!mem) { 748 return; 749 } 750 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 751 kvm_physical_sync_dirty_bitmap(kml, section); 752 } 753 754 /* unregister the slot */ 755 mem->memory_size = 0; 756 mem->flags = 0; 757 err = kvm_set_user_memory_region(kml, mem, false); 758 if (err) { 759 fprintf(stderr, "%s: error unregistering slot: %s\n", 760 __func__, strerror(-err)); 761 abort(); 762 } 763 return; 764 } 765 766 /* register the new slot */ 767 mem = kvm_alloc_slot(kml); 768 mem->memory_size = size; 769 mem->start_addr = start_addr; 770 mem->ram = ram; 771 mem->flags = kvm_mem_flags(mr); 772 773 err = kvm_set_user_memory_region(kml, mem, true); 774 if (err) { 775 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 776 strerror(-err)); 777 abort(); 778 } 779 } 780 781 static void kvm_region_add(MemoryListener *listener, 782 MemoryRegionSection *section) 783 { 784 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 785 786 memory_region_ref(section->mr); 787 kvm_set_phys_mem(kml, section, true); 788 } 789 790 static void kvm_region_del(MemoryListener *listener, 791 MemoryRegionSection *section) 792 { 793 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 794 795 kvm_set_phys_mem(kml, section, false); 796 memory_region_unref(section->mr); 797 } 798 799 static void kvm_log_sync(MemoryListener *listener, 800 MemoryRegionSection *section) 801 { 802 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 803 int r; 804 805 r = kvm_physical_sync_dirty_bitmap(kml, section); 806 if (r < 0) { 807 abort(); 808 } 809 } 810 811 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 812 MemoryRegionSection *section, 813 bool match_data, uint64_t data, 814 EventNotifier *e) 815 { 816 int fd = event_notifier_get_fd(e); 817 int r; 818 819 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 820 data, true, int128_get64(section->size), 821 match_data); 822 if (r < 0) { 823 fprintf(stderr, "%s: error adding ioeventfd: %s\n", 824 __func__, strerror(-r)); 825 abort(); 826 } 827 } 828 829 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 830 MemoryRegionSection *section, 831 bool match_data, uint64_t data, 832 EventNotifier *e) 833 { 834 int fd = event_notifier_get_fd(e); 835 int r; 836 837 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 838 data, false, int128_get64(section->size), 839 match_data); 840 if (r < 0) { 841 abort(); 842 } 843 } 844 845 static void kvm_io_ioeventfd_add(MemoryListener *listener, 846 MemoryRegionSection *section, 847 bool match_data, uint64_t data, 848 EventNotifier *e) 849 { 850 int fd = event_notifier_get_fd(e); 851 int r; 852 853 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 854 data, true, int128_get64(section->size), 855 match_data); 856 if (r < 0) { 857 fprintf(stderr, "%s: error adding ioeventfd: %s\n", 858 __func__, strerror(-r)); 859 abort(); 860 } 861 } 862 863 static void kvm_io_ioeventfd_del(MemoryListener *listener, 864 MemoryRegionSection *section, 865 bool match_data, uint64_t data, 866 EventNotifier *e) 867 868 { 869 int fd = 
event_notifier_get_fd(e); 870 int r; 871 872 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 873 data, false, int128_get64(section->size), 874 match_data); 875 if (r < 0) { 876 abort(); 877 } 878 } 879 880 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 881 AddressSpace *as, int as_id) 882 { 883 int i; 884 885 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 886 kml->as_id = as_id; 887 888 for (i = 0; i < s->nr_slots; i++) { 889 kml->slots[i].slot = i; 890 } 891 892 kml->listener.region_add = kvm_region_add; 893 kml->listener.region_del = kvm_region_del; 894 kml->listener.log_start = kvm_log_start; 895 kml->listener.log_stop = kvm_log_stop; 896 kml->listener.log_sync = kvm_log_sync; 897 kml->listener.priority = 10; 898 899 memory_listener_register(&kml->listener, as); 900 } 901 902 static MemoryListener kvm_io_listener = { 903 .eventfd_add = kvm_io_ioeventfd_add, 904 .eventfd_del = kvm_io_ioeventfd_del, 905 .priority = 10, 906 }; 907 908 int kvm_set_irq(KVMState *s, int irq, int level) 909 { 910 struct kvm_irq_level event; 911 int ret; 912 913 assert(kvm_async_interrupts_enabled()); 914 915 event.level = level; 916 event.irq = irq; 917 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 918 if (ret < 0) { 919 perror("kvm_set_irq"); 920 abort(); 921 } 922 923 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status; 924 } 925 926 #ifdef KVM_CAP_IRQ_ROUTING 927 typedef struct KVMMSIRoute { 928 struct kvm_irq_routing_entry kroute; 929 QTAILQ_ENTRY(KVMMSIRoute) entry; 930 } KVMMSIRoute; 931 932 static void set_gsi(KVMState *s, unsigned int gsi) 933 { 934 set_bit(gsi, s->used_gsi_bitmap); 935 } 936 937 static void clear_gsi(KVMState *s, unsigned int gsi) 938 { 939 clear_bit(gsi, s->used_gsi_bitmap); 940 } 941 942 void kvm_init_irq_routing(KVMState *s) 943 { 944 int gsi_count, i; 945 946 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 947 if (gsi_count > 0) { 948 /* Round up so we can search ints using ffs */ 949 s->used_gsi_bitmap = bitmap_new(gsi_count); 950 s->gsi_count = gsi_count; 951 } 952 953 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 954 s->nr_allocated_irq_routes = 0; 955 956 if (!kvm_direct_msi_allowed) { 957 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 958 QTAILQ_INIT(&s->msi_hashtab[i]); 959 } 960 } 961 962 kvm_arch_init_irq_routing(s); 963 } 964 965 void kvm_irqchip_commit_routes(KVMState *s) 966 { 967 int ret; 968 969 if (kvm_gsi_direct_mapping()) { 970 return; 971 } 972 973 if (!kvm_gsi_routing_enabled()) { 974 return; 975 } 976 977 s->irq_routes->flags = 0; 978 trace_kvm_irqchip_commit_routes(); 979 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 980 assert(ret == 0); 981 } 982 983 static void kvm_add_routing_entry(KVMState *s, 984 struct kvm_irq_routing_entry *entry) 985 { 986 struct kvm_irq_routing_entry *new; 987 int n, size; 988 989 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 990 n = s->nr_allocated_irq_routes * 2; 991 if (n < 64) { 992 n = 64; 993 } 994 size = sizeof(struct kvm_irq_routing); 995 size += n * sizeof(*new); 996 s->irq_routes = g_realloc(s->irq_routes, size); 997 s->nr_allocated_irq_routes = n; 998 } 999 n = s->irq_routes->nr++; 1000 new = &s->irq_routes->entries[n]; 1001 1002 *new = *entry; 1003 1004 set_gsi(s, entry->gsi); 1005 } 1006 1007 static int kvm_update_routing_entry(KVMState *s, 1008 struct kvm_irq_routing_entry *new_entry) 1009 { 1010 struct kvm_irq_routing_entry *entry; 1011 int n; 1012 1013 for (n = 0; n < s->irq_routes->nr; n++) { 1014 entry = 
&s->irq_routes->entries[n]; 1015 if (entry->gsi != new_entry->gsi) { 1016 continue; 1017 } 1018 1019 if(!memcmp(entry, new_entry, sizeof *entry)) { 1020 return 0; 1021 } 1022 1023 *entry = *new_entry; 1024 1025 return 0; 1026 } 1027 1028 return -ESRCH; 1029 } 1030 1031 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1032 { 1033 struct kvm_irq_routing_entry e = {}; 1034 1035 assert(pin < s->gsi_count); 1036 1037 e.gsi = irq; 1038 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1039 e.flags = 0; 1040 e.u.irqchip.irqchip = irqchip; 1041 e.u.irqchip.pin = pin; 1042 kvm_add_routing_entry(s, &e); 1043 } 1044 1045 void kvm_irqchip_release_virq(KVMState *s, int virq) 1046 { 1047 struct kvm_irq_routing_entry *e; 1048 int i; 1049 1050 if (kvm_gsi_direct_mapping()) { 1051 return; 1052 } 1053 1054 for (i = 0; i < s->irq_routes->nr; i++) { 1055 e = &s->irq_routes->entries[i]; 1056 if (e->gsi == virq) { 1057 s->irq_routes->nr--; 1058 *e = s->irq_routes->entries[s->irq_routes->nr]; 1059 } 1060 } 1061 clear_gsi(s, virq); 1062 kvm_arch_release_virq_post(virq); 1063 trace_kvm_irqchip_release_virq(virq); 1064 } 1065 1066 static unsigned int kvm_hash_msi(uint32_t data) 1067 { 1068 /* This is optimized for IA32 MSI layout. However, no other arch shall 1069 * repeat the mistake of not providing a direct MSI injection API. */ 1070 return data & 0xff; 1071 } 1072 1073 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1074 { 1075 KVMMSIRoute *route, *next; 1076 unsigned int hash; 1077 1078 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1079 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1080 kvm_irqchip_release_virq(s, route->kroute.gsi); 1081 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1082 g_free(route); 1083 } 1084 } 1085 } 1086 1087 static int kvm_irqchip_get_virq(KVMState *s) 1088 { 1089 int next_virq; 1090 1091 /* 1092 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1093 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1094 * number can succeed even though a new route entry cannot be added. 1095 * When this happens, flush dynamic MSI entries to free IRQ route entries. 
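 * (On x86, for example, most of GSIs 0-15 are routed both to the PIC and
 * to the IOAPIC, so a single GSI number consumes two route entries.)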
1096 */ 1097 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1098 kvm_flush_dynamic_msi_routes(s); 1099 } 1100 1101 /* Return the lowest unused GSI in the bitmap */ 1102 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1103 if (next_virq >= s->gsi_count) { 1104 return -ENOSPC; 1105 } else { 1106 return next_virq; 1107 } 1108 } 1109 1110 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1111 { 1112 unsigned int hash = kvm_hash_msi(msg.data); 1113 KVMMSIRoute *route; 1114 1115 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1116 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1117 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1118 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1119 return route; 1120 } 1121 } 1122 return NULL; 1123 } 1124 1125 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1126 { 1127 struct kvm_msi msi; 1128 KVMMSIRoute *route; 1129 1130 if (kvm_direct_msi_allowed) { 1131 msi.address_lo = (uint32_t)msg.address; 1132 msi.address_hi = msg.address >> 32; 1133 msi.data = le32_to_cpu(msg.data); 1134 msi.flags = 0; 1135 memset(msi.pad, 0, sizeof(msi.pad)); 1136 1137 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1138 } 1139 1140 route = kvm_lookup_msi_route(s, msg); 1141 if (!route) { 1142 int virq; 1143 1144 virq = kvm_irqchip_get_virq(s); 1145 if (virq < 0) { 1146 return virq; 1147 } 1148 1149 route = g_malloc0(sizeof(KVMMSIRoute)); 1150 route->kroute.gsi = virq; 1151 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1152 route->kroute.flags = 0; 1153 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1154 route->kroute.u.msi.address_hi = msg.address >> 32; 1155 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1156 1157 kvm_add_routing_entry(s, &route->kroute); 1158 kvm_irqchip_commit_routes(s); 1159 1160 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1161 entry); 1162 } 1163 1164 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1165 1166 return kvm_set_irq(s, route->kroute.gsi, 1); 1167 } 1168 1169 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1170 { 1171 struct kvm_irq_routing_entry kroute = {}; 1172 int virq; 1173 MSIMessage msg = {0, 0}; 1174 1175 if (pci_available && dev) { 1176 msg = pci_get_msi_message(dev, vector); 1177 } 1178 1179 if (kvm_gsi_direct_mapping()) { 1180 return kvm_arch_msi_data_to_gsi(msg.data); 1181 } 1182 1183 if (!kvm_gsi_routing_enabled()) { 1184 return -ENOSYS; 1185 } 1186 1187 virq = kvm_irqchip_get_virq(s); 1188 if (virq < 0) { 1189 return virq; 1190 } 1191 1192 kroute.gsi = virq; 1193 kroute.type = KVM_IRQ_ROUTING_MSI; 1194 kroute.flags = 0; 1195 kroute.u.msi.address_lo = (uint32_t)msg.address; 1196 kroute.u.msi.address_hi = msg.address >> 32; 1197 kroute.u.msi.data = le32_to_cpu(msg.data); 1198 if (pci_available && kvm_msi_devid_required()) { 1199 kroute.flags = KVM_MSI_VALID_DEVID; 1200 kroute.u.msi.devid = pci_requester_id(dev); 1201 } 1202 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1203 kvm_irqchip_release_virq(s, virq); 1204 return -EINVAL; 1205 } 1206 1207 trace_kvm_irqchip_add_msi_route(dev ? 
dev->name : (char *)"N/A", 1208 vector, virq); 1209 1210 kvm_add_routing_entry(s, &kroute); 1211 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1212 kvm_irqchip_commit_routes(s); 1213 1214 return virq; 1215 } 1216 1217 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1218 PCIDevice *dev) 1219 { 1220 struct kvm_irq_routing_entry kroute = {}; 1221 1222 if (kvm_gsi_direct_mapping()) { 1223 return 0; 1224 } 1225 1226 if (!kvm_irqchip_in_kernel()) { 1227 return -ENOSYS; 1228 } 1229 1230 kroute.gsi = virq; 1231 kroute.type = KVM_IRQ_ROUTING_MSI; 1232 kroute.flags = 0; 1233 kroute.u.msi.address_lo = (uint32_t)msg.address; 1234 kroute.u.msi.address_hi = msg.address >> 32; 1235 kroute.u.msi.data = le32_to_cpu(msg.data); 1236 if (pci_available && kvm_msi_devid_required()) { 1237 kroute.flags = KVM_MSI_VALID_DEVID; 1238 kroute.u.msi.devid = pci_requester_id(dev); 1239 } 1240 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1241 return -EINVAL; 1242 } 1243 1244 trace_kvm_irqchip_update_msi_route(virq); 1245 1246 return kvm_update_routing_entry(s, &kroute); 1247 } 1248 1249 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, 1250 bool assign) 1251 { 1252 struct kvm_irqfd irqfd = { 1253 .fd = fd, 1254 .gsi = virq, 1255 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN, 1256 }; 1257 1258 if (rfd != -1) { 1259 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1260 irqfd.resamplefd = rfd; 1261 } 1262 1263 if (!kvm_irqfds_enabled()) { 1264 return -ENOSYS; 1265 } 1266 1267 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1268 } 1269 1270 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1271 { 1272 struct kvm_irq_routing_entry kroute = {}; 1273 int virq; 1274 1275 if (!kvm_gsi_routing_enabled()) { 1276 return -ENOSYS; 1277 } 1278 1279 virq = kvm_irqchip_get_virq(s); 1280 if (virq < 0) { 1281 return virq; 1282 } 1283 1284 kroute.gsi = virq; 1285 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1286 kroute.flags = 0; 1287 kroute.u.adapter.summary_addr = adapter->summary_addr; 1288 kroute.u.adapter.ind_addr = adapter->ind_addr; 1289 kroute.u.adapter.summary_offset = adapter->summary_offset; 1290 kroute.u.adapter.ind_offset = adapter->ind_offset; 1291 kroute.u.adapter.adapter_id = adapter->adapter_id; 1292 1293 kvm_add_routing_entry(s, &kroute); 1294 1295 return virq; 1296 } 1297 1298 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1299 { 1300 struct kvm_irq_routing_entry kroute = {}; 1301 int virq; 1302 1303 if (!kvm_gsi_routing_enabled()) { 1304 return -ENOSYS; 1305 } 1306 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1307 return -ENOSYS; 1308 } 1309 virq = kvm_irqchip_get_virq(s); 1310 if (virq < 0) { 1311 return virq; 1312 } 1313 1314 kroute.gsi = virq; 1315 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1316 kroute.flags = 0; 1317 kroute.u.hv_sint.vcpu = vcpu; 1318 kroute.u.hv_sint.sint = sint; 1319 1320 kvm_add_routing_entry(s, &kroute); 1321 kvm_irqchip_commit_routes(s); 1322 1323 return virq; 1324 } 1325 1326 #else /* !KVM_CAP_IRQ_ROUTING */ 1327 1328 void kvm_init_irq_routing(KVMState *s) 1329 { 1330 } 1331 1332 void kvm_irqchip_release_virq(KVMState *s, int virq) 1333 { 1334 } 1335 1336 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1337 { 1338 abort(); 1339 } 1340 1341 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1342 { 1343 return -ENOSYS; 1344 } 1345 1346 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1347 { 1348 return -ENOSYS; 1349 } 1350 1351 int 
kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1352 { 1353 return -ENOSYS; 1354 } 1355 1356 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) 1357 { 1358 abort(); 1359 } 1360 1361 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1362 { 1363 return -ENOSYS; 1364 } 1365 #endif /* !KVM_CAP_IRQ_ROUTING */ 1366 1367 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1368 EventNotifier *rn, int virq) 1369 { 1370 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), 1371 rn ? event_notifier_get_fd(rn) : -1, virq, true); 1372 } 1373 1374 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1375 int virq) 1376 { 1377 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, 1378 false); 1379 } 1380 1381 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 1382 EventNotifier *rn, qemu_irq irq) 1383 { 1384 gpointer key, gsi; 1385 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1386 1387 if (!found) { 1388 return -ENXIO; 1389 } 1390 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 1391 } 1392 1393 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 1394 qemu_irq irq) 1395 { 1396 gpointer key, gsi; 1397 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1398 1399 if (!found) { 1400 return -ENXIO; 1401 } 1402 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 1403 } 1404 1405 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 1406 { 1407 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 1408 } 1409 1410 static void kvm_irqchip_create(MachineState *machine, KVMState *s) 1411 { 1412 int ret; 1413 1414 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 1415 ; 1416 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 1417 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 1418 if (ret < 0) { 1419 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 1420 exit(1); 1421 } 1422 } else { 1423 return; 1424 } 1425 1426 /* First probe and see if there's a arch-specific hook to create the 1427 * in-kernel irqchip for us */ 1428 ret = kvm_arch_irqchip_create(machine, s); 1429 if (ret == 0) { 1430 if (machine_kernel_irqchip_split(machine)) { 1431 perror("Split IRQ chip mode not supported."); 1432 exit(1); 1433 } else { 1434 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 1435 } 1436 } 1437 if (ret < 0) { 1438 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 1439 exit(1); 1440 } 1441 1442 kvm_kernel_irqchip = true; 1443 /* If we have an in-kernel IRQ chip then we must have asynchronous 1444 * interrupt delivery (though the reverse is not necessarily true) 1445 */ 1446 kvm_async_interrupts_allowed = true; 1447 kvm_halt_in_kernel_allowed = true; 1448 1449 kvm_init_irq_routing(s); 1450 1451 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 1452 } 1453 1454 /* Find number of supported CPUs using the recommended 1455 * procedure from the kernel API documentation to cope with 1456 * older kernels that may be missing capabilities. 1457 */ 1458 static int kvm_recommended_vcpus(KVMState *s) 1459 { 1460 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 1461 return (ret) ? ret : 4; 1462 } 1463 1464 static int kvm_max_vcpus(KVMState *s) 1465 { 1466 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 1467 return (ret) ? 
ret : kvm_recommended_vcpus(s); 1468 } 1469 1470 static int kvm_max_vcpu_id(KVMState *s) 1471 { 1472 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 1473 return (ret) ? ret : kvm_max_vcpus(s); 1474 } 1475 1476 bool kvm_vcpu_id_is_valid(int vcpu_id) 1477 { 1478 KVMState *s = KVM_STATE(current_machine->accelerator); 1479 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1480 } 1481 1482 static int kvm_init(MachineState *ms) 1483 { 1484 MachineClass *mc = MACHINE_GET_CLASS(ms); 1485 static const char upgrade_note[] = 1486 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1487 "(see http://sourceforge.net/projects/kvm).\n"; 1488 struct { 1489 const char *name; 1490 int num; 1491 } num_cpus[] = { 1492 { "SMP", smp_cpus }, 1493 { "hotpluggable", max_cpus }, 1494 { NULL, } 1495 }, *nc = num_cpus; 1496 int soft_vcpus_limit, hard_vcpus_limit; 1497 KVMState *s; 1498 const KVMCapabilityInfo *missing_cap; 1499 int ret; 1500 int type = 0; 1501 const char *kvm_type; 1502 1503 s = KVM_STATE(ms->accelerator); 1504 1505 /* 1506 * On systems where the kernel can support different base page 1507 * sizes, host page size may be different from TARGET_PAGE_SIZE, 1508 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 1509 * page size for the system though. 1510 */ 1511 assert(TARGET_PAGE_SIZE <= getpagesize()); 1512 1513 s->sigmask_len = 8; 1514 1515 #ifdef KVM_CAP_SET_GUEST_DEBUG 1516 QTAILQ_INIT(&s->kvm_sw_breakpoints); 1517 #endif 1518 QLIST_INIT(&s->kvm_parked_vcpus); 1519 s->vmfd = -1; 1520 s->fd = qemu_open("/dev/kvm", O_RDWR); 1521 if (s->fd == -1) { 1522 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 1523 ret = -errno; 1524 goto err; 1525 } 1526 1527 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 1528 if (ret < KVM_API_VERSION) { 1529 if (ret >= 0) { 1530 ret = -EINVAL; 1531 } 1532 fprintf(stderr, "kvm version too old\n"); 1533 goto err; 1534 } 1535 1536 if (ret > KVM_API_VERSION) { 1537 ret = -EINVAL; 1538 fprintf(stderr, "kvm version not supported\n"); 1539 goto err; 1540 } 1541 1542 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 1543 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 1544 1545 /* If unspecified, use the default value */ 1546 if (!s->nr_slots) { 1547 s->nr_slots = 32; 1548 } 1549 1550 kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type"); 1551 if (mc->kvm_type) { 1552 type = mc->kvm_type(kvm_type); 1553 } else if (kvm_type) { 1554 ret = -EINVAL; 1555 fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type); 1556 goto err; 1557 } 1558 1559 do { 1560 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 1561 } while (ret == -EINTR); 1562 1563 if (ret < 0) { 1564 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 1565 strerror(-ret)); 1566 1567 #ifdef TARGET_S390X 1568 if (ret == -EINVAL) { 1569 fprintf(stderr, 1570 "Host kernel setup problem detected. 
Please verify:\n"); 1571 fprintf(stderr, "- for kernels supporting the switch_amode or" 1572 " user_mode parameters, whether\n"); 1573 fprintf(stderr, 1574 " user space is running in primary address space\n"); 1575 fprintf(stderr, 1576 "- for kernels supporting the vm.allocate_pgste sysctl, " 1577 "whether it is enabled\n"); 1578 } 1579 #endif 1580 goto err; 1581 } 1582 1583 s->vmfd = ret; 1584 1585 /* check the vcpu limits */ 1586 soft_vcpus_limit = kvm_recommended_vcpus(s); 1587 hard_vcpus_limit = kvm_max_vcpus(s); 1588 1589 while (nc->name) { 1590 if (nc->num > soft_vcpus_limit) { 1591 warn_report("Number of %s cpus requested (%d) exceeds " 1592 "the recommended cpus supported by KVM (%d)", 1593 nc->name, nc->num, soft_vcpus_limit); 1594 1595 if (nc->num > hard_vcpus_limit) { 1596 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 1597 "the maximum cpus supported by KVM (%d)\n", 1598 nc->name, nc->num, hard_vcpus_limit); 1599 exit(1); 1600 } 1601 } 1602 nc++; 1603 } 1604 1605 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 1606 if (!missing_cap) { 1607 missing_cap = 1608 kvm_check_extension_list(s, kvm_arch_required_capabilities); 1609 } 1610 if (missing_cap) { 1611 ret = -EINVAL; 1612 fprintf(stderr, "kvm does not support %s\n%s", 1613 missing_cap->name, upgrade_note); 1614 goto err; 1615 } 1616 1617 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 1618 1619 #ifdef KVM_CAP_VCPU_EVENTS 1620 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 1621 #endif 1622 1623 s->robust_singlestep = 1624 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 1625 1626 #ifdef KVM_CAP_DEBUGREGS 1627 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 1628 #endif 1629 1630 #ifdef KVM_CAP_IRQ_ROUTING 1631 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 1632 #endif 1633 1634 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 1635 1636 s->irq_set_ioctl = KVM_IRQ_LINE; 1637 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 1638 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 1639 } 1640 1641 #ifdef KVM_CAP_READONLY_MEM 1642 kvm_readonly_mem_allowed = 1643 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 1644 #endif 1645 1646 kvm_eventfds_allowed = 1647 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 1648 1649 kvm_irqfds_allowed = 1650 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 1651 1652 kvm_resamplefds_allowed = 1653 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 1654 1655 kvm_vm_attributes_allowed = 1656 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 1657 1658 kvm_ioeventfd_any_length_allowed = 1659 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 1660 1661 kvm_state = s; 1662 1663 /* 1664 * if memory encryption object is specified then initialize the memory 1665 * encryption context. 
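 * The handle returned by sev_guest_init() is what kvm_memcrypt_enabled()
 * and kvm_memcrypt_encrypt_data() above key off.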
1666 */ 1667 if (ms->memory_encryption) { 1668 kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); 1669 if (!kvm_state->memcrypt_handle) { 1670 ret = -1; 1671 goto err; 1672 } 1673 1674 kvm_state->memcrypt_encrypt_data = sev_encrypt_data; 1675 } 1676 1677 ret = kvm_arch_init(ms, s); 1678 if (ret < 0) { 1679 goto err; 1680 } 1681 1682 if (machine_kernel_irqchip_allowed(ms)) { 1683 kvm_irqchip_create(ms, s); 1684 } 1685 1686 if (kvm_eventfds_allowed) { 1687 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 1688 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 1689 } 1690 s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region; 1691 s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region; 1692 1693 kvm_memory_listener_register(s, &s->memory_listener, 1694 &address_space_memory, 0); 1695 memory_listener_register(&kvm_io_listener, 1696 &address_space_io); 1697 1698 s->many_ioeventfds = kvm_check_many_ioeventfds(); 1699 1700 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 1701 1702 return 0; 1703 1704 err: 1705 assert(ret < 0); 1706 if (s->vmfd >= 0) { 1707 close(s->vmfd); 1708 } 1709 if (s->fd != -1) { 1710 close(s->fd); 1711 } 1712 g_free(s->memory_listener.slots); 1713 1714 return ret; 1715 } 1716 1717 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 1718 { 1719 s->sigmask_len = sigmask_len; 1720 } 1721 1722 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 1723 int size, uint32_t count) 1724 { 1725 int i; 1726 uint8_t *ptr = data; 1727 1728 for (i = 0; i < count; i++) { 1729 address_space_rw(&address_space_io, port, attrs, 1730 ptr, size, 1731 direction == KVM_EXIT_IO_OUT); 1732 ptr += size; 1733 } 1734 } 1735 1736 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 1737 { 1738 fprintf(stderr, "KVM internal error. Suberror: %d\n", 1739 run->internal.suberror); 1740 1741 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 1742 int i; 1743 1744 for (i = 0; i < run->internal.ndata; ++i) { 1745 fprintf(stderr, "extra data[%d]: %"PRIx64"\n", 1746 i, (uint64_t)run->internal.data[i]); 1747 } 1748 } 1749 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 1750 fprintf(stderr, "emulation failure\n"); 1751 if (!kvm_arch_stop_on_emulation_error(cpu)) { 1752 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); 1753 return EXCP_INTERRUPT; 1754 } 1755 } 1756 /* FIXME: Should trigger a qmp message to let management know 1757 * something went wrong. 
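 * Returning a negative value makes kvm_cpu_exec() dump the CPU state and
 * stop the VM in RUN_STATE_INTERNAL_ERROR.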
1758 */ 1759 return -1; 1760 } 1761 1762 void kvm_flush_coalesced_mmio_buffer(void) 1763 { 1764 KVMState *s = kvm_state; 1765 1766 if (s->coalesced_flush_in_progress) { 1767 return; 1768 } 1769 1770 s->coalesced_flush_in_progress = true; 1771 1772 if (s->coalesced_mmio_ring) { 1773 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 1774 while (ring->first != ring->last) { 1775 struct kvm_coalesced_mmio *ent; 1776 1777 ent = &ring->coalesced_mmio[ring->first]; 1778 1779 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 1780 smp_wmb(); 1781 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 1782 } 1783 } 1784 1785 s->coalesced_flush_in_progress = false; 1786 } 1787 1788 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 1789 { 1790 if (!cpu->vcpu_dirty) { 1791 kvm_arch_get_registers(cpu); 1792 cpu->vcpu_dirty = true; 1793 } 1794 } 1795 1796 void kvm_cpu_synchronize_state(CPUState *cpu) 1797 { 1798 if (!cpu->vcpu_dirty) { 1799 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 1800 } 1801 } 1802 1803 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 1804 { 1805 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 1806 cpu->vcpu_dirty = false; 1807 } 1808 1809 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 1810 { 1811 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 1812 } 1813 1814 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 1815 { 1816 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 1817 cpu->vcpu_dirty = false; 1818 } 1819 1820 void kvm_cpu_synchronize_post_init(CPUState *cpu) 1821 { 1822 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 1823 } 1824 1825 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 1826 { 1827 cpu->vcpu_dirty = true; 1828 } 1829 1830 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 1831 { 1832 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 1833 } 1834 1835 #ifdef KVM_HAVE_MCE_INJECTION 1836 static __thread void *pending_sigbus_addr; 1837 static __thread int pending_sigbus_code; 1838 static __thread bool have_sigbus_pending; 1839 #endif 1840 1841 static void kvm_cpu_kick(CPUState *cpu) 1842 { 1843 atomic_set(&cpu->kvm_run->immediate_exit, 1); 1844 } 1845 1846 static void kvm_cpu_kick_self(void) 1847 { 1848 if (kvm_immediate_exit) { 1849 kvm_cpu_kick(current_cpu); 1850 } else { 1851 qemu_cpu_kick_self(); 1852 } 1853 } 1854 1855 static void kvm_eat_signals(CPUState *cpu) 1856 { 1857 struct timespec ts = { 0, 0 }; 1858 siginfo_t siginfo; 1859 sigset_t waitset; 1860 sigset_t chkset; 1861 int r; 1862 1863 if (kvm_immediate_exit) { 1864 atomic_set(&cpu->kvm_run->immediate_exit, 0); 1865 /* Write kvm_run->immediate_exit before the cpu->exit_request 1866 * write in kvm_cpu_exec. 
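 * This smp_wmb() pairs with the smp_rmb() issued before KVM_RUN in
 * kvm_cpu_exec() below.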
1867 */ 1868 smp_wmb(); 1869 return; 1870 } 1871 1872 sigemptyset(&waitset); 1873 sigaddset(&waitset, SIG_IPI); 1874 1875 do { 1876 r = sigtimedwait(&waitset, &siginfo, &ts); 1877 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 1878 perror("sigtimedwait"); 1879 exit(1); 1880 } 1881 1882 r = sigpending(&chkset); 1883 if (r == -1) { 1884 perror("sigpending"); 1885 exit(1); 1886 } 1887 } while (sigismember(&chkset, SIG_IPI)); 1888 } 1889 1890 int kvm_cpu_exec(CPUState *cpu) 1891 { 1892 struct kvm_run *run = cpu->kvm_run; 1893 int ret, run_ret; 1894 1895 DPRINTF("kvm_cpu_exec()\n"); 1896 1897 if (kvm_arch_process_async_events(cpu)) { 1898 atomic_set(&cpu->exit_request, 0); 1899 return EXCP_HLT; 1900 } 1901 1902 qemu_mutex_unlock_iothread(); 1903 cpu_exec_start(cpu); 1904 1905 do { 1906 MemTxAttrs attrs; 1907 1908 if (cpu->vcpu_dirty) { 1909 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 1910 cpu->vcpu_dirty = false; 1911 } 1912 1913 kvm_arch_pre_run(cpu, run); 1914 if (atomic_read(&cpu->exit_request)) { 1915 DPRINTF("interrupt exit requested\n"); 1916 /* 1917 * KVM requires us to reenter the kernel after IO exits to complete 1918 * instruction emulation. This self-signal will ensure that we 1919 * leave ASAP again. 1920 */ 1921 kvm_cpu_kick_self(); 1922 } 1923 1924 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 1925 * Matching barrier in kvm_eat_signals. 1926 */ 1927 smp_rmb(); 1928 1929 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 1930 1931 attrs = kvm_arch_post_run(cpu, run); 1932 1933 #ifdef KVM_HAVE_MCE_INJECTION 1934 if (unlikely(have_sigbus_pending)) { 1935 qemu_mutex_lock_iothread(); 1936 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 1937 pending_sigbus_addr); 1938 have_sigbus_pending = false; 1939 qemu_mutex_unlock_iothread(); 1940 } 1941 #endif 1942 1943 if (run_ret < 0) { 1944 if (run_ret == -EINTR || run_ret == -EAGAIN) { 1945 DPRINTF("io window exit\n"); 1946 kvm_eat_signals(cpu); 1947 ret = EXCP_INTERRUPT; 1948 break; 1949 } 1950 fprintf(stderr, "error: kvm run failed %s\n", 1951 strerror(-run_ret)); 1952 #ifdef TARGET_PPC 1953 if (run_ret == -EBUSY) { 1954 fprintf(stderr, 1955 "This is probably because your SMT is enabled.\n" 1956 "VCPU can only run on primary threads with all " 1957 "secondary threads offline.\n"); 1958 } 1959 #endif 1960 ret = -1; 1961 break; 1962 } 1963 1964 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 1965 switch (run->exit_reason) { 1966 case KVM_EXIT_IO: 1967 DPRINTF("handle_io\n"); 1968 /* Called outside BQL */ 1969 kvm_handle_io(run->io.port, attrs, 1970 (uint8_t *)run + run->io.data_offset, 1971 run->io.direction, 1972 run->io.size, 1973 run->io.count); 1974 ret = 0; 1975 break; 1976 case KVM_EXIT_MMIO: 1977 DPRINTF("handle_mmio\n"); 1978 /* Called outside BQL */ 1979 address_space_rw(&address_space_memory, 1980 run->mmio.phys_addr, attrs, 1981 run->mmio.data, 1982 run->mmio.len, 1983 run->mmio.is_write); 1984 ret = 0; 1985 break; 1986 case KVM_EXIT_IRQ_WINDOW_OPEN: 1987 DPRINTF("irq_window_open\n"); 1988 ret = EXCP_INTERRUPT; 1989 break; 1990 case KVM_EXIT_SHUTDOWN: 1991 DPRINTF("shutdown\n"); 1992 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 1993 ret = EXCP_INTERRUPT; 1994 break; 1995 case KVM_EXIT_UNKNOWN: 1996 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 1997 (uint64_t)run->hw.hardware_exit_reason); 1998 ret = -1; 1999 break; 2000 case KVM_EXIT_INTERNAL_ERROR: 2001 ret = kvm_handle_internal_error(cpu, run); 2002 break; 2003 case KVM_EXIT_SYSTEM_EVENT: 2004 switch 
(run->system_event.type) { 2005 case KVM_SYSTEM_EVENT_SHUTDOWN: 2006 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 2007 ret = EXCP_INTERRUPT; 2008 break; 2009 case KVM_SYSTEM_EVENT_RESET: 2010 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2011 ret = EXCP_INTERRUPT; 2012 break; 2013 case KVM_SYSTEM_EVENT_CRASH: 2014 kvm_cpu_synchronize_state(cpu); 2015 qemu_mutex_lock_iothread(); 2016 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 2017 qemu_mutex_unlock_iothread(); 2018 ret = 0; 2019 break; 2020 default: 2021 DPRINTF("kvm_arch_handle_exit\n"); 2022 ret = kvm_arch_handle_exit(cpu, run); 2023 break; 2024 } 2025 break; 2026 default: 2027 DPRINTF("kvm_arch_handle_exit\n"); 2028 ret = kvm_arch_handle_exit(cpu, run); 2029 break; 2030 } 2031 } while (ret == 0); 2032 2033 cpu_exec_end(cpu); 2034 qemu_mutex_lock_iothread(); 2035 2036 if (ret < 0) { 2037 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); 2038 vm_stop(RUN_STATE_INTERNAL_ERROR); 2039 } 2040 2041 atomic_set(&cpu->exit_request, 0); 2042 return ret; 2043 } 2044 2045 int kvm_ioctl(KVMState *s, int type, ...) 2046 { 2047 int ret; 2048 void *arg; 2049 va_list ap; 2050 2051 va_start(ap, type); 2052 arg = va_arg(ap, void *); 2053 va_end(ap); 2054 2055 trace_kvm_ioctl(type, arg); 2056 ret = ioctl(s->fd, type, arg); 2057 if (ret == -1) { 2058 ret = -errno; 2059 } 2060 return ret; 2061 } 2062 2063 int kvm_vm_ioctl(KVMState *s, int type, ...) 2064 { 2065 int ret; 2066 void *arg; 2067 va_list ap; 2068 2069 va_start(ap, type); 2070 arg = va_arg(ap, void *); 2071 va_end(ap); 2072 2073 trace_kvm_vm_ioctl(type, arg); 2074 ret = ioctl(s->vmfd, type, arg); 2075 if (ret == -1) { 2076 ret = -errno; 2077 } 2078 return ret; 2079 } 2080 2081 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 2082 { 2083 int ret; 2084 void *arg; 2085 va_list ap; 2086 2087 va_start(ap, type); 2088 arg = va_arg(ap, void *); 2089 va_end(ap); 2090 2091 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 2092 ret = ioctl(cpu->kvm_fd, type, arg); 2093 if (ret == -1) { 2094 ret = -errno; 2095 } 2096 return ret; 2097 } 2098 2099 int kvm_device_ioctl(int fd, int type, ...) 2100 { 2101 int ret; 2102 void *arg; 2103 va_list ap; 2104 2105 va_start(ap, type); 2106 arg = va_arg(ap, void *); 2107 va_end(ap); 2108 2109 trace_kvm_device_ioctl(fd, type, arg); 2110 ret = ioctl(fd, type, arg); 2111 if (ret == -1) { 2112 ret = -errno; 2113 } 2114 return ret; 2115 } 2116 2117 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 2118 { 2119 int ret; 2120 struct kvm_device_attr attribute = { 2121 .group = group, 2122 .attr = attr, 2123 }; 2124 2125 if (!kvm_vm_attributes_allowed) { 2126 return 0; 2127 } 2128 2129 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 2130 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 2131 return ret ? 0 : 1; 2132 } 2133 2134 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 2135 { 2136 struct kvm_device_attr attribute = { 2137 .group = group, 2138 .attr = attr, 2139 .flags = 0, 2140 }; 2141 2142 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1; 2143 } 2144 2145 int kvm_device_access(int fd, int group, uint64_t attr, 2146 void *val, bool write, Error **errp) 2147 { 2148 struct kvm_device_attr kvmattr; 2149 int err; 2150 2151 kvmattr.flags = 0; 2152 kvmattr.group = group; 2153 kvmattr.attr = attr; 2154 kvmattr.addr = (uintptr_t)val; 2155 2156 err = kvm_device_ioctl(fd, 2157 write ? 
KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, 2158 &kvmattr); 2159 if (err < 0) { 2160 error_setg_errno(errp, -err, 2161 "KVM_%s_DEVICE_ATTR failed: Group %d " 2162 "attr 0x%016" PRIx64, 2163 write ? "SET" : "GET", group, attr); 2164 } 2165 return err; 2166 } 2167 2168 bool kvm_has_sync_mmu(void) 2169 { 2170 return kvm_state->sync_mmu; 2171 } 2172 2173 int kvm_has_vcpu_events(void) 2174 { 2175 return kvm_state->vcpu_events; 2176 } 2177 2178 int kvm_has_robust_singlestep(void) 2179 { 2180 return kvm_state->robust_singlestep; 2181 } 2182 2183 int kvm_has_debugregs(void) 2184 { 2185 return kvm_state->debugregs; 2186 } 2187 2188 int kvm_has_many_ioeventfds(void) 2189 { 2190 if (!kvm_enabled()) { 2191 return 0; 2192 } 2193 return kvm_state->many_ioeventfds; 2194 } 2195 2196 int kvm_has_gsi_routing(void) 2197 { 2198 #ifdef KVM_CAP_IRQ_ROUTING 2199 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING); 2200 #else 2201 return false; 2202 #endif 2203 } 2204 2205 int kvm_has_intx_set_mask(void) 2206 { 2207 return kvm_state->intx_set_mask; 2208 } 2209 2210 bool kvm_arm_supports_user_irq(void) 2211 { 2212 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); 2213 } 2214 2215 #ifdef KVM_CAP_SET_GUEST_DEBUG 2216 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, 2217 target_ulong pc) 2218 { 2219 struct kvm_sw_breakpoint *bp; 2220 2221 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) { 2222 if (bp->pc == pc) { 2223 return bp; 2224 } 2225 } 2226 return NULL; 2227 } 2228 2229 int kvm_sw_breakpoints_active(CPUState *cpu) 2230 { 2231 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints); 2232 } 2233 2234 struct kvm_set_guest_debug_data { 2235 struct kvm_guest_debug dbg; 2236 int err; 2237 }; 2238 2239 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data) 2240 { 2241 struct kvm_set_guest_debug_data *dbg_data = 2242 (struct kvm_set_guest_debug_data *) data.host_ptr; 2243 2244 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, 2245 &dbg_data->dbg); 2246 } 2247 2248 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 2249 { 2250 struct kvm_set_guest_debug_data data; 2251 2252 data.dbg.control = reinject_trap; 2253 2254 if (cpu->singlestep_enabled) { 2255 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; 2256 } 2257 kvm_arch_update_guest_debug(cpu, &data.dbg); 2258 2259 run_on_cpu(cpu, kvm_invoke_set_guest_debug, 2260 RUN_ON_CPU_HOST_PTR(&data)); 2261 return data.err; 2262 } 2263 2264 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 2265 target_ulong len, int type) 2266 { 2267 struct kvm_sw_breakpoint *bp; 2268 int err; 2269 2270 if (type == GDB_BREAKPOINT_SW) { 2271 bp = kvm_find_sw_breakpoint(cpu, addr); 2272 if (bp) { 2273 bp->use_count++; 2274 return 0; 2275 } 2276 2277 bp = g_malloc(sizeof(struct kvm_sw_breakpoint)); 2278 bp->pc = addr; 2279 bp->use_count = 1; 2280 err = kvm_arch_insert_sw_breakpoint(cpu, bp); 2281 if (err) { 2282 g_free(bp); 2283 return err; 2284 } 2285 2286 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 2287 } else { 2288 err = kvm_arch_insert_hw_breakpoint(addr, len, type); 2289 if (err) { 2290 return err; 2291 } 2292 } 2293 2294 CPU_FOREACH(cpu) { 2295 err = kvm_update_guest_debug(cpu, 0); 2296 if (err) { 2297 return err; 2298 } 2299 } 2300 return 0; 2301 } 2302 2303 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, 2304 target_ulong len, int type) 2305 { 2306 struct kvm_sw_breakpoint *bp; 2307 int err; 2308 2309 if (type == GDB_BREAKPOINT_SW) 
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}
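
/* SIGBUS/MCE handling: in the vCPU thread the signal arrives asynchronously,
 * so kvm_on_sigbus_vcpu() only records the pending error and requests a vCPU
 * exit so it can be processed outside of signal context.  In the main thread
 * the signal is read synchronously from signalfd and can be forwarded to the
 * architecture code directly.  Without KVM_HAVE_MCE_INJECTION both hooks
 * return 1 to report the SIGBUS as unhandled.
 */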
/* Called asynchronously in VCPU thread.  */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread.  */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);