1 /* 2 * QEMU KVM support 3 * 4 * Copyright IBM, Corp. 2008 5 * Red Hat, Inc. 2008 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Glauber Costa <gcosta@redhat.com> 10 * 11 * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 * See the COPYING file in the top-level directory. 13 * 14 */ 15 16 #include "qemu/osdep.h" 17 #include <sys/ioctl.h> 18 19 #include <linux/kvm.h> 20 21 #include "qemu-common.h" 22 #include "qemu/atomic.h" 23 #include "qemu/option.h" 24 #include "qemu/config-file.h" 25 #include "qemu/error-report.h" 26 #include "qapi/error.h" 27 #include "hw/hw.h" 28 #include "hw/pci/msi.h" 29 #include "hw/pci/msix.h" 30 #include "hw/s390x/adapter.h" 31 #include "exec/gdbstub.h" 32 #include "sysemu/kvm_int.h" 33 #include "sysemu/cpus.h" 34 #include "qemu/bswap.h" 35 #include "exec/memory.h" 36 #include "exec/ram_addr.h" 37 #include "exec/address-spaces.h" 38 #include "qemu/event_notifier.h" 39 #include "trace.h" 40 #include "hw/irq.h" 41 42 #include "hw/boards.h" 43 44 /* This check must be after config-host.h is included */ 45 #ifdef CONFIG_EVENTFD 46 #include <sys/eventfd.h> 47 #endif 48 49 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We 50 * need to use the real host PAGE_SIZE, as that's what KVM will use. 51 */ 52 #define PAGE_SIZE getpagesize() 53 54 //#define DEBUG_KVM 55 56 #ifdef DEBUG_KVM 57 #define DPRINTF(fmt, ...) \ 58 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) 59 #else 60 #define DPRINTF(fmt, ...) \ 61 do { } while (0) 62 #endif 63 64 #define KVM_MSI_HASHTAB_SIZE 256 65 66 struct KVMParkedVcpu { 67 unsigned long vcpu_id; 68 int kvm_fd; 69 QLIST_ENTRY(KVMParkedVcpu) node; 70 }; 71 72 struct KVMState 73 { 74 AccelState parent_obj; 75 76 int nr_slots; 77 int fd; 78 int vmfd; 79 int coalesced_mmio; 80 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 81 bool coalesced_flush_in_progress; 82 int vcpu_events; 83 int robust_singlestep; 84 int debugregs; 85 #ifdef KVM_CAP_SET_GUEST_DEBUG 86 struct kvm_sw_breakpoint_head kvm_sw_breakpoints; 87 #endif 88 int many_ioeventfds; 89 int intx_set_mask; 90 bool sync_mmu; 91 /* The man page (and posix) say ioctl numbers are signed int, but 92 * they're not. Linux, glibc and *BSD all treat ioctl numbers as 93 * unsigned, and treating them as signed here can break things */ 94 unsigned irq_set_ioctl; 95 unsigned int sigmask_len; 96 GHashTable *gsimap; 97 #ifdef KVM_CAP_IRQ_ROUTING 98 struct kvm_irq_routing *irq_routes; 99 int nr_allocated_irq_routes; 100 unsigned long *used_gsi_bitmap; 101 unsigned int gsi_count; 102 QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; 103 #endif 104 KVMMemoryListener memory_listener; 105 QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; 106 }; 107 108 KVMState *kvm_state; 109 bool kvm_kernel_irqchip; 110 bool kvm_split_irqchip; 111 bool kvm_async_interrupts_allowed; 112 bool kvm_halt_in_kernel_allowed; 113 bool kvm_eventfds_allowed; 114 bool kvm_irqfds_allowed; 115 bool kvm_resamplefds_allowed; 116 bool kvm_msi_via_irqfd_allowed; 117 bool kvm_gsi_routing_allowed; 118 bool kvm_gsi_direct_mapping; 119 bool kvm_allowed; 120 bool kvm_readonly_mem_allowed; 121 bool kvm_vm_attributes_allowed; 122 bool kvm_direct_msi_allowed; 123 bool kvm_ioeventfd_any_length_allowed; 124 bool kvm_msi_use_devid; 125 static bool kvm_immediate_exit; 126 127 static const KVMCapabilityInfo kvm_required_capabilites[] = { 128 KVM_CAP_INFO(USER_MEMORY), 129 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), 130 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), 131 KVM_CAP_LAST_INFO 132 }; 133 134 int kvm_get_max_memslots(void) 135 { 136 KVMState *s = KVM_STATE(current_machine->accelerator); 137 138 return s->nr_slots; 139 } 140 141 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 142 { 143 KVMState *s = kvm_state; 144 int i; 145 146 for (i = 0; i < s->nr_slots; i++) { 147 if (kml->slots[i].memory_size == 0) { 148 return &kml->slots[i]; 149 } 150 } 151 152 return NULL; 153 } 154 155 bool kvm_has_free_slot(MachineState *ms) 156 { 157 KVMState *s = KVM_STATE(ms->accelerator); 158 159 return kvm_get_free_slot(&s->memory_listener); 160 } 161 162 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 163 { 164 KVMSlot *slot = kvm_get_free_slot(kml); 165 166 if (slot) { 167 return slot; 168 } 169 170 fprintf(stderr, "%s: no free slot available\n", __func__); 171 abort(); 172 } 173 174 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 175 hwaddr start_addr, 176 hwaddr size) 177 { 178 KVMState *s = kvm_state; 179 int i; 180 181 for (i = 0; i < s->nr_slots; i++) { 182 KVMSlot *mem = &kml->slots[i]; 183 184 if (start_addr == mem->start_addr && size == mem->memory_size) { 185 return mem; 186 } 187 } 188 189 return NULL; 190 } 191 192 /* 193 * Calculate and align the start address and the size of the section. 194 * Return the size. If the size is 0, the aligned section is empty. 195 */ 196 static hwaddr kvm_align_section(MemoryRegionSection *section, 197 hwaddr *start) 198 { 199 hwaddr size = int128_get64(section->size); 200 hwaddr delta, aligned; 201 202 /* kvm works in page size chunks, but the function may be called 203 with sub-page size and unaligned start address. Pad the start 204 address to next and truncate size to previous page boundary. */ 205 aligned = ROUND_UP(section->offset_within_address_space, 206 qemu_real_host_page_size); 207 delta = aligned - section->offset_within_address_space; 208 *start = aligned; 209 if (delta > size) { 210 return 0; 211 } 212 213 return (size - delta) & qemu_real_host_page_mask; 214 } 215 216 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 217 hwaddr *phys_addr) 218 { 219 KVMMemoryListener *kml = &s->memory_listener; 220 int i; 221 222 for (i = 0; i < s->nr_slots; i++) { 223 KVMSlot *mem = &kml->slots[i]; 224 225 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 226 *phys_addr = mem->start_addr + (ram - mem->ram); 227 return 1; 228 } 229 } 230 231 return 0; 232 } 233 234 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot) 235 { 236 KVMState *s = kvm_state; 237 struct kvm_userspace_memory_region mem; 238 int ret; 239 240 mem.slot = slot->slot | (kml->as_id << 16); 241 mem.guest_phys_addr = slot->start_addr; 242 mem.userspace_addr = (unsigned long)slot->ram; 243 mem.flags = slot->flags; 244 245 if (slot->memory_size && mem.flags & KVM_MEM_READONLY) { 246 /* Set the slot size to 0 before setting the slot to the desired 247 * value. This is needed based on KVM commit 75d61fbc. */ 248 mem.memory_size = 0; 249 kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 250 } 251 mem.memory_size = slot->memory_size; 252 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 253 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 254 mem.memory_size, mem.userspace_addr, ret); 255 return ret; 256 } 257 258 int kvm_destroy_vcpu(CPUState *cpu) 259 { 260 KVMState *s = kvm_state; 261 long mmap_size; 262 struct KVMParkedVcpu *vcpu = NULL; 263 int ret = 0; 264 265 DPRINTF("kvm_destroy_vcpu\n"); 266 267 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 268 if (mmap_size < 0) { 269 ret = mmap_size; 270 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 271 goto err; 272 } 273 274 ret = munmap(cpu->kvm_run, mmap_size); 275 if (ret < 0) { 276 goto err; 277 } 278 279 vcpu = g_malloc0(sizeof(*vcpu)); 280 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 281 vcpu->kvm_fd = cpu->kvm_fd; 282 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 283 err: 284 return ret; 285 } 286 287 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 288 { 289 struct KVMParkedVcpu *cpu; 290 291 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 292 if (cpu->vcpu_id == vcpu_id) { 293 int kvm_fd; 294 295 QLIST_REMOVE(cpu, node); 296 kvm_fd = cpu->kvm_fd; 297 g_free(cpu); 298 return kvm_fd; 299 } 300 } 301 302 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 303 } 304 305 int kvm_init_vcpu(CPUState *cpu) 306 { 307 KVMState *s = kvm_state; 308 long mmap_size; 309 int ret; 310 311 DPRINTF("kvm_init_vcpu\n"); 312 313 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 314 if (ret < 0) { 315 DPRINTF("kvm_create_vcpu failed\n"); 316 goto err; 317 } 318 319 cpu->kvm_fd = ret; 320 cpu->kvm_state = s; 321 cpu->vcpu_dirty = true; 322 323 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 324 if (mmap_size < 0) { 325 ret = mmap_size; 326 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 327 goto err; 328 } 329 330 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 331 cpu->kvm_fd, 0); 332 if (cpu->kvm_run == MAP_FAILED) { 333 ret = -errno; 334 DPRINTF("mmap'ing vcpu state failed\n"); 335 goto err; 336 } 337 338 if (s->coalesced_mmio && !s->coalesced_mmio_ring) { 339 s->coalesced_mmio_ring = 340 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; 341 } 342 343 ret = kvm_arch_init_vcpu(cpu); 344 err: 345 return ret; 346 } 347 348 /* 349 * dirty pages logging control 350 */ 351 352 static int kvm_mem_flags(MemoryRegion *mr) 353 { 354 bool readonly = mr->readonly || memory_region_is_romd(mr); 355 int flags = 0; 356 357 if (memory_region_get_dirty_log_mask(mr) != 0) { 358 flags |= KVM_MEM_LOG_DIRTY_PAGES; 359 } 360 if (readonly && kvm_readonly_mem_allowed) { 361 flags |= KVM_MEM_READONLY; 362 } 363 return flags; 364 } 365 366 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, 367 MemoryRegion *mr) 368 { 369 int old_flags; 370 371 old_flags = mem->flags; 372 mem->flags = kvm_mem_flags(mr); 373 374 /* If nothing changed effectively, no need to issue ioctl */ 375 if (mem->flags == old_flags) { 376 return 0; 377 } 378 379 return kvm_set_user_memory_region(kml, mem); 380 } 381 382 static int kvm_section_update_flags(KVMMemoryListener *kml, 383 MemoryRegionSection *section) 384 { 385 hwaddr start_addr, size; 386 KVMSlot *mem; 387 388 size = kvm_align_section(section, &start_addr); 389 if (!size) { 390 return 0; 391 } 392 393 mem = kvm_lookup_matching_slot(kml, start_addr, size); 394 if (!mem) { 395 /* We don't have a slot if we want to trap every access. */ 396 return 0; 397 } 398 399 return kvm_slot_update_flags(kml, mem, section->mr); 400 } 401 402 static void kvm_log_start(MemoryListener *listener, 403 MemoryRegionSection *section, 404 int old, int new) 405 { 406 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 407 int r; 408 409 if (old != 0) { 410 return; 411 } 412 413 r = kvm_section_update_flags(kml, section); 414 if (r < 0) { 415 abort(); 416 } 417 } 418 419 static void kvm_log_stop(MemoryListener *listener, 420 MemoryRegionSection *section, 421 int old, int new) 422 { 423 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 424 int r; 425 426 if (new != 0) { 427 return; 428 } 429 430 r = kvm_section_update_flags(kml, section); 431 if (r < 0) { 432 abort(); 433 } 434 } 435 436 /* get kvm's dirty pages bitmap and update qemu's */ 437 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, 438 unsigned long *bitmap) 439 { 440 ram_addr_t start = section->offset_within_region + 441 memory_region_get_ram_addr(section->mr); 442 ram_addr_t pages = int128_get64(section->size) / getpagesize(); 443 444 cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); 445 return 0; 446 } 447 448 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) 449 450 /** 451 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space 452 * This function updates qemu's dirty bitmap using 453 * memory_region_set_dirty(). This means all bits are set 454 * to dirty. 455 * 456 * @start_add: start of logged region. 457 * @end_addr: end of logged region. 458 */ 459 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, 460 MemoryRegionSection *section) 461 { 462 KVMState *s = kvm_state; 463 struct kvm_dirty_log d = {}; 464 KVMSlot *mem; 465 hwaddr start_addr, size; 466 467 size = kvm_align_section(section, &start_addr); 468 if (size) { 469 mem = kvm_lookup_matching_slot(kml, start_addr, size); 470 if (!mem) { 471 /* We don't have a slot if we want to trap every access. */ 472 return 0; 473 } 474 475 /* XXX bad kernel interface alert 476 * For dirty bitmap, kernel allocates array of size aligned to 477 * bits-per-long. But for case when the kernel is 64bits and 478 * the userspace is 32bits, userspace can't align to the same 479 * bits-per-long, since sizeof(long) is different between kernel 480 * and user space. This way, userspace will provide buffer which 481 * may be 4 bytes less than the kernel will use, resulting in 482 * userspace memory corruption (which is not detectable by valgrind 483 * too, in most cases). 484 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in 485 * a hope that sizeof(long) won't become >8 any time soon. 486 */ 487 size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), 488 /*HOST_LONG_BITS*/ 64) / 8; 489 d.dirty_bitmap = g_malloc0(size); 490 491 d.slot = mem->slot | (kml->as_id << 16); 492 if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { 493 DPRINTF("ioctl failed %d\n", errno); 494 g_free(d.dirty_bitmap); 495 return -1; 496 } 497 498 kvm_get_dirty_pages_log_range(section, d.dirty_bitmap); 499 g_free(d.dirty_bitmap); 500 } 501 502 return 0; 503 } 504 505 static void kvm_coalesce_mmio_region(MemoryListener *listener, 506 MemoryRegionSection *secion, 507 hwaddr start, hwaddr size) 508 { 509 KVMState *s = kvm_state; 510 511 if (s->coalesced_mmio) { 512 struct kvm_coalesced_mmio_zone zone; 513 514 zone.addr = start; 515 zone.size = size; 516 zone.pad = 0; 517 518 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 519 } 520 } 521 522 static void kvm_uncoalesce_mmio_region(MemoryListener *listener, 523 MemoryRegionSection *secion, 524 hwaddr start, hwaddr size) 525 { 526 KVMState *s = kvm_state; 527 528 if (s->coalesced_mmio) { 529 struct kvm_coalesced_mmio_zone zone; 530 531 zone.addr = start; 532 zone.size = size; 533 zone.pad = 0; 534 535 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 536 } 537 } 538 539 int kvm_check_extension(KVMState *s, unsigned int extension) 540 { 541 int ret; 542 543 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); 544 if (ret < 0) { 545 ret = 0; 546 } 547 548 return ret; 549 } 550 551 int kvm_vm_check_extension(KVMState *s, unsigned int extension) 552 { 553 int ret; 554 555 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); 556 if (ret < 0) { 557 /* VM wide version not implemented, use global one instead */ 558 ret = kvm_check_extension(s, extension); 559 } 560 561 return ret; 562 } 563 564 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) 565 { 566 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) 567 /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN 568 * endianness, but the memory core hands them in target endianness. 569 * For example, PPC is always treated as big-endian even if running 570 * on KVM and on PPC64LE. Correct here. 571 */ 572 switch (size) { 573 case 2: 574 val = bswap16(val); 575 break; 576 case 4: 577 val = bswap32(val); 578 break; 579 } 580 #endif 581 return val; 582 } 583 584 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 585 bool assign, uint32_t size, bool datamatch) 586 { 587 int ret; 588 struct kvm_ioeventfd iofd = { 589 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 590 .addr = addr, 591 .len = size, 592 .flags = 0, 593 .fd = fd, 594 }; 595 596 if (!kvm_enabled()) { 597 return -ENOSYS; 598 } 599 600 if (datamatch) { 601 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 602 } 603 if (!assign) { 604 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 605 } 606 607 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 608 609 if (ret < 0) { 610 return -errno; 611 } 612 613 return 0; 614 } 615 616 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 617 bool assign, uint32_t size, bool datamatch) 618 { 619 struct kvm_ioeventfd kick = { 620 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 621 .addr = addr, 622 .flags = KVM_IOEVENTFD_FLAG_PIO, 623 .len = size, 624 .fd = fd, 625 }; 626 int r; 627 if (!kvm_enabled()) { 628 return -ENOSYS; 629 } 630 if (datamatch) { 631 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 632 } 633 if (!assign) { 634 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 635 } 636 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 637 if (r < 0) { 638 return r; 639 } 640 return 0; 641 } 642 643 644 static int kvm_check_many_ioeventfds(void) 645 { 646 /* Userspace can use ioeventfd for io notification. This requires a host 647 * that supports eventfd(2) and an I/O thread; since eventfd does not 648 * support SIGIO it cannot interrupt the vcpu. 649 * 650 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 651 * can avoid creating too many ioeventfds. 652 */ 653 #if defined(CONFIG_EVENTFD) 654 int ioeventfds[7]; 655 int i, ret = 0; 656 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 657 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 658 if (ioeventfds[i] < 0) { 659 break; 660 } 661 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 662 if (ret < 0) { 663 close(ioeventfds[i]); 664 break; 665 } 666 } 667 668 /* Decide whether many devices are supported or not */ 669 ret = i == ARRAY_SIZE(ioeventfds); 670 671 while (i-- > 0) { 672 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 673 close(ioeventfds[i]); 674 } 675 return ret; 676 #else 677 return 0; 678 #endif 679 } 680 681 static const KVMCapabilityInfo * 682 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 683 { 684 while (list->name) { 685 if (!kvm_check_extension(s, list->value)) { 686 return list; 687 } 688 list++; 689 } 690 return NULL; 691 } 692 693 static void kvm_set_phys_mem(KVMMemoryListener *kml, 694 MemoryRegionSection *section, bool add) 695 { 696 KVMSlot *mem; 697 int err; 698 MemoryRegion *mr = section->mr; 699 bool writeable = !mr->readonly && !mr->rom_device; 700 hwaddr start_addr, size; 701 void *ram; 702 703 if (!memory_region_is_ram(mr)) { 704 if (writeable || !kvm_readonly_mem_allowed) { 705 return; 706 } else if (!mr->romd_mode) { 707 /* If the memory device is not in romd_mode, then we actually want 708 * to remove the kvm memory slot so all accesses will trap. */ 709 add = false; 710 } 711 } 712 713 size = kvm_align_section(section, &start_addr); 714 if (!size) { 715 return; 716 } 717 718 /* use aligned delta to align the ram address */ 719 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 720 (start_addr - section->offset_within_address_space); 721 722 if (!add) { 723 mem = kvm_lookup_matching_slot(kml, start_addr, size); 724 if (!mem) { 725 return; 726 } 727 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 728 kvm_physical_sync_dirty_bitmap(kml, section); 729 } 730 731 /* unregister the slot */ 732 mem->memory_size = 0; 733 err = kvm_set_user_memory_region(kml, mem); 734 if (err) { 735 fprintf(stderr, "%s: error unregistering slot: %s\n", 736 __func__, strerror(-err)); 737 abort(); 738 } 739 return; 740 } 741 742 /* register the new slot */ 743 mem = kvm_alloc_slot(kml); 744 mem->memory_size = size; 745 mem->start_addr = start_addr; 746 mem->ram = ram; 747 mem->flags = kvm_mem_flags(mr); 748 749 err = kvm_set_user_memory_region(kml, mem); 750 if (err) { 751 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 752 strerror(-err)); 753 abort(); 754 } 755 } 756 757 static void kvm_region_add(MemoryListener *listener, 758 MemoryRegionSection *section) 759 { 760 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 761 762 memory_region_ref(section->mr); 763 kvm_set_phys_mem(kml, section, true); 764 } 765 766 static void kvm_region_del(MemoryListener *listener, 767 MemoryRegionSection *section) 768 { 769 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 770 771 kvm_set_phys_mem(kml, section, false); 772 memory_region_unref(section->mr); 773 } 774 775 static void kvm_log_sync(MemoryListener *listener, 776 MemoryRegionSection *section) 777 { 778 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 779 int r; 780 781 r = kvm_physical_sync_dirty_bitmap(kml, section); 782 if (r < 0) { 783 abort(); 784 } 785 } 786 787 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 788 MemoryRegionSection *section, 789 bool match_data, uint64_t data, 790 EventNotifier *e) 791 { 792 int fd = event_notifier_get_fd(e); 793 int r; 794 795 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 796 data, true, int128_get64(section->size), 797 match_data); 798 if (r < 0) { 799 fprintf(stderr, "%s: error adding ioeventfd: %s\n", 800 __func__, strerror(-r)); 801 abort(); 802 } 803 } 804 805 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 806 MemoryRegionSection *section, 807 bool match_data, uint64_t data, 808 EventNotifier *e) 809 { 810 int fd = event_notifier_get_fd(e); 811 int r; 812 813 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 814 data, false, int128_get64(section->size), 815 match_data); 816 if (r < 0) { 817 abort(); 818 } 819 } 820 821 static void kvm_io_ioeventfd_add(MemoryListener *listener, 822 MemoryRegionSection *section, 823 bool match_data, uint64_t data, 824 EventNotifier *e) 825 { 826 int fd = event_notifier_get_fd(e); 827 int r; 828 829 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 830 data, true, int128_get64(section->size), 831 match_data); 832 if (r < 0) { 833 fprintf(stderr, "%s: error adding ioeventfd: %s\n", 834 __func__, strerror(-r)); 835 abort(); 836 } 837 } 838 839 static void kvm_io_ioeventfd_del(MemoryListener *listener, 840 MemoryRegionSection *section, 841 bool match_data, uint64_t data, 842 EventNotifier *e) 843 844 { 845 int fd = event_notifier_get_fd(e); 846 int r; 847 848 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 849 data, false, int128_get64(section->size), 850 match_data); 851 if (r < 0) { 852 abort(); 853 } 854 } 855 856 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 857 AddressSpace *as, int as_id) 858 { 859 int i; 860 861 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 862 kml->as_id = as_id; 863 864 for (i = 0; i < s->nr_slots; i++) { 865 kml->slots[i].slot = i; 866 } 867 868 kml->listener.region_add = kvm_region_add; 869 kml->listener.region_del = kvm_region_del; 870 kml->listener.log_start = kvm_log_start; 871 kml->listener.log_stop = kvm_log_stop; 872 kml->listener.log_sync = kvm_log_sync; 873 kml->listener.priority = 10; 874 875 memory_listener_register(&kml->listener, as); 876 } 877 878 static MemoryListener kvm_io_listener = { 879 .eventfd_add = kvm_io_ioeventfd_add, 880 .eventfd_del = kvm_io_ioeventfd_del, 881 .priority = 10, 882 }; 883 884 int kvm_set_irq(KVMState *s, int irq, int level) 885 { 886 struct kvm_irq_level event; 887 int ret; 888 889 assert(kvm_async_interrupts_enabled()); 890 891 event.level = level; 892 event.irq = irq; 893 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 894 if (ret < 0) { 895 perror("kvm_set_irq"); 896 abort(); 897 } 898 899 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status; 900 } 901 902 #ifdef KVM_CAP_IRQ_ROUTING 903 typedef struct KVMMSIRoute { 904 struct kvm_irq_routing_entry kroute; 905 QTAILQ_ENTRY(KVMMSIRoute) entry; 906 } KVMMSIRoute; 907 908 static void set_gsi(KVMState *s, unsigned int gsi) 909 { 910 set_bit(gsi, s->used_gsi_bitmap); 911 } 912 913 static void clear_gsi(KVMState *s, unsigned int gsi) 914 { 915 clear_bit(gsi, s->used_gsi_bitmap); 916 } 917 918 void kvm_init_irq_routing(KVMState *s) 919 { 920 int gsi_count, i; 921 922 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 923 if (gsi_count > 0) { 924 /* Round up so we can search ints using ffs */ 925 s->used_gsi_bitmap = bitmap_new(gsi_count); 926 s->gsi_count = gsi_count; 927 } 928 929 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 930 s->nr_allocated_irq_routes = 0; 931 932 if (!kvm_direct_msi_allowed) { 933 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 934 QTAILQ_INIT(&s->msi_hashtab[i]); 935 } 936 } 937 938 kvm_arch_init_irq_routing(s); 939 } 940 941 void kvm_irqchip_commit_routes(KVMState *s) 942 { 943 int ret; 944 945 if (kvm_gsi_direct_mapping()) { 946 return; 947 } 948 949 if (!kvm_gsi_routing_enabled()) { 950 return; 951 } 952 953 s->irq_routes->flags = 0; 954 trace_kvm_irqchip_commit_routes(); 955 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 956 assert(ret == 0); 957 } 958 959 static void kvm_add_routing_entry(KVMState *s, 960 struct kvm_irq_routing_entry *entry) 961 { 962 struct kvm_irq_routing_entry *new; 963 int n, size; 964 965 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 966 n = s->nr_allocated_irq_routes * 2; 967 if (n < 64) { 968 n = 64; 969 } 970 size = sizeof(struct kvm_irq_routing); 971 size += n * sizeof(*new); 972 s->irq_routes = g_realloc(s->irq_routes, size); 973 s->nr_allocated_irq_routes = n; 974 } 975 n = s->irq_routes->nr++; 976 new = &s->irq_routes->entries[n]; 977 978 *new = *entry; 979 980 set_gsi(s, entry->gsi); 981 } 982 983 static int kvm_update_routing_entry(KVMState *s, 984 struct kvm_irq_routing_entry *new_entry) 985 { 986 struct kvm_irq_routing_entry *entry; 987 int n; 988 989 for (n = 0; n < s->irq_routes->nr; n++) { 990 entry = &s->irq_routes->entries[n]; 991 if (entry->gsi != new_entry->gsi) { 992 continue; 993 } 994 995 if(!memcmp(entry, new_entry, sizeof *entry)) { 996 return 0; 997 } 998 999 *entry = *new_entry; 1000 1001 return 0; 1002 } 1003 1004 return -ESRCH; 1005 } 1006 1007 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1008 { 1009 struct kvm_irq_routing_entry e = {}; 1010 1011 assert(pin < s->gsi_count); 1012 1013 e.gsi = irq; 1014 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1015 e.flags = 0; 1016 e.u.irqchip.irqchip = irqchip; 1017 e.u.irqchip.pin = pin; 1018 kvm_add_routing_entry(s, &e); 1019 } 1020 1021 void kvm_irqchip_release_virq(KVMState *s, int virq) 1022 { 1023 struct kvm_irq_routing_entry *e; 1024 int i; 1025 1026 if (kvm_gsi_direct_mapping()) { 1027 return; 1028 } 1029 1030 for (i = 0; i < s->irq_routes->nr; i++) { 1031 e = &s->irq_routes->entries[i]; 1032 if (e->gsi == virq) { 1033 s->irq_routes->nr--; 1034 *e = s->irq_routes->entries[s->irq_routes->nr]; 1035 } 1036 } 1037 clear_gsi(s, virq); 1038 kvm_arch_release_virq_post(virq); 1039 trace_kvm_irqchip_release_virq(virq); 1040 } 1041 1042 static unsigned int kvm_hash_msi(uint32_t data) 1043 { 1044 /* This is optimized for IA32 MSI layout. However, no other arch shall 1045 * repeat the mistake of not providing a direct MSI injection API. */ 1046 return data & 0xff; 1047 } 1048 1049 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1050 { 1051 KVMMSIRoute *route, *next; 1052 unsigned int hash; 1053 1054 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1055 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1056 kvm_irqchip_release_virq(s, route->kroute.gsi); 1057 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1058 g_free(route); 1059 } 1060 } 1061 } 1062 1063 static int kvm_irqchip_get_virq(KVMState *s) 1064 { 1065 int next_virq; 1066 1067 /* 1068 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1069 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1070 * number can succeed even though a new route entry cannot be added. 1071 * When this happens, flush dynamic MSI entries to free IRQ route entries. 1072 */ 1073 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1074 kvm_flush_dynamic_msi_routes(s); 1075 } 1076 1077 /* Return the lowest unused GSI in the bitmap */ 1078 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1079 if (next_virq >= s->gsi_count) { 1080 return -ENOSPC; 1081 } else { 1082 return next_virq; 1083 } 1084 } 1085 1086 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1087 { 1088 unsigned int hash = kvm_hash_msi(msg.data); 1089 KVMMSIRoute *route; 1090 1091 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1092 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1093 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1094 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1095 return route; 1096 } 1097 } 1098 return NULL; 1099 } 1100 1101 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1102 { 1103 struct kvm_msi msi; 1104 KVMMSIRoute *route; 1105 1106 if (kvm_direct_msi_allowed) { 1107 msi.address_lo = (uint32_t)msg.address; 1108 msi.address_hi = msg.address >> 32; 1109 msi.data = le32_to_cpu(msg.data); 1110 msi.flags = 0; 1111 memset(msi.pad, 0, sizeof(msi.pad)); 1112 1113 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1114 } 1115 1116 route = kvm_lookup_msi_route(s, msg); 1117 if (!route) { 1118 int virq; 1119 1120 virq = kvm_irqchip_get_virq(s); 1121 if (virq < 0) { 1122 return virq; 1123 } 1124 1125 route = g_malloc0(sizeof(KVMMSIRoute)); 1126 route->kroute.gsi = virq; 1127 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1128 route->kroute.flags = 0; 1129 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1130 route->kroute.u.msi.address_hi = msg.address >> 32; 1131 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1132 1133 kvm_add_routing_entry(s, &route->kroute); 1134 kvm_irqchip_commit_routes(s); 1135 1136 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1137 entry); 1138 } 1139 1140 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1141 1142 return kvm_set_irq(s, route->kroute.gsi, 1); 1143 } 1144 1145 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1146 { 1147 struct kvm_irq_routing_entry kroute = {}; 1148 int virq; 1149 MSIMessage msg = {0, 0}; 1150 1151 if (pci_available && dev) { 1152 msg = pci_get_msi_message(dev, vector); 1153 } 1154 1155 if (kvm_gsi_direct_mapping()) { 1156 return kvm_arch_msi_data_to_gsi(msg.data); 1157 } 1158 1159 if (!kvm_gsi_routing_enabled()) { 1160 return -ENOSYS; 1161 } 1162 1163 virq = kvm_irqchip_get_virq(s); 1164 if (virq < 0) { 1165 return virq; 1166 } 1167 1168 kroute.gsi = virq; 1169 kroute.type = KVM_IRQ_ROUTING_MSI; 1170 kroute.flags = 0; 1171 kroute.u.msi.address_lo = (uint32_t)msg.address; 1172 kroute.u.msi.address_hi = msg.address >> 32; 1173 kroute.u.msi.data = le32_to_cpu(msg.data); 1174 if (pci_available && kvm_msi_devid_required()) { 1175 kroute.flags = KVM_MSI_VALID_DEVID; 1176 kroute.u.msi.devid = pci_requester_id(dev); 1177 } 1178 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1179 kvm_irqchip_release_virq(s, virq); 1180 return -EINVAL; 1181 } 1182 1183 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A", 1184 vector, virq); 1185 1186 kvm_add_routing_entry(s, &kroute); 1187 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1188 kvm_irqchip_commit_routes(s); 1189 1190 return virq; 1191 } 1192 1193 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1194 PCIDevice *dev) 1195 { 1196 struct kvm_irq_routing_entry kroute = {}; 1197 1198 if (kvm_gsi_direct_mapping()) { 1199 return 0; 1200 } 1201 1202 if (!kvm_irqchip_in_kernel()) { 1203 return -ENOSYS; 1204 } 1205 1206 kroute.gsi = virq; 1207 kroute.type = KVM_IRQ_ROUTING_MSI; 1208 kroute.flags = 0; 1209 kroute.u.msi.address_lo = (uint32_t)msg.address; 1210 kroute.u.msi.address_hi = msg.address >> 32; 1211 kroute.u.msi.data = le32_to_cpu(msg.data); 1212 if (pci_available && kvm_msi_devid_required()) { 1213 kroute.flags = KVM_MSI_VALID_DEVID; 1214 kroute.u.msi.devid = pci_requester_id(dev); 1215 } 1216 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1217 return -EINVAL; 1218 } 1219 1220 trace_kvm_irqchip_update_msi_route(virq); 1221 1222 return kvm_update_routing_entry(s, &kroute); 1223 } 1224 1225 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, 1226 bool assign) 1227 { 1228 struct kvm_irqfd irqfd = { 1229 .fd = fd, 1230 .gsi = virq, 1231 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN, 1232 }; 1233 1234 if (rfd != -1) { 1235 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1236 irqfd.resamplefd = rfd; 1237 } 1238 1239 if (!kvm_irqfds_enabled()) { 1240 return -ENOSYS; 1241 } 1242 1243 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1244 } 1245 1246 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1247 { 1248 struct kvm_irq_routing_entry kroute = {}; 1249 int virq; 1250 1251 if (!kvm_gsi_routing_enabled()) { 1252 return -ENOSYS; 1253 } 1254 1255 virq = kvm_irqchip_get_virq(s); 1256 if (virq < 0) { 1257 return virq; 1258 } 1259 1260 kroute.gsi = virq; 1261 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1262 kroute.flags = 0; 1263 kroute.u.adapter.summary_addr = adapter->summary_addr; 1264 kroute.u.adapter.ind_addr = adapter->ind_addr; 1265 kroute.u.adapter.summary_offset = adapter->summary_offset; 1266 kroute.u.adapter.ind_offset = adapter->ind_offset; 1267 kroute.u.adapter.adapter_id = adapter->adapter_id; 1268 1269 kvm_add_routing_entry(s, &kroute); 1270 1271 return virq; 1272 } 1273 1274 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1275 { 1276 struct kvm_irq_routing_entry kroute = {}; 1277 int virq; 1278 1279 if (!kvm_gsi_routing_enabled()) { 1280 return -ENOSYS; 1281 } 1282 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1283 return -ENOSYS; 1284 } 1285 virq = kvm_irqchip_get_virq(s); 1286 if (virq < 0) { 1287 return virq; 1288 } 1289 1290 kroute.gsi = virq; 1291 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1292 kroute.flags = 0; 1293 kroute.u.hv_sint.vcpu = vcpu; 1294 kroute.u.hv_sint.sint = sint; 1295 1296 kvm_add_routing_entry(s, &kroute); 1297 kvm_irqchip_commit_routes(s); 1298 1299 return virq; 1300 } 1301 1302 #else /* !KVM_CAP_IRQ_ROUTING */ 1303 1304 void kvm_init_irq_routing(KVMState *s) 1305 { 1306 } 1307 1308 void kvm_irqchip_release_virq(KVMState *s, int virq) 1309 { 1310 } 1311 1312 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1313 { 1314 abort(); 1315 } 1316 1317 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1318 { 1319 return -ENOSYS; 1320 } 1321 1322 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1323 { 1324 return -ENOSYS; 1325 } 1326 1327 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1328 { 1329 return -ENOSYS; 1330 } 1331 1332 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) 1333 { 1334 abort(); 1335 } 1336 1337 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1338 { 1339 return -ENOSYS; 1340 } 1341 #endif /* !KVM_CAP_IRQ_ROUTING */ 1342 1343 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1344 EventNotifier *rn, int virq) 1345 { 1346 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), 1347 rn ? event_notifier_get_fd(rn) : -1, virq, true); 1348 } 1349 1350 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1351 int virq) 1352 { 1353 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, 1354 false); 1355 } 1356 1357 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 1358 EventNotifier *rn, qemu_irq irq) 1359 { 1360 gpointer key, gsi; 1361 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1362 1363 if (!found) { 1364 return -ENXIO; 1365 } 1366 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 1367 } 1368 1369 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 1370 qemu_irq irq) 1371 { 1372 gpointer key, gsi; 1373 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1374 1375 if (!found) { 1376 return -ENXIO; 1377 } 1378 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 1379 } 1380 1381 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 1382 { 1383 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 1384 } 1385 1386 static void kvm_irqchip_create(MachineState *machine, KVMState *s) 1387 { 1388 int ret; 1389 1390 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 1391 ; 1392 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 1393 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 1394 if (ret < 0) { 1395 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 1396 exit(1); 1397 } 1398 } else { 1399 return; 1400 } 1401 1402 /* First probe and see if there's a arch-specific hook to create the 1403 * in-kernel irqchip for us */ 1404 ret = kvm_arch_irqchip_create(machine, s); 1405 if (ret == 0) { 1406 if (machine_kernel_irqchip_split(machine)) { 1407 perror("Split IRQ chip mode not supported."); 1408 exit(1); 1409 } else { 1410 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 1411 } 1412 } 1413 if (ret < 0) { 1414 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 1415 exit(1); 1416 } 1417 1418 kvm_kernel_irqchip = true; 1419 /* If we have an in-kernel IRQ chip then we must have asynchronous 1420 * interrupt delivery (though the reverse is not necessarily true) 1421 */ 1422 kvm_async_interrupts_allowed = true; 1423 kvm_halt_in_kernel_allowed = true; 1424 1425 kvm_init_irq_routing(s); 1426 1427 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 1428 } 1429 1430 /* Find number of supported CPUs using the recommended 1431 * procedure from the kernel API documentation to cope with 1432 * older kernels that may be missing capabilities. 1433 */ 1434 static int kvm_recommended_vcpus(KVMState *s) 1435 { 1436 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 1437 return (ret) ? ret : 4; 1438 } 1439 1440 static int kvm_max_vcpus(KVMState *s) 1441 { 1442 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 1443 return (ret) ? ret : kvm_recommended_vcpus(s); 1444 } 1445 1446 static int kvm_max_vcpu_id(KVMState *s) 1447 { 1448 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 1449 return (ret) ? ret : kvm_max_vcpus(s); 1450 } 1451 1452 bool kvm_vcpu_id_is_valid(int vcpu_id) 1453 { 1454 KVMState *s = KVM_STATE(current_machine->accelerator); 1455 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1456 } 1457 1458 static int kvm_init(MachineState *ms) 1459 { 1460 MachineClass *mc = MACHINE_GET_CLASS(ms); 1461 static const char upgrade_note[] = 1462 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1463 "(see http://sourceforge.net/projects/kvm).\n"; 1464 struct { 1465 const char *name; 1466 int num; 1467 } num_cpus[] = { 1468 { "SMP", smp_cpus }, 1469 { "hotpluggable", max_cpus }, 1470 { NULL, } 1471 }, *nc = num_cpus; 1472 int soft_vcpus_limit, hard_vcpus_limit; 1473 KVMState *s; 1474 const KVMCapabilityInfo *missing_cap; 1475 int ret; 1476 int type = 0; 1477 const char *kvm_type; 1478 1479 s = KVM_STATE(ms->accelerator); 1480 1481 /* 1482 * On systems where the kernel can support different base page 1483 * sizes, host page size may be different from TARGET_PAGE_SIZE, 1484 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 1485 * page size for the system though. 1486 */ 1487 assert(TARGET_PAGE_SIZE <= getpagesize()); 1488 1489 s->sigmask_len = 8; 1490 1491 #ifdef KVM_CAP_SET_GUEST_DEBUG 1492 QTAILQ_INIT(&s->kvm_sw_breakpoints); 1493 #endif 1494 QLIST_INIT(&s->kvm_parked_vcpus); 1495 s->vmfd = -1; 1496 s->fd = qemu_open("/dev/kvm", O_RDWR); 1497 if (s->fd == -1) { 1498 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 1499 ret = -errno; 1500 goto err; 1501 } 1502 1503 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 1504 if (ret < KVM_API_VERSION) { 1505 if (ret >= 0) { 1506 ret = -EINVAL; 1507 } 1508 fprintf(stderr, "kvm version too old\n"); 1509 goto err; 1510 } 1511 1512 if (ret > KVM_API_VERSION) { 1513 ret = -EINVAL; 1514 fprintf(stderr, "kvm version not supported\n"); 1515 goto err; 1516 } 1517 1518 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 1519 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 1520 1521 /* If unspecified, use the default value */ 1522 if (!s->nr_slots) { 1523 s->nr_slots = 32; 1524 } 1525 1526 kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type"); 1527 if (mc->kvm_type) { 1528 type = mc->kvm_type(kvm_type); 1529 } else if (kvm_type) { 1530 ret = -EINVAL; 1531 fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type); 1532 goto err; 1533 } 1534 1535 do { 1536 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 1537 } while (ret == -EINTR); 1538 1539 if (ret < 0) { 1540 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 1541 strerror(-ret)); 1542 1543 #ifdef TARGET_S390X 1544 if (ret == -EINVAL) { 1545 fprintf(stderr, 1546 "Host kernel setup problem detected. Please verify:\n"); 1547 fprintf(stderr, "- for kernels supporting the switch_amode or" 1548 " user_mode parameters, whether\n"); 1549 fprintf(stderr, 1550 " user space is running in primary address space\n"); 1551 fprintf(stderr, 1552 "- for kernels supporting the vm.allocate_pgste sysctl, " 1553 "whether it is enabled\n"); 1554 } 1555 #endif 1556 goto err; 1557 } 1558 1559 s->vmfd = ret; 1560 1561 /* check the vcpu limits */ 1562 soft_vcpus_limit = kvm_recommended_vcpus(s); 1563 hard_vcpus_limit = kvm_max_vcpus(s); 1564 1565 while (nc->name) { 1566 if (nc->num > soft_vcpus_limit) { 1567 warn_report("Number of %s cpus requested (%d) exceeds " 1568 "the recommended cpus supported by KVM (%d)", 1569 nc->name, nc->num, soft_vcpus_limit); 1570 1571 if (nc->num > hard_vcpus_limit) { 1572 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 1573 "the maximum cpus supported by KVM (%d)\n", 1574 nc->name, nc->num, hard_vcpus_limit); 1575 exit(1); 1576 } 1577 } 1578 nc++; 1579 } 1580 1581 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 1582 if (!missing_cap) { 1583 missing_cap = 1584 kvm_check_extension_list(s, kvm_arch_required_capabilities); 1585 } 1586 if (missing_cap) { 1587 ret = -EINVAL; 1588 fprintf(stderr, "kvm does not support %s\n%s", 1589 missing_cap->name, upgrade_note); 1590 goto err; 1591 } 1592 1593 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 1594 1595 #ifdef KVM_CAP_VCPU_EVENTS 1596 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 1597 #endif 1598 1599 s->robust_singlestep = 1600 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 1601 1602 #ifdef KVM_CAP_DEBUGREGS 1603 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 1604 #endif 1605 1606 #ifdef KVM_CAP_IRQ_ROUTING 1607 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 1608 #endif 1609 1610 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 1611 1612 s->irq_set_ioctl = KVM_IRQ_LINE; 1613 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 1614 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 1615 } 1616 1617 #ifdef KVM_CAP_READONLY_MEM 1618 kvm_readonly_mem_allowed = 1619 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 1620 #endif 1621 1622 kvm_eventfds_allowed = 1623 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 1624 1625 kvm_irqfds_allowed = 1626 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 1627 1628 kvm_resamplefds_allowed = 1629 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 1630 1631 kvm_vm_attributes_allowed = 1632 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 1633 1634 kvm_ioeventfd_any_length_allowed = 1635 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 1636 1637 kvm_state = s; 1638 1639 ret = kvm_arch_init(ms, s); 1640 if (ret < 0) { 1641 goto err; 1642 } 1643 1644 if (machine_kernel_irqchip_allowed(ms)) { 1645 kvm_irqchip_create(ms, s); 1646 } 1647 1648 if (kvm_eventfds_allowed) { 1649 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 1650 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 1651 } 1652 s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region; 1653 s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region; 1654 1655 kvm_memory_listener_register(s, &s->memory_listener, 1656 &address_space_memory, 0); 1657 memory_listener_register(&kvm_io_listener, 1658 &address_space_io); 1659 1660 s->many_ioeventfds = kvm_check_many_ioeventfds(); 1661 1662 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 1663 1664 return 0; 1665 1666 err: 1667 assert(ret < 0); 1668 if (s->vmfd >= 0) { 1669 close(s->vmfd); 1670 } 1671 if (s->fd != -1) { 1672 close(s->fd); 1673 } 1674 g_free(s->memory_listener.slots); 1675 1676 return ret; 1677 } 1678 1679 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 1680 { 1681 s->sigmask_len = sigmask_len; 1682 } 1683 1684 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 1685 int size, uint32_t count) 1686 { 1687 int i; 1688 uint8_t *ptr = data; 1689 1690 for (i = 0; i < count; i++) { 1691 address_space_rw(&address_space_io, port, attrs, 1692 ptr, size, 1693 direction == KVM_EXIT_IO_OUT); 1694 ptr += size; 1695 } 1696 } 1697 1698 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 1699 { 1700 fprintf(stderr, "KVM internal error. Suberror: %d\n", 1701 run->internal.suberror); 1702 1703 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 1704 int i; 1705 1706 for (i = 0; i < run->internal.ndata; ++i) { 1707 fprintf(stderr, "extra data[%d]: %"PRIx64"\n", 1708 i, (uint64_t)run->internal.data[i]); 1709 } 1710 } 1711 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 1712 fprintf(stderr, "emulation failure\n"); 1713 if (!kvm_arch_stop_on_emulation_error(cpu)) { 1714 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); 1715 return EXCP_INTERRUPT; 1716 } 1717 } 1718 /* FIXME: Should trigger a qmp message to let management know 1719 * something went wrong. 1720 */ 1721 return -1; 1722 } 1723 1724 void kvm_flush_coalesced_mmio_buffer(void) 1725 { 1726 KVMState *s = kvm_state; 1727 1728 if (s->coalesced_flush_in_progress) { 1729 return; 1730 } 1731 1732 s->coalesced_flush_in_progress = true; 1733 1734 if (s->coalesced_mmio_ring) { 1735 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 1736 while (ring->first != ring->last) { 1737 struct kvm_coalesced_mmio *ent; 1738 1739 ent = &ring->coalesced_mmio[ring->first]; 1740 1741 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 1742 smp_wmb(); 1743 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 1744 } 1745 } 1746 1747 s->coalesced_flush_in_progress = false; 1748 } 1749 1750 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 1751 { 1752 if (!cpu->vcpu_dirty) { 1753 kvm_arch_get_registers(cpu); 1754 cpu->vcpu_dirty = true; 1755 } 1756 } 1757 1758 void kvm_cpu_synchronize_state(CPUState *cpu) 1759 { 1760 if (!cpu->vcpu_dirty) { 1761 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 1762 } 1763 } 1764 1765 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 1766 { 1767 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 1768 cpu->vcpu_dirty = false; 1769 } 1770 1771 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 1772 { 1773 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 1774 } 1775 1776 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 1777 { 1778 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 1779 cpu->vcpu_dirty = false; 1780 } 1781 1782 void kvm_cpu_synchronize_post_init(CPUState *cpu) 1783 { 1784 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 1785 } 1786 1787 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 1788 { 1789 cpu->vcpu_dirty = true; 1790 } 1791 1792 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 1793 { 1794 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 1795 } 1796 1797 #ifdef KVM_HAVE_MCE_INJECTION 1798 static __thread void *pending_sigbus_addr; 1799 static __thread int pending_sigbus_code; 1800 static __thread bool have_sigbus_pending; 1801 #endif 1802 1803 static void kvm_cpu_kick(CPUState *cpu) 1804 { 1805 atomic_set(&cpu->kvm_run->immediate_exit, 1); 1806 } 1807 1808 static void kvm_cpu_kick_self(void) 1809 { 1810 if (kvm_immediate_exit) { 1811 kvm_cpu_kick(current_cpu); 1812 } else { 1813 qemu_cpu_kick_self(); 1814 } 1815 } 1816 1817 static void kvm_eat_signals(CPUState *cpu) 1818 { 1819 struct timespec ts = { 0, 0 }; 1820 siginfo_t siginfo; 1821 sigset_t waitset; 1822 sigset_t chkset; 1823 int r; 1824 1825 if (kvm_immediate_exit) { 1826 atomic_set(&cpu->kvm_run->immediate_exit, 0); 1827 /* Write kvm_run->immediate_exit before the cpu->exit_request 1828 * write in kvm_cpu_exec. 1829 */ 1830 smp_wmb(); 1831 return; 1832 } 1833 1834 sigemptyset(&waitset); 1835 sigaddset(&waitset, SIG_IPI); 1836 1837 do { 1838 r = sigtimedwait(&waitset, &siginfo, &ts); 1839 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 1840 perror("sigtimedwait"); 1841 exit(1); 1842 } 1843 1844 r = sigpending(&chkset); 1845 if (r == -1) { 1846 perror("sigpending"); 1847 exit(1); 1848 } 1849 } while (sigismember(&chkset, SIG_IPI)); 1850 } 1851 1852 int kvm_cpu_exec(CPUState *cpu) 1853 { 1854 struct kvm_run *run = cpu->kvm_run; 1855 int ret, run_ret; 1856 1857 DPRINTF("kvm_cpu_exec()\n"); 1858 1859 if (kvm_arch_process_async_events(cpu)) { 1860 atomic_set(&cpu->exit_request, 0); 1861 return EXCP_HLT; 1862 } 1863 1864 qemu_mutex_unlock_iothread(); 1865 cpu_exec_start(cpu); 1866 1867 do { 1868 MemTxAttrs attrs; 1869 1870 if (cpu->vcpu_dirty) { 1871 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 1872 cpu->vcpu_dirty = false; 1873 } 1874 1875 kvm_arch_pre_run(cpu, run); 1876 if (atomic_read(&cpu->exit_request)) { 1877 DPRINTF("interrupt exit requested\n"); 1878 /* 1879 * KVM requires us to reenter the kernel after IO exits to complete 1880 * instruction emulation. This self-signal will ensure that we 1881 * leave ASAP again. 1882 */ 1883 kvm_cpu_kick_self(); 1884 } 1885 1886 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 1887 * Matching barrier in kvm_eat_signals. 1888 */ 1889 smp_rmb(); 1890 1891 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 1892 1893 attrs = kvm_arch_post_run(cpu, run); 1894 1895 #ifdef KVM_HAVE_MCE_INJECTION 1896 if (unlikely(have_sigbus_pending)) { 1897 qemu_mutex_lock_iothread(); 1898 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 1899 pending_sigbus_addr); 1900 have_sigbus_pending = false; 1901 qemu_mutex_unlock_iothread(); 1902 } 1903 #endif 1904 1905 if (run_ret < 0) { 1906 if (run_ret == -EINTR || run_ret == -EAGAIN) { 1907 DPRINTF("io window exit\n"); 1908 kvm_eat_signals(cpu); 1909 ret = EXCP_INTERRUPT; 1910 break; 1911 } 1912 fprintf(stderr, "error: kvm run failed %s\n", 1913 strerror(-run_ret)); 1914 #ifdef TARGET_PPC 1915 if (run_ret == -EBUSY) { 1916 fprintf(stderr, 1917 "This is probably because your SMT is enabled.\n" 1918 "VCPU can only run on primary threads with all " 1919 "secondary threads offline.\n"); 1920 } 1921 #endif 1922 ret = -1; 1923 break; 1924 } 1925 1926 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 1927 switch (run->exit_reason) { 1928 case KVM_EXIT_IO: 1929 DPRINTF("handle_io\n"); 1930 /* Called outside BQL */ 1931 kvm_handle_io(run->io.port, attrs, 1932 (uint8_t *)run + run->io.data_offset, 1933 run->io.direction, 1934 run->io.size, 1935 run->io.count); 1936 ret = 0; 1937 break; 1938 case KVM_EXIT_MMIO: 1939 DPRINTF("handle_mmio\n"); 1940 /* Called outside BQL */ 1941 address_space_rw(&address_space_memory, 1942 run->mmio.phys_addr, attrs, 1943 run->mmio.data, 1944 run->mmio.len, 1945 run->mmio.is_write); 1946 ret = 0; 1947 break; 1948 case KVM_EXIT_IRQ_WINDOW_OPEN: 1949 DPRINTF("irq_window_open\n"); 1950 ret = EXCP_INTERRUPT; 1951 break; 1952 case KVM_EXIT_SHUTDOWN: 1953 DPRINTF("shutdown\n"); 1954 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 1955 ret = EXCP_INTERRUPT; 1956 break; 1957 case KVM_EXIT_UNKNOWN: 1958 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 1959 (uint64_t)run->hw.hardware_exit_reason); 1960 ret = -1; 1961 break; 1962 case KVM_EXIT_INTERNAL_ERROR: 1963 ret = kvm_handle_internal_error(cpu, run); 1964 break; 1965 case KVM_EXIT_SYSTEM_EVENT: 1966 switch (run->system_event.type) { 1967 case KVM_SYSTEM_EVENT_SHUTDOWN: 1968 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 1969 ret = EXCP_INTERRUPT; 1970 break; 1971 case KVM_SYSTEM_EVENT_RESET: 1972 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 1973 ret = EXCP_INTERRUPT; 1974 break; 1975 case KVM_SYSTEM_EVENT_CRASH: 1976 kvm_cpu_synchronize_state(cpu); 1977 qemu_mutex_lock_iothread(); 1978 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 1979 qemu_mutex_unlock_iothread(); 1980 ret = 0; 1981 break; 1982 default: 1983 DPRINTF("kvm_arch_handle_exit\n"); 1984 ret = kvm_arch_handle_exit(cpu, run); 1985 break; 1986 } 1987 break; 1988 default: 1989 DPRINTF("kvm_arch_handle_exit\n"); 1990 ret = kvm_arch_handle_exit(cpu, run); 1991 break; 1992 } 1993 } while (ret == 0); 1994 1995 cpu_exec_end(cpu); 1996 qemu_mutex_lock_iothread(); 1997 1998 if (ret < 0) { 1999 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); 2000 vm_stop(RUN_STATE_INTERNAL_ERROR); 2001 } 2002 2003 atomic_set(&cpu->exit_request, 0); 2004 return ret; 2005 } 2006 2007 int kvm_ioctl(KVMState *s, int type, ...) 2008 { 2009 int ret; 2010 void *arg; 2011 va_list ap; 2012 2013 va_start(ap, type); 2014 arg = va_arg(ap, void *); 2015 va_end(ap); 2016 2017 trace_kvm_ioctl(type, arg); 2018 ret = ioctl(s->fd, type, arg); 2019 if (ret == -1) { 2020 ret = -errno; 2021 } 2022 return ret; 2023 } 2024 2025 int kvm_vm_ioctl(KVMState *s, int type, ...) 2026 { 2027 int ret; 2028 void *arg; 2029 va_list ap; 2030 2031 va_start(ap, type); 2032 arg = va_arg(ap, void *); 2033 va_end(ap); 2034 2035 trace_kvm_vm_ioctl(type, arg); 2036 ret = ioctl(s->vmfd, type, arg); 2037 if (ret == -1) { 2038 ret = -errno; 2039 } 2040 return ret; 2041 } 2042 2043 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 2044 { 2045 int ret; 2046 void *arg; 2047 va_list ap; 2048 2049 va_start(ap, type); 2050 arg = va_arg(ap, void *); 2051 va_end(ap); 2052 2053 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 2054 ret = ioctl(cpu->kvm_fd, type, arg); 2055 if (ret == -1) { 2056 ret = -errno; 2057 } 2058 return ret; 2059 } 2060 2061 int kvm_device_ioctl(int fd, int type, ...) 2062 { 2063 int ret; 2064 void *arg; 2065 va_list ap; 2066 2067 va_start(ap, type); 2068 arg = va_arg(ap, void *); 2069 va_end(ap); 2070 2071 trace_kvm_device_ioctl(fd, type, arg); 2072 ret = ioctl(fd, type, arg); 2073 if (ret == -1) { 2074 ret = -errno; 2075 } 2076 return ret; 2077 } 2078 2079 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 2080 { 2081 int ret; 2082 struct kvm_device_attr attribute = { 2083 .group = group, 2084 .attr = attr, 2085 }; 2086 2087 if (!kvm_vm_attributes_allowed) { 2088 return 0; 2089 } 2090 2091 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 2092 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 2093 return ret ? 0 : 1; 2094 } 2095 2096 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 2097 { 2098 struct kvm_device_attr attribute = { 2099 .group = group, 2100 .attr = attr, 2101 .flags = 0, 2102 }; 2103 2104 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1; 2105 } 2106 2107 int kvm_device_access(int fd, int group, uint64_t attr, 2108 void *val, bool write, Error **errp) 2109 { 2110 struct kvm_device_attr kvmattr; 2111 int err; 2112 2113 kvmattr.flags = 0; 2114 kvmattr.group = group; 2115 kvmattr.attr = attr; 2116 kvmattr.addr = (uintptr_t)val; 2117 2118 err = kvm_device_ioctl(fd, 2119 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, 2120 &kvmattr); 2121 if (err < 0) { 2122 error_setg_errno(errp, -err, 2123 "KVM_%s_DEVICE_ATTR failed: Group %d " 2124 "attr 0x%016" PRIx64, 2125 write ? "SET" : "GET", group, attr); 2126 } 2127 return err; 2128 } 2129 2130 bool kvm_has_sync_mmu(void) 2131 { 2132 return kvm_state->sync_mmu; 2133 } 2134 2135 int kvm_has_vcpu_events(void) 2136 { 2137 return kvm_state->vcpu_events; 2138 } 2139 2140 int kvm_has_robust_singlestep(void) 2141 { 2142 return kvm_state->robust_singlestep; 2143 } 2144 2145 int kvm_has_debugregs(void) 2146 { 2147 return kvm_state->debugregs; 2148 } 2149 2150 int kvm_has_many_ioeventfds(void) 2151 { 2152 if (!kvm_enabled()) { 2153 return 0; 2154 } 2155 return kvm_state->many_ioeventfds; 2156 } 2157 2158 int kvm_has_gsi_routing(void) 2159 { 2160 #ifdef KVM_CAP_IRQ_ROUTING 2161 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING); 2162 #else 2163 return false; 2164 #endif 2165 } 2166 2167 int kvm_has_intx_set_mask(void) 2168 { 2169 return kvm_state->intx_set_mask; 2170 } 2171 2172 bool kvm_arm_supports_user_irq(void) 2173 { 2174 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); 2175 } 2176 2177 #ifdef KVM_CAP_SET_GUEST_DEBUG 2178 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, 2179 target_ulong pc) 2180 { 2181 struct kvm_sw_breakpoint *bp; 2182 2183 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) { 2184 if (bp->pc == pc) { 2185 return bp; 2186 } 2187 } 2188 return NULL; 2189 } 2190 2191 int kvm_sw_breakpoints_active(CPUState *cpu) 2192 { 2193 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints); 2194 } 2195 2196 struct kvm_set_guest_debug_data { 2197 struct kvm_guest_debug dbg; 2198 int err; 2199 }; 2200 2201 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data) 2202 { 2203 struct kvm_set_guest_debug_data *dbg_data = 2204 (struct kvm_set_guest_debug_data *) data.host_ptr; 2205 2206 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, 2207 &dbg_data->dbg); 2208 } 2209 2210 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 2211 { 2212 struct kvm_set_guest_debug_data data; 2213 2214 data.dbg.control = reinject_trap; 2215 2216 if (cpu->singlestep_enabled) { 2217 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; 2218 } 2219 kvm_arch_update_guest_debug(cpu, &data.dbg); 2220 2221 run_on_cpu(cpu, kvm_invoke_set_guest_debug, 2222 RUN_ON_CPU_HOST_PTR(&data)); 2223 return data.err; 2224 } 2225 2226 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 2227 target_ulong len, int type) 2228 { 2229 struct kvm_sw_breakpoint *bp; 2230 int err; 2231 2232 if (type == GDB_BREAKPOINT_SW) { 2233 bp = kvm_find_sw_breakpoint(cpu, addr); 2234 if (bp) { 2235 bp->use_count++; 2236 return 0; 2237 } 2238 2239 bp = g_malloc(sizeof(struct kvm_sw_breakpoint)); 2240 bp->pc = addr; 2241 bp->use_count = 1; 2242 err = kvm_arch_insert_sw_breakpoint(cpu, bp); 2243 if (err) { 2244 g_free(bp); 2245 return err; 2246 } 2247 2248 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 2249 } else { 2250 err = kvm_arch_insert_hw_breakpoint(addr, len, type); 2251 if (err) { 2252 return err; 2253 } 2254 } 2255 2256 CPU_FOREACH(cpu) { 2257 err = kvm_update_guest_debug(cpu, 0); 2258 if (err) { 2259 return err; 2260 } 2261 } 2262 return 0; 2263 } 2264 2265 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, 2266 target_ulong len, int type) 2267 { 2268 struct kvm_sw_breakpoint *bp; 2269 int err; 2270 2271 if (type == GDB_BREAKPOINT_SW) { 2272 bp = kvm_find_sw_breakpoint(cpu, addr); 2273 if (!bp) { 2274 return -ENOENT; 2275 } 2276 2277 if (bp->use_count > 1) { 2278 bp->use_count--; 2279 return 0; 2280 } 2281 2282 err = kvm_arch_remove_sw_breakpoint(cpu, bp); 2283 if (err) { 2284 return err; 2285 } 2286 2287 QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 2288 g_free(bp); 2289 } else { 2290 err = kvm_arch_remove_hw_breakpoint(addr, len, type); 2291 if (err) { 2292 return err; 2293 } 2294 } 2295 2296 CPU_FOREACH(cpu) { 2297 err = kvm_update_guest_debug(cpu, 0); 2298 if (err) { 2299 return err; 2300 } 2301 } 2302 return 0; 2303 } 2304 2305 void kvm_remove_all_breakpoints(CPUState *cpu) 2306 { 2307 struct kvm_sw_breakpoint *bp, *next; 2308 KVMState *s = cpu->kvm_state; 2309 CPUState *tmpcpu; 2310 2311 QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) { 2312 if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) { 2313 /* Try harder to find a CPU that currently sees the breakpoint. */ 2314 CPU_FOREACH(tmpcpu) { 2315 if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) { 2316 break; 2317 } 2318 } 2319 } 2320 QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry); 2321 g_free(bp); 2322 } 2323 kvm_arch_remove_all_hw_breakpoints(); 2324 2325 CPU_FOREACH(cpu) { 2326 kvm_update_guest_debug(cpu, 0); 2327 } 2328 } 2329 2330 #else /* !KVM_CAP_SET_GUEST_DEBUG */ 2331 2332 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 2333 { 2334 return -EINVAL; 2335 } 2336 2337 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 2338 target_ulong len, int type) 2339 { 2340 return -EINVAL; 2341 } 2342 2343 int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr, 2344 target_ulong len, int type) 2345 { 2346 return -EINVAL; 2347 } 2348 2349 void kvm_remove_all_breakpoints(CPUState *cpu) 2350 { 2351 } 2352 #endif /* !KVM_CAP_SET_GUEST_DEBUG */ 2353 2354 static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset) 2355 { 2356 KVMState *s = kvm_state; 2357 struct kvm_signal_mask *sigmask; 2358 int r; 2359 2360 sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset)); 2361 2362 sigmask->len = s->sigmask_len; 2363 memcpy(sigmask->sigset, sigset, sizeof(*sigset)); 2364 r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask); 2365 g_free(sigmask); 2366 2367 return r; 2368 } 2369 2370 static void kvm_ipi_signal(int sig) 2371 { 2372 if (current_cpu) { 2373 assert(kvm_immediate_exit); 2374 kvm_cpu_kick(current_cpu); 2375 } 2376 } 2377 2378 void kvm_init_cpu_signals(CPUState *cpu) 2379 { 2380 int r; 2381 sigset_t set; 2382 struct sigaction sigact; 2383 2384 memset(&sigact, 0, sizeof(sigact)); 2385 sigact.sa_handler = kvm_ipi_signal; 2386 sigaction(SIG_IPI, &sigact, NULL); 2387 2388 pthread_sigmask(SIG_BLOCK, NULL, &set); 2389 #if defined KVM_HAVE_MCE_INJECTION 2390 sigdelset(&set, SIGBUS); 2391 pthread_sigmask(SIG_SETMASK, &set, NULL); 2392 #endif 2393 sigdelset(&set, SIG_IPI); 2394 if (kvm_immediate_exit) { 2395 r = pthread_sigmask(SIG_SETMASK, &set, NULL); 2396 } else { 2397 r = kvm_set_signal_mask(cpu, &set); 2398 } 2399 if (r) { 2400 fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r)); 2401 exit(1); 2402 } 2403 } 2404 2405 /* Called asynchronously in VCPU thread. */ 2406 int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr) 2407 { 2408 #ifdef KVM_HAVE_MCE_INJECTION 2409 if (have_sigbus_pending) { 2410 return 1; 2411 } 2412 have_sigbus_pending = true; 2413 pending_sigbus_addr = addr; 2414 pending_sigbus_code = code; 2415 atomic_set(&cpu->exit_request, 1); 2416 return 0; 2417 #else 2418 return 1; 2419 #endif 2420 } 2421 2422 /* Called synchronously (via signalfd) in main thread. */ 2423 int kvm_on_sigbus(int code, void *addr) 2424 { 2425 #ifdef KVM_HAVE_MCE_INJECTION 2426 /* Action required MCE kills the process if SIGBUS is blocked. Because 2427 * that's what happens in the I/O thread, where we handle MCE via signalfd, 2428 * we can only get action optional here. 2429 */ 2430 assert(code != BUS_MCEERR_AR); 2431 kvm_arch_on_sigbus_vcpu(first_cpu, code, addr); 2432 return 0; 2433 #else 2434 return 1; 2435 #endif 2436 } 2437 2438 int kvm_create_device(KVMState *s, uint64_t type, bool test) 2439 { 2440 int ret; 2441 struct kvm_create_device create_dev; 2442 2443 create_dev.type = type; 2444 create_dev.fd = -1; 2445 create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0; 2446 2447 if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) { 2448 return -ENOTSUP; 2449 } 2450 2451 ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev); 2452 if (ret) { 2453 return ret; 2454 } 2455 2456 return test ? 0 : create_dev.fd; 2457 } 2458 2459 bool kvm_device_supported(int vmfd, uint64_t type) 2460 { 2461 struct kvm_create_device create_dev = { 2462 .type = type, 2463 .fd = -1, 2464 .flags = KVM_CREATE_DEVICE_TEST, 2465 }; 2466 2467 if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) { 2468 return false; 2469 } 2470 2471 return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0); 2472 } 2473 2474 int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source) 2475 { 2476 struct kvm_one_reg reg; 2477 int r; 2478 2479 reg.id = id; 2480 reg.addr = (uintptr_t) source; 2481 r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, ®); 2482 if (r) { 2483 trace_kvm_failed_reg_set(id, strerror(-r)); 2484 } 2485 return r; 2486 } 2487 2488 int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target) 2489 { 2490 struct kvm_one_reg reg; 2491 int r; 2492 2493 reg.id = id; 2494 reg.addr = (uintptr_t) target; 2495 r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, ®); 2496 if (r) { 2497 trace_kvm_failed_reg_get(id, strerror(-r)); 2498 } 2499 return r; 2500 } 2501 2502 static void kvm_accel_class_init(ObjectClass *oc, void *data) 2503 { 2504 AccelClass *ac = ACCEL_CLASS(oc); 2505 ac->name = "KVM"; 2506 ac->init_machine = kvm_init; 2507 ac->allowed = &kvm_allowed; 2508 } 2509 2510 static const TypeInfo kvm_accel_type = { 2511 .name = TYPE_KVM_ACCEL, 2512 .parent = TYPE_ACCEL, 2513 .class_init = kvm_accel_class_init, 2514 .instance_size = sizeof(KVMState), 2515 }; 2516 2517 static void kvm_type_init(void) 2518 { 2519 type_register_static(&kvm_accel_type); 2520 } 2521 2522 type_init(kvm_type_init); 2523