1 /* 2 * QEMU KVM support 3 * 4 * Copyright IBM, Corp. 2008 5 * Red Hat, Inc. 2008 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Glauber Costa <gcosta@redhat.com> 10 * 11 * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 * See the COPYING file in the top-level directory. 13 * 14 */ 15 16 #include "qemu/osdep.h" 17 #include <sys/ioctl.h> 18 19 #include <linux/kvm.h> 20 21 #include "qemu-common.h" 22 #include "qemu/atomic.h" 23 #include "qemu/option.h" 24 #include "qemu/config-file.h" 25 #include "qemu/error-report.h" 26 #include "qapi/error.h" 27 #include "hw/hw.h" 28 #include "hw/pci/msi.h" 29 #include "hw/pci/msix.h" 30 #include "hw/s390x/adapter.h" 31 #include "exec/gdbstub.h" 32 #include "sysemu/kvm_int.h" 33 #include "sysemu/cpus.h" 34 #include "qemu/bswap.h" 35 #include "exec/memory.h" 36 #include "exec/ram_addr.h" 37 #include "exec/address-spaces.h" 38 #include "qemu/event_notifier.h" 39 #include "trace.h" 40 #include "hw/irq.h" 41 42 #include "hw/boards.h" 43 44 /* This check must be after config-host.h is included */ 45 #ifdef CONFIG_EVENTFD 46 #include <sys/eventfd.h> 47 #endif 48 49 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We 50 * need to use the real host PAGE_SIZE, as that's what KVM will use. 51 */ 52 #define PAGE_SIZE getpagesize() 53 54 //#define DEBUG_KVM 55 56 #ifdef DEBUG_KVM 57 #define DPRINTF(fmt, ...) \ 58 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) 59 #else 60 #define DPRINTF(fmt, ...) \ 61 do { } while (0) 62 #endif 63 64 #define KVM_MSI_HASHTAB_SIZE 256 65 66 struct KVMParkedVcpu { 67 unsigned long vcpu_id; 68 int kvm_fd; 69 QLIST_ENTRY(KVMParkedVcpu) node; 70 }; 71 72 struct KVMState 73 { 74 AccelState parent_obj; 75 76 int nr_slots; 77 int fd; 78 int vmfd; 79 int coalesced_mmio; 80 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 81 bool coalesced_flush_in_progress; 82 int vcpu_events; 83 int robust_singlestep; 84 int debugregs; 85 #ifdef KVM_CAP_SET_GUEST_DEBUG 86 struct kvm_sw_breakpoint_head kvm_sw_breakpoints; 87 #endif 88 int many_ioeventfds; 89 int intx_set_mask; 90 /* The man page (and posix) say ioctl numbers are signed int, but 91 * they're not. 
Linux, glibc and *BSD all treat ioctl numbers as 92 * unsigned, and treating them as signed here can break things */ 93 unsigned irq_set_ioctl; 94 unsigned int sigmask_len; 95 GHashTable *gsimap; 96 #ifdef KVM_CAP_IRQ_ROUTING 97 struct kvm_irq_routing *irq_routes; 98 int nr_allocated_irq_routes; 99 unsigned long *used_gsi_bitmap; 100 unsigned int gsi_count; 101 QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; 102 #endif 103 KVMMemoryListener memory_listener; 104 QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; 105 }; 106 107 KVMState *kvm_state; 108 bool kvm_kernel_irqchip; 109 bool kvm_split_irqchip; 110 bool kvm_async_interrupts_allowed; 111 bool kvm_halt_in_kernel_allowed; 112 bool kvm_eventfds_allowed; 113 bool kvm_irqfds_allowed; 114 bool kvm_resamplefds_allowed; 115 bool kvm_msi_via_irqfd_allowed; 116 bool kvm_gsi_routing_allowed; 117 bool kvm_gsi_direct_mapping; 118 bool kvm_allowed; 119 bool kvm_readonly_mem_allowed; 120 bool kvm_vm_attributes_allowed; 121 bool kvm_direct_msi_allowed; 122 bool kvm_ioeventfd_any_length_allowed; 123 bool kvm_msi_use_devid; 124 static bool kvm_immediate_exit; 125 126 static const KVMCapabilityInfo kvm_required_capabilites[] = { 127 KVM_CAP_INFO(USER_MEMORY), 128 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), 129 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), 130 KVM_CAP_LAST_INFO 131 }; 132 133 int kvm_get_max_memslots(void) 134 { 135 KVMState *s = KVM_STATE(current_machine->accelerator); 136 137 return s->nr_slots; 138 } 139 140 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 141 { 142 KVMState *s = kvm_state; 143 int i; 144 145 for (i = 0; i < s->nr_slots; i++) { 146 if (kml->slots[i].memory_size == 0) { 147 return &kml->slots[i]; 148 } 149 } 150 151 return NULL; 152 } 153 154 bool kvm_has_free_slot(MachineState *ms) 155 { 156 KVMState *s = KVM_STATE(ms->accelerator); 157 158 return kvm_get_free_slot(&s->memory_listener); 159 } 160 161 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 162 { 163 KVMSlot *slot = kvm_get_free_slot(kml); 164 165 if (slot) { 166 return slot; 167 } 168 169 fprintf(stderr, "%s: no free slot available\n", __func__); 170 abort(); 171 } 172 173 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 174 hwaddr start_addr, 175 hwaddr size) 176 { 177 KVMState *s = kvm_state; 178 int i; 179 180 for (i = 0; i < s->nr_slots; i++) { 181 KVMSlot *mem = &kml->slots[i]; 182 183 if (start_addr == mem->start_addr && size == mem->memory_size) { 184 return mem; 185 } 186 } 187 188 return NULL; 189 } 190 191 /* 192 * Calculate and align the start address and the size of the section. 193 * Return the size. If the size is 0, the aligned section is empty. 194 */ 195 static hwaddr kvm_align_section(MemoryRegionSection *section, 196 hwaddr *start) 197 { 198 hwaddr size = int128_get64(section->size); 199 hwaddr delta; 200 201 *start = section->offset_within_address_space; 202 203 /* kvm works in page size chunks, but the function may be called 204 with sub-page size and unaligned start address. Pad the start 205 address to next and truncate size to previous page boundary. 
*/ 206 delta = qemu_real_host_page_size - (*start & ~qemu_real_host_page_mask); 207 delta &= ~qemu_real_host_page_mask; 208 *start += delta; 209 if (delta > size) { 210 return 0; 211 } 212 size -= delta; 213 size &= qemu_real_host_page_mask; 214 if (*start & ~qemu_real_host_page_mask) { 215 return 0; 216 } 217 218 return size; 219 } 220 221 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 222 hwaddr *phys_addr) 223 { 224 KVMMemoryListener *kml = &s->memory_listener; 225 int i; 226 227 for (i = 0; i < s->nr_slots; i++) { 228 KVMSlot *mem = &kml->slots[i]; 229 230 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 231 *phys_addr = mem->start_addr + (ram - mem->ram); 232 return 1; 233 } 234 } 235 236 return 0; 237 } 238 239 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot) 240 { 241 KVMState *s = kvm_state; 242 struct kvm_userspace_memory_region mem; 243 244 mem.slot = slot->slot | (kml->as_id << 16); 245 mem.guest_phys_addr = slot->start_addr; 246 mem.userspace_addr = (unsigned long)slot->ram; 247 mem.flags = slot->flags; 248 249 if (slot->memory_size && mem.flags & KVM_MEM_READONLY) { 250 /* Set the slot size to 0 before setting the slot to the desired 251 * value. This is needed based on KVM commit 75d61fbc. */ 252 mem.memory_size = 0; 253 kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 254 } 255 mem.memory_size = slot->memory_size; 256 return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 257 } 258 259 int kvm_destroy_vcpu(CPUState *cpu) 260 { 261 KVMState *s = kvm_state; 262 long mmap_size; 263 struct KVMParkedVcpu *vcpu = NULL; 264 int ret = 0; 265 266 DPRINTF("kvm_destroy_vcpu\n"); 267 268 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 269 if (mmap_size < 0) { 270 ret = mmap_size; 271 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 272 goto err; 273 } 274 275 ret = munmap(cpu->kvm_run, mmap_size); 276 if (ret < 0) { 277 goto err; 278 } 279 280 vcpu = g_malloc0(sizeof(*vcpu)); 281 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 282 vcpu->kvm_fd = cpu->kvm_fd; 283 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 284 err: 285 return ret; 286 } 287 288 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 289 { 290 struct KVMParkedVcpu *cpu; 291 292 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 293 if (cpu->vcpu_id == vcpu_id) { 294 int kvm_fd; 295 296 QLIST_REMOVE(cpu, node); 297 kvm_fd = cpu->kvm_fd; 298 g_free(cpu); 299 return kvm_fd; 300 } 301 } 302 303 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 304 } 305 306 int kvm_init_vcpu(CPUState *cpu) 307 { 308 KVMState *s = kvm_state; 309 long mmap_size; 310 int ret; 311 312 DPRINTF("kvm_init_vcpu\n"); 313 314 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 315 if (ret < 0) { 316 DPRINTF("kvm_create_vcpu failed\n"); 317 goto err; 318 } 319 320 cpu->kvm_fd = ret; 321 cpu->kvm_state = s; 322 cpu->vcpu_dirty = true; 323 324 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 325 if (mmap_size < 0) { 326 ret = mmap_size; 327 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 328 goto err; 329 } 330 331 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 332 cpu->kvm_fd, 0); 333 if (cpu->kvm_run == MAP_FAILED) { 334 ret = -errno; 335 DPRINTF("mmap'ing vcpu state failed\n"); 336 goto err; 337 } 338 339 if (s->coalesced_mmio && !s->coalesced_mmio_ring) { 340 s->coalesced_mmio_ring = 341 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; 342 } 343 344 ret = kvm_arch_init_vcpu(cpu); 345 err: 346 return ret; 347 } 348 349 /* 350 * dirty pages logging 
control 351 */ 352 353 static int kvm_mem_flags(MemoryRegion *mr) 354 { 355 bool readonly = mr->readonly || memory_region_is_romd(mr); 356 int flags = 0; 357 358 if (memory_region_get_dirty_log_mask(mr) != 0) { 359 flags |= KVM_MEM_LOG_DIRTY_PAGES; 360 } 361 if (readonly && kvm_readonly_mem_allowed) { 362 flags |= KVM_MEM_READONLY; 363 } 364 return flags; 365 } 366 367 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, 368 MemoryRegion *mr) 369 { 370 int old_flags; 371 372 old_flags = mem->flags; 373 mem->flags = kvm_mem_flags(mr); 374 375 /* If nothing changed effectively, no need to issue ioctl */ 376 if (mem->flags == old_flags) { 377 return 0; 378 } 379 380 return kvm_set_user_memory_region(kml, mem); 381 } 382 383 static int kvm_section_update_flags(KVMMemoryListener *kml, 384 MemoryRegionSection *section) 385 { 386 hwaddr start_addr, size; 387 KVMSlot *mem; 388 389 size = kvm_align_section(section, &start_addr); 390 if (!size) { 391 return 0; 392 } 393 394 mem = kvm_lookup_matching_slot(kml, start_addr, size); 395 if (!mem) { 396 fprintf(stderr, "%s: error finding slot\n", __func__); 397 abort(); 398 } 399 400 return kvm_slot_update_flags(kml, mem, section->mr); 401 } 402 403 static void kvm_log_start(MemoryListener *listener, 404 MemoryRegionSection *section, 405 int old, int new) 406 { 407 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 408 int r; 409 410 if (old != 0) { 411 return; 412 } 413 414 r = kvm_section_update_flags(kml, section); 415 if (r < 0) { 416 abort(); 417 } 418 } 419 420 static void kvm_log_stop(MemoryListener *listener, 421 MemoryRegionSection *section, 422 int old, int new) 423 { 424 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 425 int r; 426 427 if (new != 0) { 428 return; 429 } 430 431 r = kvm_section_update_flags(kml, section); 432 if (r < 0) { 433 abort(); 434 } 435 } 436 437 /* get kvm's dirty pages bitmap and update qemu's */ 438 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, 439 unsigned long *bitmap) 440 { 441 ram_addr_t start = section->offset_within_region + 442 memory_region_get_ram_addr(section->mr); 443 ram_addr_t pages = int128_get64(section->size) / getpagesize(); 444 445 cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); 446 return 0; 447 } 448 449 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) 450 451 /** 452 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space 453 * This function updates qemu's dirty bitmap using 454 * memory_region_set_dirty(). This means all bits are set 455 * to dirty. 456 * 457 * @start_add: start of logged region. 458 * @end_addr: end of logged region. 459 */ 460 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, 461 MemoryRegionSection *section) 462 { 463 KVMState *s = kvm_state; 464 struct kvm_dirty_log d = {}; 465 KVMSlot *mem; 466 hwaddr start_addr, size; 467 468 size = kvm_align_section(section, &start_addr); 469 if (size) { 470 mem = kvm_lookup_matching_slot(kml, start_addr, size); 471 if (!mem) { 472 fprintf(stderr, "%s: error finding slot\n", __func__); 473 abort(); 474 } 475 476 /* XXX bad kernel interface alert 477 * For dirty bitmap, kernel allocates array of size aligned to 478 * bits-per-long. But for case when the kernel is 64bits and 479 * the userspace is 32bits, userspace can't align to the same 480 * bits-per-long, since sizeof(long) is different between kernel 481 * and user space. 
This way, userspace will provide buffer which 482 * may be 4 bytes less than the kernel will use, resulting in 483 * userspace memory corruption (which is not detectable by valgrind 484 * too, in most cases). 485 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in 486 * a hope that sizeof(long) won't become >8 any time soon. 487 */ 488 size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), 489 /*HOST_LONG_BITS*/ 64) / 8; 490 d.dirty_bitmap = g_malloc0(size); 491 492 d.slot = mem->slot | (kml->as_id << 16); 493 if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { 494 DPRINTF("ioctl failed %d\n", errno); 495 g_free(d.dirty_bitmap); 496 return -1; 497 } 498 499 kvm_get_dirty_pages_log_range(section, d.dirty_bitmap); 500 g_free(d.dirty_bitmap); 501 } 502 503 return 0; 504 } 505 506 static void kvm_coalesce_mmio_region(MemoryListener *listener, 507 MemoryRegionSection *secion, 508 hwaddr start, hwaddr size) 509 { 510 KVMState *s = kvm_state; 511 512 if (s->coalesced_mmio) { 513 struct kvm_coalesced_mmio_zone zone; 514 515 zone.addr = start; 516 zone.size = size; 517 zone.pad = 0; 518 519 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 520 } 521 } 522 523 static void kvm_uncoalesce_mmio_region(MemoryListener *listener, 524 MemoryRegionSection *secion, 525 hwaddr start, hwaddr size) 526 { 527 KVMState *s = kvm_state; 528 529 if (s->coalesced_mmio) { 530 struct kvm_coalesced_mmio_zone zone; 531 532 zone.addr = start; 533 zone.size = size; 534 zone.pad = 0; 535 536 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 537 } 538 } 539 540 int kvm_check_extension(KVMState *s, unsigned int extension) 541 { 542 int ret; 543 544 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); 545 if (ret < 0) { 546 ret = 0; 547 } 548 549 return ret; 550 } 551 552 int kvm_vm_check_extension(KVMState *s, unsigned int extension) 553 { 554 int ret; 555 556 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); 557 if (ret < 0) { 558 /* VM wide version not implemented, use global one instead */ 559 ret = kvm_check_extension(s, extension); 560 } 561 562 return ret; 563 } 564 565 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) 566 { 567 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) 568 /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN 569 * endianness, but the memory core hands them in target endianness. 570 * For example, PPC is always treated as big-endian even if running 571 * on KVM and on PPC64LE. Correct here. 572 */ 573 switch (size) { 574 case 2: 575 val = bswap16(val); 576 break; 577 case 4: 578 val = bswap32(val); 579 break; 580 } 581 #endif 582 return val; 583 } 584 585 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 586 bool assign, uint32_t size, bool datamatch) 587 { 588 int ret; 589 struct kvm_ioeventfd iofd = { 590 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 591 .addr = addr, 592 .len = size, 593 .flags = 0, 594 .fd = fd, 595 }; 596 597 if (!kvm_enabled()) { 598 return -ENOSYS; 599 } 600 601 if (datamatch) { 602 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 603 } 604 if (!assign) { 605 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 606 } 607 608 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 609 610 if (ret < 0) { 611 return -errno; 612 } 613 614 return 0; 615 } 616 617 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 618 bool assign, uint32_t size, bool datamatch) 619 { 620 struct kvm_ioeventfd kick = { 621 .datamatch = datamatch ? 
adjust_ioeventfd_endianness(val, size) : 0, 622 .addr = addr, 623 .flags = KVM_IOEVENTFD_FLAG_PIO, 624 .len = size, 625 .fd = fd, 626 }; 627 int r; 628 if (!kvm_enabled()) { 629 return -ENOSYS; 630 } 631 if (datamatch) { 632 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 633 } 634 if (!assign) { 635 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 636 } 637 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 638 if (r < 0) { 639 return r; 640 } 641 return 0; 642 } 643 644 645 static int kvm_check_many_ioeventfds(void) 646 { 647 /* Userspace can use ioeventfd for io notification. This requires a host 648 * that supports eventfd(2) and an I/O thread; since eventfd does not 649 * support SIGIO it cannot interrupt the vcpu. 650 * 651 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 652 * can avoid creating too many ioeventfds. 653 */ 654 #if defined(CONFIG_EVENTFD) 655 int ioeventfds[7]; 656 int i, ret = 0; 657 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 658 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 659 if (ioeventfds[i] < 0) { 660 break; 661 } 662 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 663 if (ret < 0) { 664 close(ioeventfds[i]); 665 break; 666 } 667 } 668 669 /* Decide whether many devices are supported or not */ 670 ret = i == ARRAY_SIZE(ioeventfds); 671 672 while (i-- > 0) { 673 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 674 close(ioeventfds[i]); 675 } 676 return ret; 677 #else 678 return 0; 679 #endif 680 } 681 682 static const KVMCapabilityInfo * 683 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 684 { 685 while (list->name) { 686 if (!kvm_check_extension(s, list->value)) { 687 return list; 688 } 689 list++; 690 } 691 return NULL; 692 } 693 694 static void kvm_set_phys_mem(KVMMemoryListener *kml, 695 MemoryRegionSection *section, bool add) 696 { 697 KVMSlot *mem; 698 int err; 699 MemoryRegion *mr = section->mr; 700 bool writeable = !mr->readonly && !mr->rom_device; 701 hwaddr start_addr, size; 702 void *ram; 703 704 if (!memory_region_is_ram(mr)) { 705 if (writeable || !kvm_readonly_mem_allowed) { 706 return; 707 } else if (!mr->romd_mode) { 708 /* If the memory device is not in romd_mode, then we actually want 709 * to remove the kvm memory slot so all accesses will trap. 
*/ 710 add = false; 711 } 712 } 713 714 size = kvm_align_section(section, &start_addr); 715 if (!size) { 716 return; 717 } 718 719 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 720 (section->offset_within_address_space - start_addr); 721 722 mem = kvm_lookup_matching_slot(kml, start_addr, size); 723 if (!add) { 724 if (!mem) { 725 return; 726 } 727 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 728 kvm_physical_sync_dirty_bitmap(kml, section); 729 } 730 731 /* unregister the slot */ 732 mem->memory_size = 0; 733 err = kvm_set_user_memory_region(kml, mem); 734 if (err) { 735 fprintf(stderr, "%s: error unregistering overlapping slot: %s\n", 736 __func__, strerror(-err)); 737 abort(); 738 } 739 return; 740 } 741 742 if (mem) { 743 /* update the slot */ 744 kvm_slot_update_flags(kml, mem, mr); 745 return; 746 } 747 748 /* register the new slot */ 749 mem = kvm_alloc_slot(kml); 750 mem->memory_size = size; 751 mem->start_addr = start_addr; 752 mem->ram = ram; 753 mem->flags = kvm_mem_flags(mr); 754 755 err = kvm_set_user_memory_region(kml, mem); 756 if (err) { 757 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 758 strerror(-err)); 759 abort(); 760 } 761 } 762 763 static void kvm_region_add(MemoryListener *listener, 764 MemoryRegionSection *section) 765 { 766 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 767 768 memory_region_ref(section->mr); 769 kvm_set_phys_mem(kml, section, true); 770 } 771 772 static void kvm_region_del(MemoryListener *listener, 773 MemoryRegionSection *section) 774 { 775 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 776 777 kvm_set_phys_mem(kml, section, false); 778 memory_region_unref(section->mr); 779 } 780 781 static void kvm_log_sync(MemoryListener *listener, 782 MemoryRegionSection *section) 783 { 784 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 785 int r; 786 787 r = kvm_physical_sync_dirty_bitmap(kml, section); 788 if (r < 0) { 789 abort(); 790 } 791 } 792 793 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 794 MemoryRegionSection *section, 795 bool match_data, uint64_t data, 796 EventNotifier *e) 797 { 798 int fd = event_notifier_get_fd(e); 799 int r; 800 801 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 802 data, true, int128_get64(section->size), 803 match_data); 804 if (r < 0) { 805 fprintf(stderr, "%s: error adding ioeventfd: %s\n", 806 __func__, strerror(-r)); 807 abort(); 808 } 809 } 810 811 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 812 MemoryRegionSection *section, 813 bool match_data, uint64_t data, 814 EventNotifier *e) 815 { 816 int fd = event_notifier_get_fd(e); 817 int r; 818 819 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 820 data, false, int128_get64(section->size), 821 match_data); 822 if (r < 0) { 823 abort(); 824 } 825 } 826 827 static void kvm_io_ioeventfd_add(MemoryListener *listener, 828 MemoryRegionSection *section, 829 bool match_data, uint64_t data, 830 EventNotifier *e) 831 { 832 int fd = event_notifier_get_fd(e); 833 int r; 834 835 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 836 data, true, int128_get64(section->size), 837 match_data); 838 if (r < 0) { 839 fprintf(stderr, "%s: error adding ioeventfd: %s\n", 840 __func__, strerror(-r)); 841 abort(); 842 } 843 } 844 845 static void kvm_io_ioeventfd_del(MemoryListener *listener, 846 MemoryRegionSection *section, 847 bool match_data, uint64_t data, 848 
EventNotifier *e) 849 850 { 851 int fd = event_notifier_get_fd(e); 852 int r; 853 854 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 855 data, false, int128_get64(section->size), 856 match_data); 857 if (r < 0) { 858 abort(); 859 } 860 } 861 862 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 863 AddressSpace *as, int as_id) 864 { 865 int i; 866 867 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 868 kml->as_id = as_id; 869 870 for (i = 0; i < s->nr_slots; i++) { 871 kml->slots[i].slot = i; 872 } 873 874 kml->listener.region_add = kvm_region_add; 875 kml->listener.region_del = kvm_region_del; 876 kml->listener.log_start = kvm_log_start; 877 kml->listener.log_stop = kvm_log_stop; 878 kml->listener.log_sync = kvm_log_sync; 879 kml->listener.priority = 10; 880 881 memory_listener_register(&kml->listener, as); 882 } 883 884 static MemoryListener kvm_io_listener = { 885 .eventfd_add = kvm_io_ioeventfd_add, 886 .eventfd_del = kvm_io_ioeventfd_del, 887 .priority = 10, 888 }; 889 890 int kvm_set_irq(KVMState *s, int irq, int level) 891 { 892 struct kvm_irq_level event; 893 int ret; 894 895 assert(kvm_async_interrupts_enabled()); 896 897 event.level = level; 898 event.irq = irq; 899 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 900 if (ret < 0) { 901 perror("kvm_set_irq"); 902 abort(); 903 } 904 905 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status; 906 } 907 908 #ifdef KVM_CAP_IRQ_ROUTING 909 typedef struct KVMMSIRoute { 910 struct kvm_irq_routing_entry kroute; 911 QTAILQ_ENTRY(KVMMSIRoute) entry; 912 } KVMMSIRoute; 913 914 static void set_gsi(KVMState *s, unsigned int gsi) 915 { 916 set_bit(gsi, s->used_gsi_bitmap); 917 } 918 919 static void clear_gsi(KVMState *s, unsigned int gsi) 920 { 921 clear_bit(gsi, s->used_gsi_bitmap); 922 } 923 924 void kvm_init_irq_routing(KVMState *s) 925 { 926 int gsi_count, i; 927 928 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 929 if (gsi_count > 0) { 930 /* Round up so we can search ints using ffs */ 931 s->used_gsi_bitmap = bitmap_new(gsi_count); 932 s->gsi_count = gsi_count; 933 } 934 935 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 936 s->nr_allocated_irq_routes = 0; 937 938 if (!kvm_direct_msi_allowed) { 939 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 940 QTAILQ_INIT(&s->msi_hashtab[i]); 941 } 942 } 943 944 kvm_arch_init_irq_routing(s); 945 } 946 947 void kvm_irqchip_commit_routes(KVMState *s) 948 { 949 int ret; 950 951 if (kvm_gsi_direct_mapping()) { 952 return; 953 } 954 955 if (!kvm_gsi_routing_enabled()) { 956 return; 957 } 958 959 s->irq_routes->flags = 0; 960 trace_kvm_irqchip_commit_routes(); 961 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 962 assert(ret == 0); 963 } 964 965 static void kvm_add_routing_entry(KVMState *s, 966 struct kvm_irq_routing_entry *entry) 967 { 968 struct kvm_irq_routing_entry *new; 969 int n, size; 970 971 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 972 n = s->nr_allocated_irq_routes * 2; 973 if (n < 64) { 974 n = 64; 975 } 976 size = sizeof(struct kvm_irq_routing); 977 size += n * sizeof(*new); 978 s->irq_routes = g_realloc(s->irq_routes, size); 979 s->nr_allocated_irq_routes = n; 980 } 981 n = s->irq_routes->nr++; 982 new = &s->irq_routes->entries[n]; 983 984 *new = *entry; 985 986 set_gsi(s, entry->gsi); 987 } 988 989 static int kvm_update_routing_entry(KVMState *s, 990 struct kvm_irq_routing_entry *new_entry) 991 { 992 struct kvm_irq_routing_entry *entry; 993 int n; 994 995 for (n = 0; n < s->irq_routes->nr; n++) 
{ 996 entry = &s->irq_routes->entries[n]; 997 if (entry->gsi != new_entry->gsi) { 998 continue; 999 } 1000 1001 if(!memcmp(entry, new_entry, sizeof *entry)) { 1002 return 0; 1003 } 1004 1005 *entry = *new_entry; 1006 1007 return 0; 1008 } 1009 1010 return -ESRCH; 1011 } 1012 1013 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1014 { 1015 struct kvm_irq_routing_entry e = {}; 1016 1017 assert(pin < s->gsi_count); 1018 1019 e.gsi = irq; 1020 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1021 e.flags = 0; 1022 e.u.irqchip.irqchip = irqchip; 1023 e.u.irqchip.pin = pin; 1024 kvm_add_routing_entry(s, &e); 1025 } 1026 1027 void kvm_irqchip_release_virq(KVMState *s, int virq) 1028 { 1029 struct kvm_irq_routing_entry *e; 1030 int i; 1031 1032 if (kvm_gsi_direct_mapping()) { 1033 return; 1034 } 1035 1036 for (i = 0; i < s->irq_routes->nr; i++) { 1037 e = &s->irq_routes->entries[i]; 1038 if (e->gsi == virq) { 1039 s->irq_routes->nr--; 1040 *e = s->irq_routes->entries[s->irq_routes->nr]; 1041 } 1042 } 1043 clear_gsi(s, virq); 1044 kvm_arch_release_virq_post(virq); 1045 trace_kvm_irqchip_release_virq(virq); 1046 } 1047 1048 static unsigned int kvm_hash_msi(uint32_t data) 1049 { 1050 /* This is optimized for IA32 MSI layout. However, no other arch shall 1051 * repeat the mistake of not providing a direct MSI injection API. */ 1052 return data & 0xff; 1053 } 1054 1055 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1056 { 1057 KVMMSIRoute *route, *next; 1058 unsigned int hash; 1059 1060 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1061 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1062 kvm_irqchip_release_virq(s, route->kroute.gsi); 1063 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1064 g_free(route); 1065 } 1066 } 1067 } 1068 1069 static int kvm_irqchip_get_virq(KVMState *s) 1070 { 1071 int next_virq; 1072 1073 /* 1074 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1075 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1076 * number can succeed even though a new route entry cannot be added. 1077 * When this happens, flush dynamic MSI entries to free IRQ route entries. 
1078 */ 1079 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1080 kvm_flush_dynamic_msi_routes(s); 1081 } 1082 1083 /* Return the lowest unused GSI in the bitmap */ 1084 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1085 if (next_virq >= s->gsi_count) { 1086 return -ENOSPC; 1087 } else { 1088 return next_virq; 1089 } 1090 } 1091 1092 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1093 { 1094 unsigned int hash = kvm_hash_msi(msg.data); 1095 KVMMSIRoute *route; 1096 1097 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1098 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1099 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1100 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1101 return route; 1102 } 1103 } 1104 return NULL; 1105 } 1106 1107 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1108 { 1109 struct kvm_msi msi; 1110 KVMMSIRoute *route; 1111 1112 if (kvm_direct_msi_allowed) { 1113 msi.address_lo = (uint32_t)msg.address; 1114 msi.address_hi = msg.address >> 32; 1115 msi.data = le32_to_cpu(msg.data); 1116 msi.flags = 0; 1117 memset(msi.pad, 0, sizeof(msi.pad)); 1118 1119 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1120 } 1121 1122 route = kvm_lookup_msi_route(s, msg); 1123 if (!route) { 1124 int virq; 1125 1126 virq = kvm_irqchip_get_virq(s); 1127 if (virq < 0) { 1128 return virq; 1129 } 1130 1131 route = g_malloc0(sizeof(KVMMSIRoute)); 1132 route->kroute.gsi = virq; 1133 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1134 route->kroute.flags = 0; 1135 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1136 route->kroute.u.msi.address_hi = msg.address >> 32; 1137 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1138 1139 kvm_add_routing_entry(s, &route->kroute); 1140 kvm_irqchip_commit_routes(s); 1141 1142 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1143 entry); 1144 } 1145 1146 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1147 1148 return kvm_set_irq(s, route->kroute.gsi, 1); 1149 } 1150 1151 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1152 { 1153 struct kvm_irq_routing_entry kroute = {}; 1154 int virq; 1155 MSIMessage msg = {0, 0}; 1156 1157 if (pci_available && dev) { 1158 msg = pci_get_msi_message(dev, vector); 1159 } 1160 1161 if (kvm_gsi_direct_mapping()) { 1162 return kvm_arch_msi_data_to_gsi(msg.data); 1163 } 1164 1165 if (!kvm_gsi_routing_enabled()) { 1166 return -ENOSYS; 1167 } 1168 1169 virq = kvm_irqchip_get_virq(s); 1170 if (virq < 0) { 1171 return virq; 1172 } 1173 1174 kroute.gsi = virq; 1175 kroute.type = KVM_IRQ_ROUTING_MSI; 1176 kroute.flags = 0; 1177 kroute.u.msi.address_lo = (uint32_t)msg.address; 1178 kroute.u.msi.address_hi = msg.address >> 32; 1179 kroute.u.msi.data = le32_to_cpu(msg.data); 1180 if (pci_available && kvm_msi_devid_required()) { 1181 kroute.flags = KVM_MSI_VALID_DEVID; 1182 kroute.u.msi.devid = pci_requester_id(dev); 1183 } 1184 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1185 kvm_irqchip_release_virq(s, virq); 1186 return -EINVAL; 1187 } 1188 1189 trace_kvm_irqchip_add_msi_route(dev ? 
dev->name : (char *)"N/A", 1190 vector, virq); 1191 1192 kvm_add_routing_entry(s, &kroute); 1193 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1194 kvm_irqchip_commit_routes(s); 1195 1196 return virq; 1197 } 1198 1199 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1200 PCIDevice *dev) 1201 { 1202 struct kvm_irq_routing_entry kroute = {}; 1203 1204 if (kvm_gsi_direct_mapping()) { 1205 return 0; 1206 } 1207 1208 if (!kvm_irqchip_in_kernel()) { 1209 return -ENOSYS; 1210 } 1211 1212 kroute.gsi = virq; 1213 kroute.type = KVM_IRQ_ROUTING_MSI; 1214 kroute.flags = 0; 1215 kroute.u.msi.address_lo = (uint32_t)msg.address; 1216 kroute.u.msi.address_hi = msg.address >> 32; 1217 kroute.u.msi.data = le32_to_cpu(msg.data); 1218 if (pci_available && kvm_msi_devid_required()) { 1219 kroute.flags = KVM_MSI_VALID_DEVID; 1220 kroute.u.msi.devid = pci_requester_id(dev); 1221 } 1222 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1223 return -EINVAL; 1224 } 1225 1226 trace_kvm_irqchip_update_msi_route(virq); 1227 1228 return kvm_update_routing_entry(s, &kroute); 1229 } 1230 1231 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, 1232 bool assign) 1233 { 1234 struct kvm_irqfd irqfd = { 1235 .fd = fd, 1236 .gsi = virq, 1237 .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN, 1238 }; 1239 1240 if (rfd != -1) { 1241 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1242 irqfd.resamplefd = rfd; 1243 } 1244 1245 if (!kvm_irqfds_enabled()) { 1246 return -ENOSYS; 1247 } 1248 1249 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1250 } 1251 1252 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1253 { 1254 struct kvm_irq_routing_entry kroute = {}; 1255 int virq; 1256 1257 if (!kvm_gsi_routing_enabled()) { 1258 return -ENOSYS; 1259 } 1260 1261 virq = kvm_irqchip_get_virq(s); 1262 if (virq < 0) { 1263 return virq; 1264 } 1265 1266 kroute.gsi = virq; 1267 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1268 kroute.flags = 0; 1269 kroute.u.adapter.summary_addr = adapter->summary_addr; 1270 kroute.u.adapter.ind_addr = adapter->ind_addr; 1271 kroute.u.adapter.summary_offset = adapter->summary_offset; 1272 kroute.u.adapter.ind_offset = adapter->ind_offset; 1273 kroute.u.adapter.adapter_id = adapter->adapter_id; 1274 1275 kvm_add_routing_entry(s, &kroute); 1276 1277 return virq; 1278 } 1279 1280 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1281 { 1282 struct kvm_irq_routing_entry kroute = {}; 1283 int virq; 1284 1285 if (!kvm_gsi_routing_enabled()) { 1286 return -ENOSYS; 1287 } 1288 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1289 return -ENOSYS; 1290 } 1291 virq = kvm_irqchip_get_virq(s); 1292 if (virq < 0) { 1293 return virq; 1294 } 1295 1296 kroute.gsi = virq; 1297 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1298 kroute.flags = 0; 1299 kroute.u.hv_sint.vcpu = vcpu; 1300 kroute.u.hv_sint.sint = sint; 1301 1302 kvm_add_routing_entry(s, &kroute); 1303 kvm_irqchip_commit_routes(s); 1304 1305 return virq; 1306 } 1307 1308 #else /* !KVM_CAP_IRQ_ROUTING */ 1309 1310 void kvm_init_irq_routing(KVMState *s) 1311 { 1312 } 1313 1314 void kvm_irqchip_release_virq(KVMState *s, int virq) 1315 { 1316 } 1317 1318 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1319 { 1320 abort(); 1321 } 1322 1323 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1324 { 1325 return -ENOSYS; 1326 } 1327 1328 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1329 { 1330 return -ENOSYS; 1331 } 1332 1333 int 
kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1334 { 1335 return -ENOSYS; 1336 } 1337 1338 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) 1339 { 1340 abort(); 1341 } 1342 1343 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1344 { 1345 return -ENOSYS; 1346 } 1347 #endif /* !KVM_CAP_IRQ_ROUTING */ 1348 1349 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1350 EventNotifier *rn, int virq) 1351 { 1352 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), 1353 rn ? event_notifier_get_fd(rn) : -1, virq, true); 1354 } 1355 1356 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1357 int virq) 1358 { 1359 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, 1360 false); 1361 } 1362 1363 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 1364 EventNotifier *rn, qemu_irq irq) 1365 { 1366 gpointer key, gsi; 1367 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1368 1369 if (!found) { 1370 return -ENXIO; 1371 } 1372 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 1373 } 1374 1375 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 1376 qemu_irq irq) 1377 { 1378 gpointer key, gsi; 1379 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1380 1381 if (!found) { 1382 return -ENXIO; 1383 } 1384 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 1385 } 1386 1387 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 1388 { 1389 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 1390 } 1391 1392 static void kvm_irqchip_create(MachineState *machine, KVMState *s) 1393 { 1394 int ret; 1395 1396 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 1397 ; 1398 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 1399 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 1400 if (ret < 0) { 1401 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 1402 exit(1); 1403 } 1404 } else { 1405 return; 1406 } 1407 1408 /* First probe and see if there's a arch-specific hook to create the 1409 * in-kernel irqchip for us */ 1410 ret = kvm_arch_irqchip_create(machine, s); 1411 if (ret == 0) { 1412 if (machine_kernel_irqchip_split(machine)) { 1413 perror("Split IRQ chip mode not supported."); 1414 exit(1); 1415 } else { 1416 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 1417 } 1418 } 1419 if (ret < 0) { 1420 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 1421 exit(1); 1422 } 1423 1424 kvm_kernel_irqchip = true; 1425 /* If we have an in-kernel IRQ chip then we must have asynchronous 1426 * interrupt delivery (though the reverse is not necessarily true) 1427 */ 1428 kvm_async_interrupts_allowed = true; 1429 kvm_halt_in_kernel_allowed = true; 1430 1431 kvm_init_irq_routing(s); 1432 1433 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 1434 } 1435 1436 /* Find number of supported CPUs using the recommended 1437 * procedure from the kernel API documentation to cope with 1438 * older kernels that may be missing capabilities. 1439 */ 1440 static int kvm_recommended_vcpus(KVMState *s) 1441 { 1442 int ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS); 1443 return (ret) ? ret : 4; 1444 } 1445 1446 static int kvm_max_vcpus(KVMState *s) 1447 { 1448 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 1449 return (ret) ? 
ret : kvm_recommended_vcpus(s); 1450 } 1451 1452 static int kvm_max_vcpu_id(KVMState *s) 1453 { 1454 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 1455 return (ret) ? ret : kvm_max_vcpus(s); 1456 } 1457 1458 bool kvm_vcpu_id_is_valid(int vcpu_id) 1459 { 1460 KVMState *s = KVM_STATE(current_machine->accelerator); 1461 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1462 } 1463 1464 static int kvm_init(MachineState *ms) 1465 { 1466 MachineClass *mc = MACHINE_GET_CLASS(ms); 1467 static const char upgrade_note[] = 1468 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1469 "(see http://sourceforge.net/projects/kvm).\n"; 1470 struct { 1471 const char *name; 1472 int num; 1473 } num_cpus[] = { 1474 { "SMP", smp_cpus }, 1475 { "hotpluggable", max_cpus }, 1476 { NULL, } 1477 }, *nc = num_cpus; 1478 int soft_vcpus_limit, hard_vcpus_limit; 1479 KVMState *s; 1480 const KVMCapabilityInfo *missing_cap; 1481 int ret; 1482 int type = 0; 1483 const char *kvm_type; 1484 1485 s = KVM_STATE(ms->accelerator); 1486 1487 /* 1488 * On systems where the kernel can support different base page 1489 * sizes, host page size may be different from TARGET_PAGE_SIZE, 1490 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 1491 * page size for the system though. 1492 */ 1493 assert(TARGET_PAGE_SIZE <= getpagesize()); 1494 1495 s->sigmask_len = 8; 1496 1497 #ifdef KVM_CAP_SET_GUEST_DEBUG 1498 QTAILQ_INIT(&s->kvm_sw_breakpoints); 1499 #endif 1500 QLIST_INIT(&s->kvm_parked_vcpus); 1501 s->vmfd = -1; 1502 s->fd = qemu_open("/dev/kvm", O_RDWR); 1503 if (s->fd == -1) { 1504 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 1505 ret = -errno; 1506 goto err; 1507 } 1508 1509 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 1510 if (ret < KVM_API_VERSION) { 1511 if (ret >= 0) { 1512 ret = -EINVAL; 1513 } 1514 fprintf(stderr, "kvm version too old\n"); 1515 goto err; 1516 } 1517 1518 if (ret > KVM_API_VERSION) { 1519 ret = -EINVAL; 1520 fprintf(stderr, "kvm version not supported\n"); 1521 goto err; 1522 } 1523 1524 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 1525 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 1526 1527 /* If unspecified, use the default value */ 1528 if (!s->nr_slots) { 1529 s->nr_slots = 32; 1530 } 1531 1532 /* check the vcpu limits */ 1533 soft_vcpus_limit = kvm_recommended_vcpus(s); 1534 hard_vcpus_limit = kvm_max_vcpus(s); 1535 1536 while (nc->name) { 1537 if (nc->num > soft_vcpus_limit) { 1538 warn_report("Number of %s cpus requested (%d) exceeds " 1539 "the recommended cpus supported by KVM (%d)", 1540 nc->name, nc->num, soft_vcpus_limit); 1541 1542 if (nc->num > hard_vcpus_limit) { 1543 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 1544 "the maximum cpus supported by KVM (%d)\n", 1545 nc->name, nc->num, hard_vcpus_limit); 1546 exit(1); 1547 } 1548 } 1549 nc++; 1550 } 1551 1552 kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type"); 1553 if (mc->kvm_type) { 1554 type = mc->kvm_type(kvm_type); 1555 } else if (kvm_type) { 1556 ret = -EINVAL; 1557 fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type); 1558 goto err; 1559 } 1560 1561 do { 1562 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 1563 } while (ret == -EINTR); 1564 1565 if (ret < 0) { 1566 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 1567 strerror(-ret)); 1568 1569 #ifdef TARGET_S390X 1570 if (ret == -EINVAL) { 1571 fprintf(stderr, 1572 "Host kernel setup problem detected. 
Please verify:\n"); 1573 fprintf(stderr, "- for kernels supporting the switch_amode or" 1574 " user_mode parameters, whether\n"); 1575 fprintf(stderr, 1576 " user space is running in primary address space\n"); 1577 fprintf(stderr, 1578 "- for kernels supporting the vm.allocate_pgste sysctl, " 1579 "whether it is enabled\n"); 1580 } 1581 #endif 1582 goto err; 1583 } 1584 1585 s->vmfd = ret; 1586 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 1587 if (!missing_cap) { 1588 missing_cap = 1589 kvm_check_extension_list(s, kvm_arch_required_capabilities); 1590 } 1591 if (missing_cap) { 1592 ret = -EINVAL; 1593 fprintf(stderr, "kvm does not support %s\n%s", 1594 missing_cap->name, upgrade_note); 1595 goto err; 1596 } 1597 1598 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 1599 1600 #ifdef KVM_CAP_VCPU_EVENTS 1601 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 1602 #endif 1603 1604 s->robust_singlestep = 1605 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 1606 1607 #ifdef KVM_CAP_DEBUGREGS 1608 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 1609 #endif 1610 1611 #ifdef KVM_CAP_IRQ_ROUTING 1612 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 1613 #endif 1614 1615 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 1616 1617 s->irq_set_ioctl = KVM_IRQ_LINE; 1618 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 1619 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 1620 } 1621 1622 #ifdef KVM_CAP_READONLY_MEM 1623 kvm_readonly_mem_allowed = 1624 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 1625 #endif 1626 1627 kvm_eventfds_allowed = 1628 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 1629 1630 kvm_irqfds_allowed = 1631 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 1632 1633 kvm_resamplefds_allowed = 1634 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 1635 1636 kvm_vm_attributes_allowed = 1637 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 1638 1639 kvm_ioeventfd_any_length_allowed = 1640 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 1641 1642 kvm_state = s; 1643 1644 ret = kvm_arch_init(ms, s); 1645 if (ret < 0) { 1646 goto err; 1647 } 1648 1649 if (machine_kernel_irqchip_allowed(ms)) { 1650 kvm_irqchip_create(ms, s); 1651 } 1652 1653 if (kvm_eventfds_allowed) { 1654 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 1655 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 1656 } 1657 s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region; 1658 s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region; 1659 1660 kvm_memory_listener_register(s, &s->memory_listener, 1661 &address_space_memory, 0); 1662 memory_listener_register(&kvm_io_listener, 1663 &address_space_io); 1664 1665 s->many_ioeventfds = kvm_check_many_ioeventfds(); 1666 1667 return 0; 1668 1669 err: 1670 assert(ret < 0); 1671 if (s->vmfd >= 0) { 1672 close(s->vmfd); 1673 } 1674 if (s->fd != -1) { 1675 close(s->fd); 1676 } 1677 g_free(s->memory_listener.slots); 1678 1679 return ret; 1680 } 1681 1682 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 1683 { 1684 s->sigmask_len = sigmask_len; 1685 } 1686 1687 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 1688 int size, uint32_t count) 1689 { 1690 int i; 1691 uint8_t *ptr = data; 1692 1693 for (i = 0; i < count; i++) { 1694 address_space_rw(&address_space_io, port, attrs, 1695 ptr, size, 1696 direction == KVM_EXIT_IO_OUT); 1697 ptr += 
size; 1698 } 1699 } 1700 1701 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 1702 { 1703 fprintf(stderr, "KVM internal error. Suberror: %d\n", 1704 run->internal.suberror); 1705 1706 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 1707 int i; 1708 1709 for (i = 0; i < run->internal.ndata; ++i) { 1710 fprintf(stderr, "extra data[%d]: %"PRIx64"\n", 1711 i, (uint64_t)run->internal.data[i]); 1712 } 1713 } 1714 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 1715 fprintf(stderr, "emulation failure\n"); 1716 if (!kvm_arch_stop_on_emulation_error(cpu)) { 1717 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); 1718 return EXCP_INTERRUPT; 1719 } 1720 } 1721 /* FIXME: Should trigger a qmp message to let management know 1722 * something went wrong. 1723 */ 1724 return -1; 1725 } 1726 1727 void kvm_flush_coalesced_mmio_buffer(void) 1728 { 1729 KVMState *s = kvm_state; 1730 1731 if (s->coalesced_flush_in_progress) { 1732 return; 1733 } 1734 1735 s->coalesced_flush_in_progress = true; 1736 1737 if (s->coalesced_mmio_ring) { 1738 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 1739 while (ring->first != ring->last) { 1740 struct kvm_coalesced_mmio *ent; 1741 1742 ent = &ring->coalesced_mmio[ring->first]; 1743 1744 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 1745 smp_wmb(); 1746 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 1747 } 1748 } 1749 1750 s->coalesced_flush_in_progress = false; 1751 } 1752 1753 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 1754 { 1755 if (!cpu->vcpu_dirty) { 1756 kvm_arch_get_registers(cpu); 1757 cpu->vcpu_dirty = true; 1758 } 1759 } 1760 1761 void kvm_cpu_synchronize_state(CPUState *cpu) 1762 { 1763 if (!cpu->vcpu_dirty) { 1764 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 1765 } 1766 } 1767 1768 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 1769 { 1770 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 1771 cpu->vcpu_dirty = false; 1772 } 1773 1774 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 1775 { 1776 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 1777 } 1778 1779 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 1780 { 1781 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 1782 cpu->vcpu_dirty = false; 1783 } 1784 1785 void kvm_cpu_synchronize_post_init(CPUState *cpu) 1786 { 1787 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 1788 } 1789 1790 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 1791 { 1792 cpu->vcpu_dirty = true; 1793 } 1794 1795 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 1796 { 1797 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 1798 } 1799 1800 #ifdef KVM_HAVE_MCE_INJECTION 1801 static __thread void *pending_sigbus_addr; 1802 static __thread int pending_sigbus_code; 1803 static __thread bool have_sigbus_pending; 1804 #endif 1805 1806 static void kvm_cpu_kick(CPUState *cpu) 1807 { 1808 atomic_set(&cpu->kvm_run->immediate_exit, 1); 1809 } 1810 1811 static void kvm_cpu_kick_self(void) 1812 { 1813 if (kvm_immediate_exit) { 1814 kvm_cpu_kick(current_cpu); 1815 } else { 1816 qemu_cpu_kick_self(); 1817 } 1818 } 1819 1820 static void kvm_eat_signals(CPUState *cpu) 1821 { 1822 struct timespec ts = { 0, 0 }; 1823 siginfo_t siginfo; 1824 sigset_t waitset; 1825 sigset_t chkset; 1826 int r; 1827 1828 if (kvm_immediate_exit) { 1829 
atomic_set(&cpu->kvm_run->immediate_exit, 0); 1830 /* Write kvm_run->immediate_exit before the cpu->exit_request 1831 * write in kvm_cpu_exec. 1832 */ 1833 smp_wmb(); 1834 return; 1835 } 1836 1837 sigemptyset(&waitset); 1838 sigaddset(&waitset, SIG_IPI); 1839 1840 do { 1841 r = sigtimedwait(&waitset, &siginfo, &ts); 1842 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 1843 perror("sigtimedwait"); 1844 exit(1); 1845 } 1846 1847 r = sigpending(&chkset); 1848 if (r == -1) { 1849 perror("sigpending"); 1850 exit(1); 1851 } 1852 } while (sigismember(&chkset, SIG_IPI)); 1853 } 1854 1855 int kvm_cpu_exec(CPUState *cpu) 1856 { 1857 struct kvm_run *run = cpu->kvm_run; 1858 int ret, run_ret; 1859 1860 DPRINTF("kvm_cpu_exec()\n"); 1861 1862 if (kvm_arch_process_async_events(cpu)) { 1863 atomic_set(&cpu->exit_request, 0); 1864 return EXCP_HLT; 1865 } 1866 1867 qemu_mutex_unlock_iothread(); 1868 cpu_exec_start(cpu); 1869 1870 do { 1871 MemTxAttrs attrs; 1872 1873 if (cpu->vcpu_dirty) { 1874 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 1875 cpu->vcpu_dirty = false; 1876 } 1877 1878 kvm_arch_pre_run(cpu, run); 1879 if (atomic_read(&cpu->exit_request)) { 1880 DPRINTF("interrupt exit requested\n"); 1881 /* 1882 * KVM requires us to reenter the kernel after IO exits to complete 1883 * instruction emulation. This self-signal will ensure that we 1884 * leave ASAP again. 1885 */ 1886 kvm_cpu_kick_self(); 1887 } 1888 1889 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 1890 * Matching barrier in kvm_eat_signals. 1891 */ 1892 smp_rmb(); 1893 1894 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 1895 1896 attrs = kvm_arch_post_run(cpu, run); 1897 1898 #ifdef KVM_HAVE_MCE_INJECTION 1899 if (unlikely(have_sigbus_pending)) { 1900 qemu_mutex_lock_iothread(); 1901 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 1902 pending_sigbus_addr); 1903 have_sigbus_pending = false; 1904 qemu_mutex_unlock_iothread(); 1905 } 1906 #endif 1907 1908 if (run_ret < 0) { 1909 if (run_ret == -EINTR || run_ret == -EAGAIN) { 1910 DPRINTF("io window exit\n"); 1911 kvm_eat_signals(cpu); 1912 ret = EXCP_INTERRUPT; 1913 break; 1914 } 1915 fprintf(stderr, "error: kvm run failed %s\n", 1916 strerror(-run_ret)); 1917 #ifdef TARGET_PPC 1918 if (run_ret == -EBUSY) { 1919 fprintf(stderr, 1920 "This is probably because your SMT is enabled.\n" 1921 "VCPU can only run on primary threads with all " 1922 "secondary threads offline.\n"); 1923 } 1924 #endif 1925 ret = -1; 1926 break; 1927 } 1928 1929 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 1930 switch (run->exit_reason) { 1931 case KVM_EXIT_IO: 1932 DPRINTF("handle_io\n"); 1933 /* Called outside BQL */ 1934 kvm_handle_io(run->io.port, attrs, 1935 (uint8_t *)run + run->io.data_offset, 1936 run->io.direction, 1937 run->io.size, 1938 run->io.count); 1939 ret = 0; 1940 break; 1941 case KVM_EXIT_MMIO: 1942 DPRINTF("handle_mmio\n"); 1943 /* Called outside BQL */ 1944 address_space_rw(&address_space_memory, 1945 run->mmio.phys_addr, attrs, 1946 run->mmio.data, 1947 run->mmio.len, 1948 run->mmio.is_write); 1949 ret = 0; 1950 break; 1951 case KVM_EXIT_IRQ_WINDOW_OPEN: 1952 DPRINTF("irq_window_open\n"); 1953 ret = EXCP_INTERRUPT; 1954 break; 1955 case KVM_EXIT_SHUTDOWN: 1956 DPRINTF("shutdown\n"); 1957 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 1958 ret = EXCP_INTERRUPT; 1959 break; 1960 case KVM_EXIT_UNKNOWN: 1961 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 1962 (uint64_t)run->hw.hardware_exit_reason); 1963 ret = -1; 1964 break; 1965 
case KVM_EXIT_INTERNAL_ERROR: 1966 ret = kvm_handle_internal_error(cpu, run); 1967 break; 1968 case KVM_EXIT_SYSTEM_EVENT: 1969 switch (run->system_event.type) { 1970 case KVM_SYSTEM_EVENT_SHUTDOWN: 1971 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 1972 ret = EXCP_INTERRUPT; 1973 break; 1974 case KVM_SYSTEM_EVENT_RESET: 1975 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 1976 ret = EXCP_INTERRUPT; 1977 break; 1978 case KVM_SYSTEM_EVENT_CRASH: 1979 kvm_cpu_synchronize_state(cpu); 1980 qemu_mutex_lock_iothread(); 1981 qemu_system_guest_panicked(cpu_get_crash_info(cpu)); 1982 qemu_mutex_unlock_iothread(); 1983 ret = 0; 1984 break; 1985 default: 1986 DPRINTF("kvm_arch_handle_exit\n"); 1987 ret = kvm_arch_handle_exit(cpu, run); 1988 break; 1989 } 1990 break; 1991 default: 1992 DPRINTF("kvm_arch_handle_exit\n"); 1993 ret = kvm_arch_handle_exit(cpu, run); 1994 break; 1995 } 1996 } while (ret == 0); 1997 1998 cpu_exec_end(cpu); 1999 qemu_mutex_lock_iothread(); 2000 2001 if (ret < 0) { 2002 cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE); 2003 vm_stop(RUN_STATE_INTERNAL_ERROR); 2004 } 2005 2006 atomic_set(&cpu->exit_request, 0); 2007 return ret; 2008 } 2009 2010 int kvm_ioctl(KVMState *s, int type, ...) 2011 { 2012 int ret; 2013 void *arg; 2014 va_list ap; 2015 2016 va_start(ap, type); 2017 arg = va_arg(ap, void *); 2018 va_end(ap); 2019 2020 trace_kvm_ioctl(type, arg); 2021 ret = ioctl(s->fd, type, arg); 2022 if (ret == -1) { 2023 ret = -errno; 2024 } 2025 return ret; 2026 } 2027 2028 int kvm_vm_ioctl(KVMState *s, int type, ...) 2029 { 2030 int ret; 2031 void *arg; 2032 va_list ap; 2033 2034 va_start(ap, type); 2035 arg = va_arg(ap, void *); 2036 va_end(ap); 2037 2038 trace_kvm_vm_ioctl(type, arg); 2039 ret = ioctl(s->vmfd, type, arg); 2040 if (ret == -1) { 2041 ret = -errno; 2042 } 2043 return ret; 2044 } 2045 2046 int kvm_vcpu_ioctl(CPUState *cpu, int type, ...) 2047 { 2048 int ret; 2049 void *arg; 2050 va_list ap; 2051 2052 va_start(ap, type); 2053 arg = va_arg(ap, void *); 2054 va_end(ap); 2055 2056 trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg); 2057 ret = ioctl(cpu->kvm_fd, type, arg); 2058 if (ret == -1) { 2059 ret = -errno; 2060 } 2061 return ret; 2062 } 2063 2064 int kvm_device_ioctl(int fd, int type, ...) 2065 { 2066 int ret; 2067 void *arg; 2068 va_list ap; 2069 2070 va_start(ap, type); 2071 arg = va_arg(ap, void *); 2072 va_end(ap); 2073 2074 trace_kvm_device_ioctl(fd, type, arg); 2075 ret = ioctl(fd, type, arg); 2076 if (ret == -1) { 2077 ret = -errno; 2078 } 2079 return ret; 2080 } 2081 2082 int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr) 2083 { 2084 int ret; 2085 struct kvm_device_attr attribute = { 2086 .group = group, 2087 .attr = attr, 2088 }; 2089 2090 if (!kvm_vm_attributes_allowed) { 2091 return 0; 2092 } 2093 2094 ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute); 2095 /* kvm returns 0 on success for HAS_DEVICE_ATTR */ 2096 return ret ? 0 : 1; 2097 } 2098 2099 int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr) 2100 { 2101 struct kvm_device_attr attribute = { 2102 .group = group, 2103 .attr = attr, 2104 .flags = 0, 2105 }; 2106 2107 return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 
0 : 1; 2108 } 2109 2110 int kvm_device_access(int fd, int group, uint64_t attr, 2111 void *val, bool write, Error **errp) 2112 { 2113 struct kvm_device_attr kvmattr; 2114 int err; 2115 2116 kvmattr.flags = 0; 2117 kvmattr.group = group; 2118 kvmattr.attr = attr; 2119 kvmattr.addr = (uintptr_t)val; 2120 2121 err = kvm_device_ioctl(fd, 2122 write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR, 2123 &kvmattr); 2124 if (err < 0) { 2125 error_setg_errno(errp, -err, 2126 "KVM_%s_DEVICE_ATTR failed: Group %d " 2127 "attr 0x%016" PRIx64, 2128 write ? "SET" : "GET", group, attr); 2129 } 2130 return err; 2131 } 2132 2133 /* Return 1 on success, 0 on failure */ 2134 int kvm_has_sync_mmu(void) 2135 { 2136 return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 2137 } 2138 2139 int kvm_has_vcpu_events(void) 2140 { 2141 return kvm_state->vcpu_events; 2142 } 2143 2144 int kvm_has_robust_singlestep(void) 2145 { 2146 return kvm_state->robust_singlestep; 2147 } 2148 2149 int kvm_has_debugregs(void) 2150 { 2151 return kvm_state->debugregs; 2152 } 2153 2154 int kvm_has_many_ioeventfds(void) 2155 { 2156 if (!kvm_enabled()) { 2157 return 0; 2158 } 2159 return kvm_state->many_ioeventfds; 2160 } 2161 2162 int kvm_has_gsi_routing(void) 2163 { 2164 #ifdef KVM_CAP_IRQ_ROUTING 2165 return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING); 2166 #else 2167 return false; 2168 #endif 2169 } 2170 2171 int kvm_has_intx_set_mask(void) 2172 { 2173 return kvm_state->intx_set_mask; 2174 } 2175 2176 bool kvm_arm_supports_user_irq(void) 2177 { 2178 return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ); 2179 } 2180 2181 #ifdef KVM_CAP_SET_GUEST_DEBUG 2182 struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu, 2183 target_ulong pc) 2184 { 2185 struct kvm_sw_breakpoint *bp; 2186 2187 QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) { 2188 if (bp->pc == pc) { 2189 return bp; 2190 } 2191 } 2192 return NULL; 2193 } 2194 2195 int kvm_sw_breakpoints_active(CPUState *cpu) 2196 { 2197 return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints); 2198 } 2199 2200 struct kvm_set_guest_debug_data { 2201 struct kvm_guest_debug dbg; 2202 int err; 2203 }; 2204 2205 static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data) 2206 { 2207 struct kvm_set_guest_debug_data *dbg_data = 2208 (struct kvm_set_guest_debug_data *) data.host_ptr; 2209 2210 dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG, 2211 &dbg_data->dbg); 2212 } 2213 2214 int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap) 2215 { 2216 struct kvm_set_guest_debug_data data; 2217 2218 data.dbg.control = reinject_trap; 2219 2220 if (cpu->singlestep_enabled) { 2221 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; 2222 } 2223 kvm_arch_update_guest_debug(cpu, &data.dbg); 2224 2225 run_on_cpu(cpu, kvm_invoke_set_guest_debug, 2226 RUN_ON_CPU_HOST_PTR(&data)); 2227 return data.err; 2228 } 2229 2230 int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr, 2231 target_ulong len, int type) 2232 { 2233 struct kvm_sw_breakpoint *bp; 2234 int err; 2235 2236 if (type == GDB_BREAKPOINT_SW) { 2237 bp = kvm_find_sw_breakpoint(cpu, addr); 2238 if (bp) { 2239 bp->use_count++; 2240 return 0; 2241 } 2242 2243 bp = g_malloc(sizeof(struct kvm_sw_breakpoint)); 2244 bp->pc = addr; 2245 bp->use_count = 1; 2246 err = kvm_arch_insert_sw_breakpoint(cpu, bp); 2247 if (err) { 2248 g_free(bp); 2249 return err; 2250 } 2251 2252 QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry); 2253 } else { 2254 err = 
int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}
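/*
 * The function below wires up per-vCPU signal handling for the two KVM_RUN
 * interruption schemes used in this file: when kvm_immediate_exit is set,
 * SIG_IPI is unblocked in the vCPU thread itself and kvm_ipi_signal() kicks
 * the vCPU directly; otherwise the thread mask is left as it is and the
 * copy of the mask with SIG_IPI removed is handed to the kernel through
 * KVM_SET_SIGNAL_MASK, so the signal can only be delivered while KVM_RUN is
 * in progress and forces it to return.
 */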
void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

/* Called asynchronously in VCPU thread. */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);
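/*
 * Usage note (illustrative): registering TYPE_KVM_ACCEL above is what makes
 * the accelerator selectable from the command line, typically with
 * "-machine accel=kvm" or "-accel kvm"; selecting it ends up invoking
 * kvm_init() through AccelClass::init_machine.
 */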