1 /* 2 * QEMU KVM support 3 * 4 * Copyright IBM, Corp. 2008 5 * Red Hat, Inc. 2008 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Glauber Costa <gcosta@redhat.com> 10 * 11 * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 * See the COPYING file in the top-level directory. 13 * 14 */ 15 16 #include "qemu/osdep.h" 17 #include <sys/ioctl.h> 18 19 #include <linux/kvm.h> 20 21 #include "qemu/atomic.h" 22 #include "qemu/option.h" 23 #include "qemu/config-file.h" 24 #include "qemu/error-report.h" 25 #include "qapi/error.h" 26 #include "hw/pci/msi.h" 27 #include "hw/pci/msix.h" 28 #include "hw/s390x/adapter.h" 29 #include "exec/gdbstub.h" 30 #include "sysemu/kvm_int.h" 31 #include "sysemu/runstate.h" 32 #include "sysemu/cpus.h" 33 #include "sysemu/sysemu.h" 34 #include "qemu/bswap.h" 35 #include "exec/memory.h" 36 #include "exec/ram_addr.h" 37 #include "exec/address-spaces.h" 38 #include "qemu/event_notifier.h" 39 #include "qemu/main-loop.h" 40 #include "trace.h" 41 #include "hw/irq.h" 42 #include "sysemu/sev.h" 43 #include "sysemu/balloon.h" 44 45 #include "hw/boards.h" 46 47 /* This check must be after config-host.h is included */ 48 #ifdef CONFIG_EVENTFD 49 #include <sys/eventfd.h> 50 #endif 51 52 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We 53 * need to use the real host PAGE_SIZE, as that's what KVM will use. 54 */ 55 #define PAGE_SIZE qemu_real_host_page_size 56 57 //#define DEBUG_KVM 58 59 #ifdef DEBUG_KVM 60 #define DPRINTF(fmt, ...) \ 61 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) 62 #else 63 #define DPRINTF(fmt, ...) \ 64 do { } while (0) 65 #endif 66 67 #define KVM_MSI_HASHTAB_SIZE 256 68 69 struct KVMParkedVcpu { 70 unsigned long vcpu_id; 71 int kvm_fd; 72 QLIST_ENTRY(KVMParkedVcpu) node; 73 }; 74 75 struct KVMState 76 { 77 AccelState parent_obj; 78 79 int nr_slots; 80 int fd; 81 int vmfd; 82 int coalesced_mmio; 83 int coalesced_pio; 84 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 85 bool coalesced_flush_in_progress; 86 int vcpu_events; 87 int robust_singlestep; 88 int debugregs; 89 #ifdef KVM_CAP_SET_GUEST_DEBUG 90 QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; 91 #endif 92 int max_nested_state_len; 93 int many_ioeventfds; 94 int intx_set_mask; 95 bool sync_mmu; 96 bool manual_dirty_log_protect; 97 /* The man page (and posix) say ioctl numbers are signed int, but 98 * they're not. 
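 * (For instance, ioctls built with _IOR()/_IOWR() carry the read direction
 * bit in bit 31, so interpreted as a signed int they come out negative.)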
Linux, glibc and *BSD all treat ioctl numbers as 99 * unsigned, and treating them as signed here can break things */ 100 unsigned irq_set_ioctl; 101 unsigned int sigmask_len; 102 GHashTable *gsimap; 103 #ifdef KVM_CAP_IRQ_ROUTING 104 struct kvm_irq_routing *irq_routes; 105 int nr_allocated_irq_routes; 106 unsigned long *used_gsi_bitmap; 107 unsigned int gsi_count; 108 QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; 109 #endif 110 KVMMemoryListener memory_listener; 111 QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; 112 113 /* memory encryption */ 114 void *memcrypt_handle; 115 int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); 116 117 /* For "info mtree -f" to tell if an MR is registered in KVM */ 118 int nr_as; 119 struct KVMAs { 120 KVMMemoryListener *ml; 121 AddressSpace *as; 122 } *as; 123 }; 124 125 KVMState *kvm_state; 126 bool kvm_kernel_irqchip; 127 bool kvm_split_irqchip; 128 bool kvm_async_interrupts_allowed; 129 bool kvm_halt_in_kernel_allowed; 130 bool kvm_eventfds_allowed; 131 bool kvm_irqfds_allowed; 132 bool kvm_resamplefds_allowed; 133 bool kvm_msi_via_irqfd_allowed; 134 bool kvm_gsi_routing_allowed; 135 bool kvm_gsi_direct_mapping; 136 bool kvm_allowed; 137 bool kvm_readonly_mem_allowed; 138 bool kvm_vm_attributes_allowed; 139 bool kvm_direct_msi_allowed; 140 bool kvm_ioeventfd_any_length_allowed; 141 bool kvm_msi_use_devid; 142 static bool kvm_immediate_exit; 143 static hwaddr kvm_max_slot_size = ~0; 144 145 static const KVMCapabilityInfo kvm_required_capabilites[] = { 146 KVM_CAP_INFO(USER_MEMORY), 147 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), 148 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), 149 KVM_CAP_LAST_INFO 150 }; 151 152 static NotifierList kvm_irqchip_change_notifiers = 153 NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers); 154 155 #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock) 156 #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock) 157 158 int kvm_get_max_memslots(void) 159 { 160 KVMState *s = KVM_STATE(current_machine->accelerator); 161 162 return s->nr_slots; 163 } 164 165 bool kvm_memcrypt_enabled(void) 166 { 167 if (kvm_state && kvm_state->memcrypt_handle) { 168 return true; 169 } 170 171 return false; 172 } 173 174 int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) 175 { 176 if (kvm_state->memcrypt_handle && 177 kvm_state->memcrypt_encrypt_data) { 178 return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, 179 ptr, len); 180 } 181 182 return 1; 183 } 184 185 /* Called with KVMMemoryListener.slots_lock held */ 186 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 187 { 188 KVMState *s = kvm_state; 189 int i; 190 191 for (i = 0; i < s->nr_slots; i++) { 192 if (kml->slots[i].memory_size == 0) { 193 return &kml->slots[i]; 194 } 195 } 196 197 return NULL; 198 } 199 200 bool kvm_has_free_slot(MachineState *ms) 201 { 202 KVMState *s = KVM_STATE(ms->accelerator); 203 bool result; 204 KVMMemoryListener *kml = &s->memory_listener; 205 206 kvm_slots_lock(kml); 207 result = !!kvm_get_free_slot(kml); 208 kvm_slots_unlock(kml); 209 210 return result; 211 } 212 213 /* Called with KVMMemoryListener.slots_lock held */ 214 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 215 { 216 KVMSlot *slot = kvm_get_free_slot(kml); 217 218 if (slot) { 219 return slot; 220 } 221 222 fprintf(stderr, "%s: no free slot available\n", __func__); 223 abort(); 224 } 225 226 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 227 hwaddr start_addr, 228 hwaddr size) 229 { 230 
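    /* Only an exact (start_addr, size) match is returned: kvm_set_phys_mem()
     * registers memory in kvm_max_slot_size chunks, and callers look slots up
     * with the same chunking. */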
KVMState *s = kvm_state; 231 int i; 232 233 for (i = 0; i < s->nr_slots; i++) { 234 KVMSlot *mem = &kml->slots[i]; 235 236 if (start_addr == mem->start_addr && size == mem->memory_size) { 237 return mem; 238 } 239 } 240 241 return NULL; 242 } 243 244 /* 245 * Calculate and align the start address and the size of the section. 246 * Return the size. If the size is 0, the aligned section is empty. 247 */ 248 static hwaddr kvm_align_section(MemoryRegionSection *section, 249 hwaddr *start) 250 { 251 hwaddr size = int128_get64(section->size); 252 hwaddr delta, aligned; 253 254 /* kvm works in page size chunks, but the function may be called 255 with sub-page size and unaligned start address. Pad the start 256 address to next and truncate size to previous page boundary. */ 257 aligned = ROUND_UP(section->offset_within_address_space, 258 qemu_real_host_page_size); 259 delta = aligned - section->offset_within_address_space; 260 *start = aligned; 261 if (delta > size) { 262 return 0; 263 } 264 265 return (size - delta) & qemu_real_host_page_mask; 266 } 267 268 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 269 hwaddr *phys_addr) 270 { 271 KVMMemoryListener *kml = &s->memory_listener; 272 int i, ret = 0; 273 274 kvm_slots_lock(kml); 275 for (i = 0; i < s->nr_slots; i++) { 276 KVMSlot *mem = &kml->slots[i]; 277 278 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 279 *phys_addr = mem->start_addr + (ram - mem->ram); 280 ret = 1; 281 break; 282 } 283 } 284 kvm_slots_unlock(kml); 285 286 return ret; 287 } 288 289 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) 290 { 291 KVMState *s = kvm_state; 292 struct kvm_userspace_memory_region mem; 293 int ret; 294 295 mem.slot = slot->slot | (kml->as_id << 16); 296 mem.guest_phys_addr = slot->start_addr; 297 mem.userspace_addr = (unsigned long)slot->ram; 298 mem.flags = slot->flags; 299 300 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { 301 /* Set the slot size to 0 before setting the slot to the desired 302 * value. This is needed based on KVM commit 75d61fbc. 
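     * That commit made the kernel refuse to change KVM_MEM_READONLY on an
     * existing slot, so the slot is deleted (size 0) here and re-created just
     * below with the new flags.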
*/ 303 mem.memory_size = 0; 304 kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 305 } 306 mem.memory_size = slot->memory_size; 307 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 308 slot->old_flags = mem.flags; 309 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 310 mem.memory_size, mem.userspace_addr, ret); 311 return ret; 312 } 313 314 int kvm_destroy_vcpu(CPUState *cpu) 315 { 316 KVMState *s = kvm_state; 317 long mmap_size; 318 struct KVMParkedVcpu *vcpu = NULL; 319 int ret = 0; 320 321 DPRINTF("kvm_destroy_vcpu\n"); 322 323 ret = kvm_arch_destroy_vcpu(cpu); 324 if (ret < 0) { 325 goto err; 326 } 327 328 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 329 if (mmap_size < 0) { 330 ret = mmap_size; 331 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 332 goto err; 333 } 334 335 ret = munmap(cpu->kvm_run, mmap_size); 336 if (ret < 0) { 337 goto err; 338 } 339 340 vcpu = g_malloc0(sizeof(*vcpu)); 341 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 342 vcpu->kvm_fd = cpu->kvm_fd; 343 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 344 err: 345 return ret; 346 } 347 348 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 349 { 350 struct KVMParkedVcpu *cpu; 351 352 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 353 if (cpu->vcpu_id == vcpu_id) { 354 int kvm_fd; 355 356 QLIST_REMOVE(cpu, node); 357 kvm_fd = cpu->kvm_fd; 358 g_free(cpu); 359 return kvm_fd; 360 } 361 } 362 363 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 364 } 365 366 int kvm_init_vcpu(CPUState *cpu) 367 { 368 KVMState *s = kvm_state; 369 long mmap_size; 370 int ret; 371 372 DPRINTF("kvm_init_vcpu\n"); 373 374 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 375 if (ret < 0) { 376 DPRINTF("kvm_create_vcpu failed\n"); 377 goto err; 378 } 379 380 cpu->kvm_fd = ret; 381 cpu->kvm_state = s; 382 cpu->vcpu_dirty = true; 383 384 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 385 if (mmap_size < 0) { 386 ret = mmap_size; 387 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 388 goto err; 389 } 390 391 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 392 cpu->kvm_fd, 0); 393 if (cpu->kvm_run == MAP_FAILED) { 394 ret = -errno; 395 DPRINTF("mmap'ing vcpu state failed\n"); 396 goto err; 397 } 398 399 if (s->coalesced_mmio && !s->coalesced_mmio_ring) { 400 s->coalesced_mmio_ring = 401 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; 402 } 403 404 ret = kvm_arch_init_vcpu(cpu); 405 err: 406 return ret; 407 } 408 409 /* 410 * dirty pages logging control 411 */ 412 413 static int kvm_mem_flags(MemoryRegion *mr) 414 { 415 bool readonly = mr->readonly || memory_region_is_romd(mr); 416 int flags = 0; 417 418 if (memory_region_get_dirty_log_mask(mr) != 0) { 419 flags |= KVM_MEM_LOG_DIRTY_PAGES; 420 } 421 if (readonly && kvm_readonly_mem_allowed) { 422 flags |= KVM_MEM_READONLY; 423 } 424 return flags; 425 } 426 427 /* Called with KVMMemoryListener.slots_lock held */ 428 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, 429 MemoryRegion *mr) 430 { 431 mem->flags = kvm_mem_flags(mr); 432 433 /* If nothing changed effectively, no need to issue ioctl */ 434 if (mem->flags == mem->old_flags) { 435 return 0; 436 } 437 438 return kvm_set_user_memory_region(kml, mem, false); 439 } 440 441 static int kvm_section_update_flags(KVMMemoryListener *kml, 442 MemoryRegionSection *section) 443 { 444 hwaddr start_addr, size, slot_size; 445 KVMSlot *mem; 446 int ret = 0; 447 448 size = kvm_align_section(section, &start_addr); 449 if (!size) { 450 return 0; 451 } 
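    /* Walk the section in kvm_max_slot_size chunks; each chunk was registered
     * as a separate slot, so flags have to be updated one slot at a time. */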
452 453 kvm_slots_lock(kml); 454 455 while (size && !ret) { 456 slot_size = MIN(kvm_max_slot_size, size); 457 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 458 if (!mem) { 459 /* We don't have a slot if we want to trap every access. */ 460 goto out; 461 } 462 463 ret = kvm_slot_update_flags(kml, mem, section->mr); 464 start_addr += slot_size; 465 size -= slot_size; 466 } 467 468 out: 469 kvm_slots_unlock(kml); 470 return ret; 471 } 472 473 static void kvm_log_start(MemoryListener *listener, 474 MemoryRegionSection *section, 475 int old, int new) 476 { 477 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 478 int r; 479 480 if (old != 0) { 481 return; 482 } 483 484 r = kvm_section_update_flags(kml, section); 485 if (r < 0) { 486 abort(); 487 } 488 } 489 490 static void kvm_log_stop(MemoryListener *listener, 491 MemoryRegionSection *section, 492 int old, int new) 493 { 494 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 495 int r; 496 497 if (new != 0) { 498 return; 499 } 500 501 r = kvm_section_update_flags(kml, section); 502 if (r < 0) { 503 abort(); 504 } 505 } 506 507 /* get kvm's dirty pages bitmap and update qemu's */ 508 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, 509 unsigned long *bitmap) 510 { 511 ram_addr_t start = section->offset_within_region + 512 memory_region_get_ram_addr(section->mr); 513 ram_addr_t pages = int128_get64(section->size) / qemu_real_host_page_size; 514 515 cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); 516 return 0; 517 } 518 519 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) 520 521 /** 522 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space 523 * 524 * This function will first try to fetch dirty bitmap from the kernel, 525 * and then updates qemu's dirty bitmap. 526 * 527 * NOTE: caller must be with kml->slots_lock held. 528 * 529 * @kml: the KVM memory listener object 530 * @section: the memory section to sync the dirty bitmap with 531 */ 532 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, 533 MemoryRegionSection *section) 534 { 535 KVMState *s = kvm_state; 536 struct kvm_dirty_log d = {}; 537 KVMSlot *mem; 538 hwaddr start_addr, size; 539 hwaddr slot_size, slot_offset = 0; 540 int ret = 0; 541 542 size = kvm_align_section(section, &start_addr); 543 while (size) { 544 MemoryRegionSection subsection = *section; 545 546 slot_size = MIN(kvm_max_slot_size, size); 547 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 548 if (!mem) { 549 /* We don't have a slot if we want to trap every access. */ 550 goto out; 551 } 552 553 /* XXX bad kernel interface alert 554 * For dirty bitmap, kernel allocates array of size aligned to 555 * bits-per-long. But for case when the kernel is 64bits and 556 * the userspace is 32bits, userspace can't align to the same 557 * bits-per-long, since sizeof(long) is different between kernel 558 * and user space. This way, userspace will provide buffer which 559 * may be 4 bytes less than the kernel will use, resulting in 560 * userspace memory corruption (which is not detectable by valgrind 561 * too, in most cases). 562 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in 563 * a hope that sizeof(long) won't become >8 any time soon. 
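     * As a concrete example (assuming 4 KiB target pages): a 1 GiB slot has
     * 262144 page bits, so ALIGN(262144, 64) / 8 yields a 32 KiB bitmap.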
564 */ 565 if (!mem->dirty_bmap) { 566 hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), 567 /*HOST_LONG_BITS*/ 64) / 8; 568 /* Allocate on the first log_sync, once and for all */ 569 mem->dirty_bmap = g_malloc0(bitmap_size); 570 } 571 572 d.dirty_bitmap = mem->dirty_bmap; 573 d.slot = mem->slot | (kml->as_id << 16); 574 if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { 575 DPRINTF("ioctl failed %d\n", errno); 576 ret = -1; 577 goto out; 578 } 579 580 subsection.offset_within_region += slot_offset; 581 subsection.size = int128_make64(slot_size); 582 kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap); 583 584 slot_offset += slot_size; 585 start_addr += slot_size; 586 size -= slot_size; 587 } 588 out: 589 return ret; 590 } 591 592 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */ 593 #define KVM_CLEAR_LOG_SHIFT 6 594 #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT) 595 #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN) 596 597 static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start, 598 uint64_t size) 599 { 600 KVMState *s = kvm_state; 601 uint64_t end, bmap_start, start_delta, bmap_npages; 602 struct kvm_clear_dirty_log d; 603 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; 604 int ret; 605 606 /* 607 * We need to extend either the start or the size or both to 608 * satisfy the KVM interface requirement. Firstly, do the start 609 * page alignment on 64 host pages 610 */ 611 bmap_start = start & KVM_CLEAR_LOG_MASK; 612 start_delta = start - bmap_start; 613 bmap_start /= psize; 614 615 /* 616 * The kernel interface has restriction on the size too, that either: 617 * 618 * (1) the size is 64 host pages aligned (just like the start), or 619 * (2) the size fills up until the end of the KVM memslot. 620 */ 621 bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN) 622 << KVM_CLEAR_LOG_SHIFT; 623 end = mem->memory_size / psize; 624 if (bmap_npages > end - bmap_start) { 625 bmap_npages = end - bmap_start; 626 } 627 start_delta /= psize; 628 629 /* 630 * Prepare the bitmap to clear dirty bits. Here we must guarantee 631 * that we won't clear any unknown dirty bits otherwise we might 632 * accidentally clear some set bits which are not yet synced from 633 * the kernel into QEMU's bitmap, then we'll lose track of the 634 * guest modifications upon those pages (which can directly lead 635 * to guest data loss or panic after migration). 
636 * 637 * Layout of the KVMSlot.dirty_bmap: 638 * 639 * |<-------- bmap_npages -----------..>| 640 * [1] 641 * start_delta size 642 * |----------------|-------------|------------------|------------| 643 * ^ ^ ^ ^ 644 * | | | | 645 * start bmap_start (start) end 646 * of memslot of memslot 647 * 648 * [1] bmap_npages can be aligned to either 64 pages or the end of slot 649 */ 650 651 assert(bmap_start % BITS_PER_LONG == 0); 652 /* We should never do log_clear before log_sync */ 653 assert(mem->dirty_bmap); 654 if (start_delta) { 655 /* Slow path - we need to manipulate a temp bitmap */ 656 bmap_clear = bitmap_new(bmap_npages); 657 bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap, 658 bmap_start, start_delta + size / psize); 659 /* 660 * We need to fill the holes at start because that was not 661 * specified by the caller and we extended the bitmap only for 662 * 64 pages alignment 663 */ 664 bitmap_clear(bmap_clear, 0, start_delta); 665 d.dirty_bitmap = bmap_clear; 666 } else { 667 /* Fast path - start address aligns well with BITS_PER_LONG */ 668 d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start); 669 } 670 671 d.first_page = bmap_start; 672 /* It should never overflow. If it happens, say something */ 673 assert(bmap_npages <= UINT32_MAX); 674 d.num_pages = bmap_npages; 675 d.slot = mem->slot | (as_id << 16); 676 677 if (kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d) == -1) { 678 ret = -errno; 679 error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, " 680 "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d", 681 __func__, d.slot, (uint64_t)d.first_page, 682 (uint32_t)d.num_pages, ret); 683 } else { 684 ret = 0; 685 trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages); 686 } 687 688 /* 689 * After we have updated the remote dirty bitmap, we update the 690 * cached bitmap as well for the memslot, then if another user 691 * clears the same region we know we shouldn't clear it again on 692 * the remote otherwise it's data loss as well. 693 */ 694 bitmap_clear(mem->dirty_bmap, bmap_start + start_delta, 695 size / psize); 696 /* This handles the NULL case well */ 697 g_free(bmap_clear); 698 return ret; 699 } 700 701 702 /** 703 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range 704 * 705 * NOTE: this will be a no-op if we haven't enabled manual dirty log 706 * protection in the host kernel because in that case this operation 707 * will be done within log_sync(). 708 * 709 * @kml: the kvm memory listener 710 * @section: the memory range to clear dirty bitmap 711 */ 712 static int kvm_physical_log_clear(KVMMemoryListener *kml, 713 MemoryRegionSection *section) 714 { 715 KVMState *s = kvm_state; 716 uint64_t start, size, offset, count; 717 KVMSlot *mem; 718 int ret = 0, i; 719 720 if (!s->manual_dirty_log_protect) { 721 /* No need to do explicit clear */ 722 return ret; 723 } 724 725 start = section->offset_within_address_space; 726 size = int128_get64(section->size); 727 728 if (!size) { 729 /* Nothing more we can do... */ 730 return ret; 731 } 732 733 kvm_slots_lock(kml); 734 735 for (i = 0; i < s->nr_slots; i++) { 736 mem = &kml->slots[i]; 737 /* Discard slots that are empty or do not overlap the section */ 738 if (!mem->memory_size || 739 mem->start_addr > start + size - 1 || 740 start > mem->start_addr + mem->memory_size - 1) { 741 continue; 742 } 743 744 if (start >= mem->start_addr) { 745 /* The slot starts before section or is aligned to it. 
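         * In that case the clear range begins inside this slot: offset is
         * relative to the slot start and count is clipped to what is left of
         * the slot.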
*/ 746 offset = start - mem->start_addr; 747 count = MIN(mem->memory_size - offset, size); 748 } else { 749 /* The slot starts after section. */ 750 offset = 0; 751 count = MIN(mem->memory_size, size - (mem->start_addr - start)); 752 } 753 ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count); 754 if (ret < 0) { 755 break; 756 } 757 } 758 759 kvm_slots_unlock(kml); 760 761 return ret; 762 } 763 764 static void kvm_coalesce_mmio_region(MemoryListener *listener, 765 MemoryRegionSection *secion, 766 hwaddr start, hwaddr size) 767 { 768 KVMState *s = kvm_state; 769 770 if (s->coalesced_mmio) { 771 struct kvm_coalesced_mmio_zone zone; 772 773 zone.addr = start; 774 zone.size = size; 775 zone.pad = 0; 776 777 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 778 } 779 } 780 781 static void kvm_uncoalesce_mmio_region(MemoryListener *listener, 782 MemoryRegionSection *secion, 783 hwaddr start, hwaddr size) 784 { 785 KVMState *s = kvm_state; 786 787 if (s->coalesced_mmio) { 788 struct kvm_coalesced_mmio_zone zone; 789 790 zone.addr = start; 791 zone.size = size; 792 zone.pad = 0; 793 794 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 795 } 796 } 797 798 static void kvm_coalesce_pio_add(MemoryListener *listener, 799 MemoryRegionSection *section, 800 hwaddr start, hwaddr size) 801 { 802 KVMState *s = kvm_state; 803 804 if (s->coalesced_pio) { 805 struct kvm_coalesced_mmio_zone zone; 806 807 zone.addr = start; 808 zone.size = size; 809 zone.pio = 1; 810 811 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 812 } 813 } 814 815 static void kvm_coalesce_pio_del(MemoryListener *listener, 816 MemoryRegionSection *section, 817 hwaddr start, hwaddr size) 818 { 819 KVMState *s = kvm_state; 820 821 if (s->coalesced_pio) { 822 struct kvm_coalesced_mmio_zone zone; 823 824 zone.addr = start; 825 zone.size = size; 826 zone.pio = 1; 827 828 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 829 } 830 } 831 832 static MemoryListener kvm_coalesced_pio_listener = { 833 .coalesced_io_add = kvm_coalesce_pio_add, 834 .coalesced_io_del = kvm_coalesce_pio_del, 835 }; 836 837 int kvm_check_extension(KVMState *s, unsigned int extension) 838 { 839 int ret; 840 841 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); 842 if (ret < 0) { 843 ret = 0; 844 } 845 846 return ret; 847 } 848 849 int kvm_vm_check_extension(KVMState *s, unsigned int extension) 850 { 851 int ret; 852 853 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); 854 if (ret < 0) { 855 /* VM wide version not implemented, use global one instead */ 856 ret = kvm_check_extension(s, extension); 857 } 858 859 return ret; 860 } 861 862 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) 863 { 864 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) 865 /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN 866 * endianness, but the memory core hands them in target endianness. 867 * For example, PPC is always treated as big-endian even if running 868 * on KVM and on PPC64LE. Correct here. 869 */ 870 switch (size) { 871 case 2: 872 val = bswap16(val); 873 break; 874 case 4: 875 val = bswap32(val); 876 break; 877 } 878 #endif 879 return val; 880 } 881 882 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 883 bool assign, uint32_t size, bool datamatch) 884 { 885 int ret; 886 struct kvm_ioeventfd iofd = { 887 .datamatch = datamatch ? 
adjust_ioeventfd_endianness(val, size) : 0, 888 .addr = addr, 889 .len = size, 890 .flags = 0, 891 .fd = fd, 892 }; 893 894 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size, 895 datamatch); 896 if (!kvm_enabled()) { 897 return -ENOSYS; 898 } 899 900 if (datamatch) { 901 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 902 } 903 if (!assign) { 904 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 905 } 906 907 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 908 909 if (ret < 0) { 910 return -errno; 911 } 912 913 return 0; 914 } 915 916 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 917 bool assign, uint32_t size, bool datamatch) 918 { 919 struct kvm_ioeventfd kick = { 920 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 921 .addr = addr, 922 .flags = KVM_IOEVENTFD_FLAG_PIO, 923 .len = size, 924 .fd = fd, 925 }; 926 int r; 927 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch); 928 if (!kvm_enabled()) { 929 return -ENOSYS; 930 } 931 if (datamatch) { 932 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 933 } 934 if (!assign) { 935 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 936 } 937 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 938 if (r < 0) { 939 return r; 940 } 941 return 0; 942 } 943 944 945 static int kvm_check_many_ioeventfds(void) 946 { 947 /* Userspace can use ioeventfd for io notification. This requires a host 948 * that supports eventfd(2) and an I/O thread; since eventfd does not 949 * support SIGIO it cannot interrupt the vcpu. 950 * 951 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 952 * can avoid creating too many ioeventfds. 953 */ 954 #if defined(CONFIG_EVENTFD) 955 int ioeventfds[7]; 956 int i, ret = 0; 957 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 958 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 959 if (ioeventfds[i] < 0) { 960 break; 961 } 962 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 963 if (ret < 0) { 964 close(ioeventfds[i]); 965 break; 966 } 967 } 968 969 /* Decide whether many devices are supported or not */ 970 ret = i == ARRAY_SIZE(ioeventfds); 971 972 while (i-- > 0) { 973 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 974 close(ioeventfds[i]); 975 } 976 return ret; 977 #else 978 return 0; 979 #endif 980 } 981 982 static const KVMCapabilityInfo * 983 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 984 { 985 while (list->name) { 986 if (!kvm_check_extension(s, list->value)) { 987 return list; 988 } 989 list++; 990 } 991 return NULL; 992 } 993 994 void kvm_set_max_memslot_size(hwaddr max_slot_size) 995 { 996 g_assert( 997 ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size 998 ); 999 kvm_max_slot_size = max_slot_size; 1000 } 1001 1002 static void kvm_set_phys_mem(KVMMemoryListener *kml, 1003 MemoryRegionSection *section, bool add) 1004 { 1005 KVMSlot *mem; 1006 int err; 1007 MemoryRegion *mr = section->mr; 1008 bool writeable = !mr->readonly && !mr->rom_device; 1009 hwaddr start_addr, size, slot_size; 1010 void *ram; 1011 1012 if (!memory_region_is_ram(mr)) { 1013 if (writeable || !kvm_readonly_mem_allowed) { 1014 return; 1015 } else if (!mr->romd_mode) { 1016 /* If the memory device is not in romd_mode, then we actually want 1017 * to remove the kvm memory slot so all accesses will trap. 
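             * (With romd_mode off, even reads must be handled by the device
             * model, so a read-only slot would not be enough.)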
*/ 1018 add = false; 1019 } 1020 } 1021 1022 size = kvm_align_section(section, &start_addr); 1023 if (!size) { 1024 return; 1025 } 1026 1027 /* use aligned delta to align the ram address */ 1028 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 1029 (start_addr - section->offset_within_address_space); 1030 1031 kvm_slots_lock(kml); 1032 1033 if (!add) { 1034 do { 1035 slot_size = MIN(kvm_max_slot_size, size); 1036 mem = kvm_lookup_matching_slot(kml, start_addr, slot_size); 1037 if (!mem) { 1038 goto out; 1039 } 1040 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 1041 kvm_physical_sync_dirty_bitmap(kml, section); 1042 } 1043 1044 /* unregister the slot */ 1045 g_free(mem->dirty_bmap); 1046 mem->dirty_bmap = NULL; 1047 mem->memory_size = 0; 1048 mem->flags = 0; 1049 err = kvm_set_user_memory_region(kml, mem, false); 1050 if (err) { 1051 fprintf(stderr, "%s: error unregistering slot: %s\n", 1052 __func__, strerror(-err)); 1053 abort(); 1054 } 1055 start_addr += slot_size; 1056 size -= slot_size; 1057 } while (size); 1058 goto out; 1059 } 1060 1061 /* register the new slot */ 1062 do { 1063 slot_size = MIN(kvm_max_slot_size, size); 1064 mem = kvm_alloc_slot(kml); 1065 mem->memory_size = slot_size; 1066 mem->start_addr = start_addr; 1067 mem->ram = ram; 1068 mem->flags = kvm_mem_flags(mr); 1069 1070 err = kvm_set_user_memory_region(kml, mem, true); 1071 if (err) { 1072 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 1073 strerror(-err)); 1074 abort(); 1075 } 1076 start_addr += slot_size; 1077 ram += slot_size; 1078 size -= slot_size; 1079 } while (size); 1080 1081 out: 1082 kvm_slots_unlock(kml); 1083 } 1084 1085 static void kvm_region_add(MemoryListener *listener, 1086 MemoryRegionSection *section) 1087 { 1088 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1089 1090 memory_region_ref(section->mr); 1091 kvm_set_phys_mem(kml, section, true); 1092 } 1093 1094 static void kvm_region_del(MemoryListener *listener, 1095 MemoryRegionSection *section) 1096 { 1097 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1098 1099 kvm_set_phys_mem(kml, section, false); 1100 memory_region_unref(section->mr); 1101 } 1102 1103 static void kvm_log_sync(MemoryListener *listener, 1104 MemoryRegionSection *section) 1105 { 1106 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1107 int r; 1108 1109 kvm_slots_lock(kml); 1110 r = kvm_physical_sync_dirty_bitmap(kml, section); 1111 kvm_slots_unlock(kml); 1112 if (r < 0) { 1113 abort(); 1114 } 1115 } 1116 1117 static void kvm_log_clear(MemoryListener *listener, 1118 MemoryRegionSection *section) 1119 { 1120 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1121 int r; 1122 1123 r = kvm_physical_log_clear(kml, section); 1124 if (r < 0) { 1125 error_report_once("%s: kvm log clear failed: mr=%s " 1126 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__, 1127 section->mr->name, section->offset_within_region, 1128 int128_get64(section->size)); 1129 abort(); 1130 } 1131 } 1132 1133 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 1134 MemoryRegionSection *section, 1135 bool match_data, uint64_t data, 1136 EventNotifier *e) 1137 { 1138 int fd = event_notifier_get_fd(e); 1139 int r; 1140 1141 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1142 data, true, int128_get64(section->size), 1143 match_data); 1144 if (r < 0) { 1145 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1146 __func__, strerror(-r), 
-r); 1147 abort(); 1148 } 1149 } 1150 1151 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 1152 MemoryRegionSection *section, 1153 bool match_data, uint64_t data, 1154 EventNotifier *e) 1155 { 1156 int fd = event_notifier_get_fd(e); 1157 int r; 1158 1159 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1160 data, false, int128_get64(section->size), 1161 match_data); 1162 if (r < 0) { 1163 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1164 __func__, strerror(-r), -r); 1165 abort(); 1166 } 1167 } 1168 1169 static void kvm_io_ioeventfd_add(MemoryListener *listener, 1170 MemoryRegionSection *section, 1171 bool match_data, uint64_t data, 1172 EventNotifier *e) 1173 { 1174 int fd = event_notifier_get_fd(e); 1175 int r; 1176 1177 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1178 data, true, int128_get64(section->size), 1179 match_data); 1180 if (r < 0) { 1181 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1182 __func__, strerror(-r), -r); 1183 abort(); 1184 } 1185 } 1186 1187 static void kvm_io_ioeventfd_del(MemoryListener *listener, 1188 MemoryRegionSection *section, 1189 bool match_data, uint64_t data, 1190 EventNotifier *e) 1191 1192 { 1193 int fd = event_notifier_get_fd(e); 1194 int r; 1195 1196 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1197 data, false, int128_get64(section->size), 1198 match_data); 1199 if (r < 0) { 1200 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1201 __func__, strerror(-r), -r); 1202 abort(); 1203 } 1204 } 1205 1206 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 1207 AddressSpace *as, int as_id) 1208 { 1209 int i; 1210 1211 qemu_mutex_init(&kml->slots_lock); 1212 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 1213 kml->as_id = as_id; 1214 1215 for (i = 0; i < s->nr_slots; i++) { 1216 kml->slots[i].slot = i; 1217 } 1218 1219 kml->listener.region_add = kvm_region_add; 1220 kml->listener.region_del = kvm_region_del; 1221 kml->listener.log_start = kvm_log_start; 1222 kml->listener.log_stop = kvm_log_stop; 1223 kml->listener.log_sync = kvm_log_sync; 1224 kml->listener.log_clear = kvm_log_clear; 1225 kml->listener.priority = 10; 1226 1227 memory_listener_register(&kml->listener, as); 1228 1229 for (i = 0; i < s->nr_as; ++i) { 1230 if (!s->as[i].as) { 1231 s->as[i].as = as; 1232 s->as[i].ml = kml; 1233 break; 1234 } 1235 } 1236 } 1237 1238 static MemoryListener kvm_io_listener = { 1239 .eventfd_add = kvm_io_ioeventfd_add, 1240 .eventfd_del = kvm_io_ioeventfd_del, 1241 .priority = 10, 1242 }; 1243 1244 int kvm_set_irq(KVMState *s, int irq, int level) 1245 { 1246 struct kvm_irq_level event; 1247 int ret; 1248 1249 assert(kvm_async_interrupts_enabled()); 1250 1251 event.level = level; 1252 event.irq = irq; 1253 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 1254 if (ret < 0) { 1255 perror("kvm_set_irq"); 1256 abort(); 1257 } 1258 1259 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 
1 : event.status; 1260 } 1261 1262 #ifdef KVM_CAP_IRQ_ROUTING 1263 typedef struct KVMMSIRoute { 1264 struct kvm_irq_routing_entry kroute; 1265 QTAILQ_ENTRY(KVMMSIRoute) entry; 1266 } KVMMSIRoute; 1267 1268 static void set_gsi(KVMState *s, unsigned int gsi) 1269 { 1270 set_bit(gsi, s->used_gsi_bitmap); 1271 } 1272 1273 static void clear_gsi(KVMState *s, unsigned int gsi) 1274 { 1275 clear_bit(gsi, s->used_gsi_bitmap); 1276 } 1277 1278 void kvm_init_irq_routing(KVMState *s) 1279 { 1280 int gsi_count, i; 1281 1282 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 1283 if (gsi_count > 0) { 1284 /* Round up so we can search ints using ffs */ 1285 s->used_gsi_bitmap = bitmap_new(gsi_count); 1286 s->gsi_count = gsi_count; 1287 } 1288 1289 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 1290 s->nr_allocated_irq_routes = 0; 1291 1292 if (!kvm_direct_msi_allowed) { 1293 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 1294 QTAILQ_INIT(&s->msi_hashtab[i]); 1295 } 1296 } 1297 1298 kvm_arch_init_irq_routing(s); 1299 } 1300 1301 void kvm_irqchip_commit_routes(KVMState *s) 1302 { 1303 int ret; 1304 1305 if (kvm_gsi_direct_mapping()) { 1306 return; 1307 } 1308 1309 if (!kvm_gsi_routing_enabled()) { 1310 return; 1311 } 1312 1313 s->irq_routes->flags = 0; 1314 trace_kvm_irqchip_commit_routes(); 1315 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 1316 assert(ret == 0); 1317 } 1318 1319 static void kvm_add_routing_entry(KVMState *s, 1320 struct kvm_irq_routing_entry *entry) 1321 { 1322 struct kvm_irq_routing_entry *new; 1323 int n, size; 1324 1325 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 1326 n = s->nr_allocated_irq_routes * 2; 1327 if (n < 64) { 1328 n = 64; 1329 } 1330 size = sizeof(struct kvm_irq_routing); 1331 size += n * sizeof(*new); 1332 s->irq_routes = g_realloc(s->irq_routes, size); 1333 s->nr_allocated_irq_routes = n; 1334 } 1335 n = s->irq_routes->nr++; 1336 new = &s->irq_routes->entries[n]; 1337 1338 *new = *entry; 1339 1340 set_gsi(s, entry->gsi); 1341 } 1342 1343 static int kvm_update_routing_entry(KVMState *s, 1344 struct kvm_irq_routing_entry *new_entry) 1345 { 1346 struct kvm_irq_routing_entry *entry; 1347 int n; 1348 1349 for (n = 0; n < s->irq_routes->nr; n++) { 1350 entry = &s->irq_routes->entries[n]; 1351 if (entry->gsi != new_entry->gsi) { 1352 continue; 1353 } 1354 1355 if(!memcmp(entry, new_entry, sizeof *entry)) { 1356 return 0; 1357 } 1358 1359 *entry = *new_entry; 1360 1361 return 0; 1362 } 1363 1364 return -ESRCH; 1365 } 1366 1367 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1368 { 1369 struct kvm_irq_routing_entry e = {}; 1370 1371 assert(pin < s->gsi_count); 1372 1373 e.gsi = irq; 1374 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1375 e.flags = 0; 1376 e.u.irqchip.irqchip = irqchip; 1377 e.u.irqchip.pin = pin; 1378 kvm_add_routing_entry(s, &e); 1379 } 1380 1381 void kvm_irqchip_release_virq(KVMState *s, int virq) 1382 { 1383 struct kvm_irq_routing_entry *e; 1384 int i; 1385 1386 if (kvm_gsi_direct_mapping()) { 1387 return; 1388 } 1389 1390 for (i = 0; i < s->irq_routes->nr; i++) { 1391 e = &s->irq_routes->entries[i]; 1392 if (e->gsi == virq) { 1393 s->irq_routes->nr--; 1394 *e = s->irq_routes->entries[s->irq_routes->nr]; 1395 } 1396 } 1397 clear_gsi(s, virq); 1398 kvm_arch_release_virq_post(virq); 1399 trace_kvm_irqchip_release_virq(virq); 1400 } 1401 1402 void kvm_irqchip_add_change_notifier(Notifier *n) 1403 { 1404 notifier_list_add(&kvm_irqchip_change_notifiers, n); 1405 } 1406 1407 void 
kvm_irqchip_remove_change_notifier(Notifier *n) 1408 { 1409 notifier_remove(n); 1410 } 1411 1412 void kvm_irqchip_change_notify(void) 1413 { 1414 notifier_list_notify(&kvm_irqchip_change_notifiers, NULL); 1415 } 1416 1417 static unsigned int kvm_hash_msi(uint32_t data) 1418 { 1419 /* This is optimized for IA32 MSI layout. However, no other arch shall 1420 * repeat the mistake of not providing a direct MSI injection API. */ 1421 return data & 0xff; 1422 } 1423 1424 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1425 { 1426 KVMMSIRoute *route, *next; 1427 unsigned int hash; 1428 1429 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1430 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1431 kvm_irqchip_release_virq(s, route->kroute.gsi); 1432 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1433 g_free(route); 1434 } 1435 } 1436 } 1437 1438 static int kvm_irqchip_get_virq(KVMState *s) 1439 { 1440 int next_virq; 1441 1442 /* 1443 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1444 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1445 * number can succeed even though a new route entry cannot be added. 1446 * When this happens, flush dynamic MSI entries to free IRQ route entries. 1447 */ 1448 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1449 kvm_flush_dynamic_msi_routes(s); 1450 } 1451 1452 /* Return the lowest unused GSI in the bitmap */ 1453 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1454 if (next_virq >= s->gsi_count) { 1455 return -ENOSPC; 1456 } else { 1457 return next_virq; 1458 } 1459 } 1460 1461 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1462 { 1463 unsigned int hash = kvm_hash_msi(msg.data); 1464 KVMMSIRoute *route; 1465 1466 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1467 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1468 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1469 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1470 return route; 1471 } 1472 } 1473 return NULL; 1474 } 1475 1476 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1477 { 1478 struct kvm_msi msi; 1479 KVMMSIRoute *route; 1480 1481 if (kvm_direct_msi_allowed) { 1482 msi.address_lo = (uint32_t)msg.address; 1483 msi.address_hi = msg.address >> 32; 1484 msi.data = le32_to_cpu(msg.data); 1485 msi.flags = 0; 1486 memset(msi.pad, 0, sizeof(msi.pad)); 1487 1488 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1489 } 1490 1491 route = kvm_lookup_msi_route(s, msg); 1492 if (!route) { 1493 int virq; 1494 1495 virq = kvm_irqchip_get_virq(s); 1496 if (virq < 0) { 1497 return virq; 1498 } 1499 1500 route = g_malloc0(sizeof(KVMMSIRoute)); 1501 route->kroute.gsi = virq; 1502 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1503 route->kroute.flags = 0; 1504 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1505 route->kroute.u.msi.address_hi = msg.address >> 32; 1506 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1507 1508 kvm_add_routing_entry(s, &route->kroute); 1509 kvm_irqchip_commit_routes(s); 1510 1511 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1512 entry); 1513 } 1514 1515 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1516 1517 return kvm_set_irq(s, route->kroute.gsi, 1); 1518 } 1519 1520 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1521 { 1522 struct kvm_irq_routing_entry kroute = {}; 1523 int virq; 1524 MSIMessage msg = {0, 0}; 1525 1526 if (pci_available && dev) { 1527 msg = 
pci_get_msi_message(dev, vector); 1528 } 1529 1530 if (kvm_gsi_direct_mapping()) { 1531 return kvm_arch_msi_data_to_gsi(msg.data); 1532 } 1533 1534 if (!kvm_gsi_routing_enabled()) { 1535 return -ENOSYS; 1536 } 1537 1538 virq = kvm_irqchip_get_virq(s); 1539 if (virq < 0) { 1540 return virq; 1541 } 1542 1543 kroute.gsi = virq; 1544 kroute.type = KVM_IRQ_ROUTING_MSI; 1545 kroute.flags = 0; 1546 kroute.u.msi.address_lo = (uint32_t)msg.address; 1547 kroute.u.msi.address_hi = msg.address >> 32; 1548 kroute.u.msi.data = le32_to_cpu(msg.data); 1549 if (pci_available && kvm_msi_devid_required()) { 1550 kroute.flags = KVM_MSI_VALID_DEVID; 1551 kroute.u.msi.devid = pci_requester_id(dev); 1552 } 1553 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1554 kvm_irqchip_release_virq(s, virq); 1555 return -EINVAL; 1556 } 1557 1558 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A", 1559 vector, virq); 1560 1561 kvm_add_routing_entry(s, &kroute); 1562 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1563 kvm_irqchip_commit_routes(s); 1564 1565 return virq; 1566 } 1567 1568 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1569 PCIDevice *dev) 1570 { 1571 struct kvm_irq_routing_entry kroute = {}; 1572 1573 if (kvm_gsi_direct_mapping()) { 1574 return 0; 1575 } 1576 1577 if (!kvm_irqchip_in_kernel()) { 1578 return -ENOSYS; 1579 } 1580 1581 kroute.gsi = virq; 1582 kroute.type = KVM_IRQ_ROUTING_MSI; 1583 kroute.flags = 0; 1584 kroute.u.msi.address_lo = (uint32_t)msg.address; 1585 kroute.u.msi.address_hi = msg.address >> 32; 1586 kroute.u.msi.data = le32_to_cpu(msg.data); 1587 if (pci_available && kvm_msi_devid_required()) { 1588 kroute.flags = KVM_MSI_VALID_DEVID; 1589 kroute.u.msi.devid = pci_requester_id(dev); 1590 } 1591 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1592 return -EINVAL; 1593 } 1594 1595 trace_kvm_irqchip_update_msi_route(virq); 1596 1597 return kvm_update_routing_entry(s, &kroute); 1598 } 1599 1600 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, 1601 bool assign) 1602 { 1603 struct kvm_irqfd irqfd = { 1604 .fd = fd, 1605 .gsi = virq, 1606 .flags = assign ? 
0 : KVM_IRQFD_FLAG_DEASSIGN, 1607 }; 1608 1609 if (rfd != -1) { 1610 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1611 irqfd.resamplefd = rfd; 1612 } 1613 1614 if (!kvm_irqfds_enabled()) { 1615 return -ENOSYS; 1616 } 1617 1618 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1619 } 1620 1621 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1622 { 1623 struct kvm_irq_routing_entry kroute = {}; 1624 int virq; 1625 1626 if (!kvm_gsi_routing_enabled()) { 1627 return -ENOSYS; 1628 } 1629 1630 virq = kvm_irqchip_get_virq(s); 1631 if (virq < 0) { 1632 return virq; 1633 } 1634 1635 kroute.gsi = virq; 1636 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1637 kroute.flags = 0; 1638 kroute.u.adapter.summary_addr = adapter->summary_addr; 1639 kroute.u.adapter.ind_addr = adapter->ind_addr; 1640 kroute.u.adapter.summary_offset = adapter->summary_offset; 1641 kroute.u.adapter.ind_offset = adapter->ind_offset; 1642 kroute.u.adapter.adapter_id = adapter->adapter_id; 1643 1644 kvm_add_routing_entry(s, &kroute); 1645 1646 return virq; 1647 } 1648 1649 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1650 { 1651 struct kvm_irq_routing_entry kroute = {}; 1652 int virq; 1653 1654 if (!kvm_gsi_routing_enabled()) { 1655 return -ENOSYS; 1656 } 1657 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1658 return -ENOSYS; 1659 } 1660 virq = kvm_irqchip_get_virq(s); 1661 if (virq < 0) { 1662 return virq; 1663 } 1664 1665 kroute.gsi = virq; 1666 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1667 kroute.flags = 0; 1668 kroute.u.hv_sint.vcpu = vcpu; 1669 kroute.u.hv_sint.sint = sint; 1670 1671 kvm_add_routing_entry(s, &kroute); 1672 kvm_irqchip_commit_routes(s); 1673 1674 return virq; 1675 } 1676 1677 #else /* !KVM_CAP_IRQ_ROUTING */ 1678 1679 void kvm_init_irq_routing(KVMState *s) 1680 { 1681 } 1682 1683 void kvm_irqchip_release_virq(KVMState *s, int virq) 1684 { 1685 } 1686 1687 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1688 { 1689 abort(); 1690 } 1691 1692 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1693 { 1694 return -ENOSYS; 1695 } 1696 1697 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1698 { 1699 return -ENOSYS; 1700 } 1701 1702 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1703 { 1704 return -ENOSYS; 1705 } 1706 1707 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) 1708 { 1709 abort(); 1710 } 1711 1712 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1713 { 1714 return -ENOSYS; 1715 } 1716 #endif /* !KVM_CAP_IRQ_ROUTING */ 1717 1718 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1719 EventNotifier *rn, int virq) 1720 { 1721 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), 1722 rn ? 
event_notifier_get_fd(rn) : -1, virq, true); 1723 } 1724 1725 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1726 int virq) 1727 { 1728 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, 1729 false); 1730 } 1731 1732 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 1733 EventNotifier *rn, qemu_irq irq) 1734 { 1735 gpointer key, gsi; 1736 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1737 1738 if (!found) { 1739 return -ENXIO; 1740 } 1741 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 1742 } 1743 1744 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 1745 qemu_irq irq) 1746 { 1747 gpointer key, gsi; 1748 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1749 1750 if (!found) { 1751 return -ENXIO; 1752 } 1753 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 1754 } 1755 1756 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 1757 { 1758 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 1759 } 1760 1761 static void kvm_irqchip_create(MachineState *machine, KVMState *s) 1762 { 1763 int ret; 1764 1765 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 1766 ; 1767 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 1768 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 1769 if (ret < 0) { 1770 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 1771 exit(1); 1772 } 1773 } else { 1774 return; 1775 } 1776 1777 /* First probe and see if there's a arch-specific hook to create the 1778 * in-kernel irqchip for us */ 1779 ret = kvm_arch_irqchip_create(machine, s); 1780 if (ret == 0) { 1781 if (machine_kernel_irqchip_split(machine)) { 1782 perror("Split IRQ chip mode not supported."); 1783 exit(1); 1784 } else { 1785 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 1786 } 1787 } 1788 if (ret < 0) { 1789 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 1790 exit(1); 1791 } 1792 1793 kvm_kernel_irqchip = true; 1794 /* If we have an in-kernel IRQ chip then we must have asynchronous 1795 * interrupt delivery (though the reverse is not necessarily true) 1796 */ 1797 kvm_async_interrupts_allowed = true; 1798 kvm_halt_in_kernel_allowed = true; 1799 1800 kvm_init_irq_routing(s); 1801 1802 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 1803 } 1804 1805 /* Find number of supported CPUs using the recommended 1806 * procedure from the kernel API documentation to cope with 1807 * older kernels that may be missing capabilities. 1808 */ 1809 static int kvm_recommended_vcpus(KVMState *s) 1810 { 1811 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 1812 return (ret) ? ret : 4; 1813 } 1814 1815 static int kvm_max_vcpus(KVMState *s) 1816 { 1817 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 1818 return (ret) ? ret : kvm_recommended_vcpus(s); 1819 } 1820 1821 static int kvm_max_vcpu_id(KVMState *s) 1822 { 1823 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 1824 return (ret) ? 
ret : kvm_max_vcpus(s); 1825 } 1826 1827 bool kvm_vcpu_id_is_valid(int vcpu_id) 1828 { 1829 KVMState *s = KVM_STATE(current_machine->accelerator); 1830 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1831 } 1832 1833 static int kvm_init(MachineState *ms) 1834 { 1835 MachineClass *mc = MACHINE_GET_CLASS(ms); 1836 static const char upgrade_note[] = 1837 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1838 "(see http://sourceforge.net/projects/kvm).\n"; 1839 struct { 1840 const char *name; 1841 int num; 1842 } num_cpus[] = { 1843 { "SMP", ms->smp.cpus }, 1844 { "hotpluggable", ms->smp.max_cpus }, 1845 { NULL, } 1846 }, *nc = num_cpus; 1847 int soft_vcpus_limit, hard_vcpus_limit; 1848 KVMState *s; 1849 const KVMCapabilityInfo *missing_cap; 1850 int ret; 1851 int type = 0; 1852 const char *kvm_type; 1853 1854 s = KVM_STATE(ms->accelerator); 1855 1856 /* 1857 * On systems where the kernel can support different base page 1858 * sizes, host page size may be different from TARGET_PAGE_SIZE, 1859 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 1860 * page size for the system though. 1861 */ 1862 assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size); 1863 1864 s->sigmask_len = 8; 1865 1866 #ifdef KVM_CAP_SET_GUEST_DEBUG 1867 QTAILQ_INIT(&s->kvm_sw_breakpoints); 1868 #endif 1869 QLIST_INIT(&s->kvm_parked_vcpus); 1870 s->vmfd = -1; 1871 s->fd = qemu_open("/dev/kvm", O_RDWR); 1872 if (s->fd == -1) { 1873 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 1874 ret = -errno; 1875 goto err; 1876 } 1877 1878 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 1879 if (ret < KVM_API_VERSION) { 1880 if (ret >= 0) { 1881 ret = -EINVAL; 1882 } 1883 fprintf(stderr, "kvm version too old\n"); 1884 goto err; 1885 } 1886 1887 if (ret > KVM_API_VERSION) { 1888 ret = -EINVAL; 1889 fprintf(stderr, "kvm version not supported\n"); 1890 goto err; 1891 } 1892 1893 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 1894 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 1895 1896 /* If unspecified, use the default value */ 1897 if (!s->nr_slots) { 1898 s->nr_slots = 32; 1899 } 1900 1901 s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); 1902 if (s->nr_as <= 1) { 1903 s->nr_as = 1; 1904 } 1905 s->as = g_new0(struct KVMAs, s->nr_as); 1906 1907 kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type"); 1908 if (mc->kvm_type) { 1909 type = mc->kvm_type(ms, kvm_type); 1910 } else if (kvm_type) { 1911 ret = -EINVAL; 1912 fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type); 1913 goto err; 1914 } 1915 1916 do { 1917 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 1918 } while (ret == -EINTR); 1919 1920 if (ret < 0) { 1921 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 1922 strerror(-ret)); 1923 1924 #ifdef TARGET_S390X 1925 if (ret == -EINVAL) { 1926 fprintf(stderr, 1927 "Host kernel setup problem detected. 
Please verify:\n"); 1928 fprintf(stderr, "- for kernels supporting the switch_amode or" 1929 " user_mode parameters, whether\n"); 1930 fprintf(stderr, 1931 " user space is running in primary address space\n"); 1932 fprintf(stderr, 1933 "- for kernels supporting the vm.allocate_pgste sysctl, " 1934 "whether it is enabled\n"); 1935 } 1936 #endif 1937 goto err; 1938 } 1939 1940 s->vmfd = ret; 1941 1942 /* check the vcpu limits */ 1943 soft_vcpus_limit = kvm_recommended_vcpus(s); 1944 hard_vcpus_limit = kvm_max_vcpus(s); 1945 1946 while (nc->name) { 1947 if (nc->num > soft_vcpus_limit) { 1948 warn_report("Number of %s cpus requested (%d) exceeds " 1949 "the recommended cpus supported by KVM (%d)", 1950 nc->name, nc->num, soft_vcpus_limit); 1951 1952 if (nc->num > hard_vcpus_limit) { 1953 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 1954 "the maximum cpus supported by KVM (%d)\n", 1955 nc->name, nc->num, hard_vcpus_limit); 1956 exit(1); 1957 } 1958 } 1959 nc++; 1960 } 1961 1962 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 1963 if (!missing_cap) { 1964 missing_cap = 1965 kvm_check_extension_list(s, kvm_arch_required_capabilities); 1966 } 1967 if (missing_cap) { 1968 ret = -EINVAL; 1969 fprintf(stderr, "kvm does not support %s\n%s", 1970 missing_cap->name, upgrade_note); 1971 goto err; 1972 } 1973 1974 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 1975 s->coalesced_pio = s->coalesced_mmio && 1976 kvm_check_extension(s, KVM_CAP_COALESCED_PIO); 1977 1978 s->manual_dirty_log_protect = 1979 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); 1980 if (s->manual_dirty_log_protect) { 1981 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1); 1982 if (ret) { 1983 warn_report("Trying to enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 " 1984 "but failed. Falling back to the legacy mode. "); 1985 s->manual_dirty_log_protect = false; 1986 } 1987 } 1988 1989 #ifdef KVM_CAP_VCPU_EVENTS 1990 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 1991 #endif 1992 1993 s->robust_singlestep = 1994 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 1995 1996 #ifdef KVM_CAP_DEBUGREGS 1997 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 1998 #endif 1999 2000 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); 2001 2002 #ifdef KVM_CAP_IRQ_ROUTING 2003 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 2004 #endif 2005 2006 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 2007 2008 s->irq_set_ioctl = KVM_IRQ_LINE; 2009 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 2010 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 2011 } 2012 2013 kvm_readonly_mem_allowed = 2014 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 2015 2016 kvm_eventfds_allowed = 2017 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 2018 2019 kvm_irqfds_allowed = 2020 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 2021 2022 kvm_resamplefds_allowed = 2023 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 2024 2025 kvm_vm_attributes_allowed = 2026 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 2027 2028 kvm_ioeventfd_any_length_allowed = 2029 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 2030 2031 kvm_state = s; 2032 2033 /* 2034 * if memory encryption object is specified then initialize the memory 2035 * encryption context. 
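     * The opaque handle returned by sev_guest_init() is what
     * kvm_memcrypt_encrypt_data() later hands to sev_encrypt_data(), e.g. to
     * encrypt guest firmware images in place.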
2036 */ 2037 if (ms->memory_encryption) { 2038 kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); 2039 if (!kvm_state->memcrypt_handle) { 2040 ret = -1; 2041 goto err; 2042 } 2043 2044 kvm_state->memcrypt_encrypt_data = sev_encrypt_data; 2045 } 2046 2047 ret = kvm_arch_init(ms, s); 2048 if (ret < 0) { 2049 goto err; 2050 } 2051 2052 if (machine_kernel_irqchip_allowed(ms)) { 2053 kvm_irqchip_create(ms, s); 2054 } 2055 2056 if (kvm_eventfds_allowed) { 2057 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 2058 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 2059 } 2060 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; 2061 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; 2062 2063 kvm_memory_listener_register(s, &s->memory_listener, 2064 &address_space_memory, 0); 2065 memory_listener_register(&kvm_io_listener, 2066 &address_space_io); 2067 memory_listener_register(&kvm_coalesced_pio_listener, 2068 &address_space_io); 2069 2070 s->many_ioeventfds = kvm_check_many_ioeventfds(); 2071 2072 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 2073 if (!s->sync_mmu) { 2074 qemu_balloon_inhibit(true); 2075 } 2076 2077 return 0; 2078 2079 err: 2080 assert(ret < 0); 2081 if (s->vmfd >= 0) { 2082 close(s->vmfd); 2083 } 2084 if (s->fd != -1) { 2085 close(s->fd); 2086 } 2087 g_free(s->memory_listener.slots); 2088 2089 return ret; 2090 } 2091 2092 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 2093 { 2094 s->sigmask_len = sigmask_len; 2095 } 2096 2097 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 2098 int size, uint32_t count) 2099 { 2100 int i; 2101 uint8_t *ptr = data; 2102 2103 for (i = 0; i < count; i++) { 2104 address_space_rw(&address_space_io, port, attrs, 2105 ptr, size, 2106 direction == KVM_EXIT_IO_OUT); 2107 ptr += size; 2108 } 2109 } 2110 2111 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 2112 { 2113 fprintf(stderr, "KVM internal error. Suberror: %d\n", 2114 run->internal.suberror); 2115 2116 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 2117 int i; 2118 2119 for (i = 0; i < run->internal.ndata; ++i) { 2120 fprintf(stderr, "extra data[%d]: %"PRIx64"\n", 2121 i, (uint64_t)run->internal.data[i]); 2122 } 2123 } 2124 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 2125 fprintf(stderr, "emulation failure\n"); 2126 if (!kvm_arch_stop_on_emulation_error(cpu)) { 2127 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2128 return EXCP_INTERRUPT; 2129 } 2130 } 2131 /* FIXME: Should trigger a qmp message to let management know 2132 * something went wrong. 
2133 */ 2134 return -1; 2135 } 2136 2137 void kvm_flush_coalesced_mmio_buffer(void) 2138 { 2139 KVMState *s = kvm_state; 2140 2141 if (s->coalesced_flush_in_progress) { 2142 return; 2143 } 2144 2145 s->coalesced_flush_in_progress = true; 2146 2147 if (s->coalesced_mmio_ring) { 2148 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 2149 while (ring->first != ring->last) { 2150 struct kvm_coalesced_mmio *ent; 2151 2152 ent = &ring->coalesced_mmio[ring->first]; 2153 2154 if (ent->pio == 1) { 2155 address_space_rw(&address_space_io, ent->phys_addr, 2156 MEMTXATTRS_UNSPECIFIED, ent->data, 2157 ent->len, true); 2158 } else { 2159 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 2160 } 2161 smp_wmb(); 2162 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 2163 } 2164 } 2165 2166 s->coalesced_flush_in_progress = false; 2167 } 2168 2169 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2170 { 2171 if (!cpu->vcpu_dirty) { 2172 kvm_arch_get_registers(cpu); 2173 cpu->vcpu_dirty = true; 2174 } 2175 } 2176 2177 void kvm_cpu_synchronize_state(CPUState *cpu) 2178 { 2179 if (!cpu->vcpu_dirty) { 2180 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 2181 } 2182 } 2183 2184 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 2185 { 2186 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 2187 cpu->vcpu_dirty = false; 2188 } 2189 2190 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 2191 { 2192 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2193 } 2194 2195 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 2196 { 2197 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 2198 cpu->vcpu_dirty = false; 2199 } 2200 2201 void kvm_cpu_synchronize_post_init(CPUState *cpu) 2202 { 2203 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2204 } 2205 2206 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 2207 { 2208 cpu->vcpu_dirty = true; 2209 } 2210 2211 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 2212 { 2213 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2214 } 2215 2216 #ifdef KVM_HAVE_MCE_INJECTION 2217 static __thread void *pending_sigbus_addr; 2218 static __thread int pending_sigbus_code; 2219 static __thread bool have_sigbus_pending; 2220 #endif 2221 2222 static void kvm_cpu_kick(CPUState *cpu) 2223 { 2224 atomic_set(&cpu->kvm_run->immediate_exit, 1); 2225 } 2226 2227 static void kvm_cpu_kick_self(void) 2228 { 2229 if (kvm_immediate_exit) { 2230 kvm_cpu_kick(current_cpu); 2231 } else { 2232 qemu_cpu_kick_self(); 2233 } 2234 } 2235 2236 static void kvm_eat_signals(CPUState *cpu) 2237 { 2238 struct timespec ts = { 0, 0 }; 2239 siginfo_t siginfo; 2240 sigset_t waitset; 2241 sigset_t chkset; 2242 int r; 2243 2244 if (kvm_immediate_exit) { 2245 atomic_set(&cpu->kvm_run->immediate_exit, 0); 2246 /* Write kvm_run->immediate_exit before the cpu->exit_request 2247 * write in kvm_cpu_exec. 
2248 */ 2249 smp_wmb(); 2250 return; 2251 } 2252 2253 sigemptyset(&waitset); 2254 sigaddset(&waitset, SIG_IPI); 2255 2256 do { 2257 r = sigtimedwait(&waitset, &siginfo, &ts); 2258 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 2259 perror("sigtimedwait"); 2260 exit(1); 2261 } 2262 2263 r = sigpending(&chkset); 2264 if (r == -1) { 2265 perror("sigpending"); 2266 exit(1); 2267 } 2268 } while (sigismember(&chkset, SIG_IPI)); 2269 } 2270 2271 int kvm_cpu_exec(CPUState *cpu) 2272 { 2273 struct kvm_run *run = cpu->kvm_run; 2274 int ret, run_ret; 2275 2276 DPRINTF("kvm_cpu_exec()\n"); 2277 2278 if (kvm_arch_process_async_events(cpu)) { 2279 atomic_set(&cpu->exit_request, 0); 2280 return EXCP_HLT; 2281 } 2282 2283 qemu_mutex_unlock_iothread(); 2284 cpu_exec_start(cpu); 2285 2286 do { 2287 MemTxAttrs attrs; 2288 2289 if (cpu->vcpu_dirty) { 2290 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 2291 cpu->vcpu_dirty = false; 2292 } 2293 2294 kvm_arch_pre_run(cpu, run); 2295 if (atomic_read(&cpu->exit_request)) { 2296 DPRINTF("interrupt exit requested\n"); 2297 /* 2298 * KVM requires us to reenter the kernel after IO exits to complete 2299 * instruction emulation. This self-signal will ensure that we 2300 * leave ASAP again. 2301 */ 2302 kvm_cpu_kick_self(); 2303 } 2304 2305 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 2306 * Matching barrier in kvm_eat_signals. 2307 */ 2308 smp_rmb(); 2309 2310 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 2311 2312 attrs = kvm_arch_post_run(cpu, run); 2313 2314 #ifdef KVM_HAVE_MCE_INJECTION 2315 if (unlikely(have_sigbus_pending)) { 2316 qemu_mutex_lock_iothread(); 2317 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 2318 pending_sigbus_addr); 2319 have_sigbus_pending = false; 2320 qemu_mutex_unlock_iothread(); 2321 } 2322 #endif 2323 2324 if (run_ret < 0) { 2325 if (run_ret == -EINTR || run_ret == -EAGAIN) { 2326 DPRINTF("io window exit\n"); 2327 kvm_eat_signals(cpu); 2328 ret = EXCP_INTERRUPT; 2329 break; 2330 } 2331 fprintf(stderr, "error: kvm run failed %s\n", 2332 strerror(-run_ret)); 2333 #ifdef TARGET_PPC 2334 if (run_ret == -EBUSY) { 2335 fprintf(stderr, 2336 "This is probably because your SMT is enabled.\n" 2337 "VCPU can only run on primary threads with all " 2338 "secondary threads offline.\n"); 2339 } 2340 #endif 2341 ret = -1; 2342 break; 2343 } 2344 2345 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 2346 switch (run->exit_reason) { 2347 case KVM_EXIT_IO: 2348 DPRINTF("handle_io\n"); 2349 /* Called outside BQL */ 2350 kvm_handle_io(run->io.port, attrs, 2351 (uint8_t *)run + run->io.data_offset, 2352 run->io.direction, 2353 run->io.size, 2354 run->io.count); 2355 ret = 0; 2356 break; 2357 case KVM_EXIT_MMIO: 2358 DPRINTF("handle_mmio\n"); 2359 /* Called outside BQL */ 2360 address_space_rw(&address_space_memory, 2361 run->mmio.phys_addr, attrs, 2362 run->mmio.data, 2363 run->mmio.len, 2364 run->mmio.is_write); 2365 ret = 0; 2366 break; 2367 case KVM_EXIT_IRQ_WINDOW_OPEN: 2368 DPRINTF("irq_window_open\n"); 2369 ret = EXCP_INTERRUPT; 2370 break; 2371 case KVM_EXIT_SHUTDOWN: 2372 DPRINTF("shutdown\n"); 2373 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2374 ret = EXCP_INTERRUPT; 2375 break; 2376 case KVM_EXIT_UNKNOWN: 2377 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 2378 (uint64_t)run->hw.hardware_exit_reason); 2379 ret = -1; 2380 break; 2381 case KVM_EXIT_INTERNAL_ERROR: 2382 ret = kvm_handle_internal_error(cpu, run); 2383 break; 2384 case KVM_EXIT_SYSTEM_EVENT: 2385 switch 
        case KVM_EXIT_SYSTEM_EVENT:
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                qemu_mutex_lock_iothread();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                qemu_mutex_unlock_iothread();
                ret = 0;
                break;
            default:
                DPRINTF("kvm_arch_handle_exit\n");
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    atomic_set(&cpu->exit_request, 0);
    return ret;
}

int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_ioctl(type, arg);
    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vm_ioctl(type, arg);
    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
    ret = ioctl(cpu->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_device_ioctl(int fd, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_device_ioctl(fd, type, arg);
    ret = ioctl(fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    int ret;
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return ret ? 0 : 1;
}

int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
        .flags = 0,
    };

    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
}
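
/*
 * Read or write a single device attribute via KVM_GET_DEVICE_ATTR /
 * KVM_SET_DEVICE_ATTR and report failures through the Error API.
 */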
int kvm_device_access(int fd, int group, uint64_t attr,
                      void *val, bool write, Error **errp)
{
    struct kvm_device_attr kvmattr;
    int err;

    kvmattr.flags = 0;
    kvmattr.group = group;
    kvmattr.attr = attr;
    kvmattr.addr = (uintptr_t)val;

    err = kvm_device_ioctl(fd,
                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                           &kvmattr);
    if (err < 0) {
        error_setg_errno(errp, -err,
                         "KVM_%s_DEVICE_ATTR failed: Group %d "
                         "attr 0x%016" PRIx64,
                         write ? "SET" : "GET", group, attr);
    }
    return err;
}

bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_max_nested_state_length(void)
{
    return kvm_state->max_nested_state_len;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    int err;
};

static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *dbg_data =
        (struct kvm_set_guest_debug_data *) data.host_ptr;

    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
               RUN_ON_CPU_HOST_PTR(&data));
    return data.err;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}
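
/*
 * Software breakpoints are reference counted; the breakpoint is only
 * removed from guest memory once its last user drops it.
 */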
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

/* Called asynchronously in VCPU thread. */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread. */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked. Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
                                 hwaddr start_addr, hwaddr size)
{
    KVMState *kvm = KVM_STATE(ms->accelerator);
    int i;

    for (i = 0; i < kvm->nr_as; ++i) {
        if (kvm->as[i].as == as && kvm->as[i].ml) {
            size = MIN(kvm_max_slot_size, size);
            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
                                                    start_addr, size);
        }
    }

    return false;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);