1 /* 2 * QEMU KVM support 3 * 4 * Copyright IBM, Corp. 2008 5 * Red Hat, Inc. 2008 6 * 7 * Authors: 8 * Anthony Liguori <aliguori@us.ibm.com> 9 * Glauber Costa <gcosta@redhat.com> 10 * 11 * This work is licensed under the terms of the GNU GPL, version 2 or later. 12 * See the COPYING file in the top-level directory. 13 * 14 */ 15 16 #include "qemu/osdep.h" 17 #include <sys/ioctl.h> 18 19 #include <linux/kvm.h> 20 21 #include "qemu/atomic.h" 22 #include "qemu/option.h" 23 #include "qemu/config-file.h" 24 #include "qemu/error-report.h" 25 #include "qapi/error.h" 26 #include "hw/pci/msi.h" 27 #include "hw/pci/msix.h" 28 #include "hw/s390x/adapter.h" 29 #include "exec/gdbstub.h" 30 #include "sysemu/kvm_int.h" 31 #include "sysemu/runstate.h" 32 #include "sysemu/cpus.h" 33 #include "sysemu/sysemu.h" 34 #include "qemu/bswap.h" 35 #include "exec/memory.h" 36 #include "exec/ram_addr.h" 37 #include "exec/address-spaces.h" 38 #include "qemu/event_notifier.h" 39 #include "qemu/main-loop.h" 40 #include "trace.h" 41 #include "hw/irq.h" 42 #include "sysemu/sev.h" 43 #include "sysemu/balloon.h" 44 45 #include "hw/boards.h" 46 47 /* This check must be after config-host.h is included */ 48 #ifdef CONFIG_EVENTFD 49 #include <sys/eventfd.h> 50 #endif 51 52 /* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We 53 * need to use the real host PAGE_SIZE, as that's what KVM will use. 54 */ 55 #define PAGE_SIZE getpagesize() 56 57 //#define DEBUG_KVM 58 59 #ifdef DEBUG_KVM 60 #define DPRINTF(fmt, ...) \ 61 do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0) 62 #else 63 #define DPRINTF(fmt, ...) \ 64 do { } while (0) 65 #endif 66 67 #define KVM_MSI_HASHTAB_SIZE 256 68 69 struct KVMParkedVcpu { 70 unsigned long vcpu_id; 71 int kvm_fd; 72 QLIST_ENTRY(KVMParkedVcpu) node; 73 }; 74 75 struct KVMState 76 { 77 AccelState parent_obj; 78 79 int nr_slots; 80 int fd; 81 int vmfd; 82 int coalesced_mmio; 83 int coalesced_pio; 84 struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; 85 bool coalesced_flush_in_progress; 86 int vcpu_events; 87 int robust_singlestep; 88 int debugregs; 89 #ifdef KVM_CAP_SET_GUEST_DEBUG 90 QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints; 91 #endif 92 int max_nested_state_len; 93 int many_ioeventfds; 94 int intx_set_mask; 95 bool sync_mmu; 96 bool manual_dirty_log_protect; 97 /* The man page (and posix) say ioctl numbers are signed int, but 98 * they're not. 
Linux, glibc and *BSD all treat ioctl numbers as 99 * unsigned, and treating them as signed here can break things */ 100 unsigned irq_set_ioctl; 101 unsigned int sigmask_len; 102 GHashTable *gsimap; 103 #ifdef KVM_CAP_IRQ_ROUTING 104 struct kvm_irq_routing *irq_routes; 105 int nr_allocated_irq_routes; 106 unsigned long *used_gsi_bitmap; 107 unsigned int gsi_count; 108 QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE]; 109 #endif 110 KVMMemoryListener memory_listener; 111 QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus; 112 113 /* memory encryption */ 114 void *memcrypt_handle; 115 int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len); 116 117 /* For "info mtree -f" to tell if an MR is registered in KVM */ 118 int nr_as; 119 struct KVMAs { 120 KVMMemoryListener *ml; 121 AddressSpace *as; 122 } *as; 123 }; 124 125 KVMState *kvm_state; 126 bool kvm_kernel_irqchip; 127 bool kvm_split_irqchip; 128 bool kvm_async_interrupts_allowed; 129 bool kvm_halt_in_kernel_allowed; 130 bool kvm_eventfds_allowed; 131 bool kvm_irqfds_allowed; 132 bool kvm_resamplefds_allowed; 133 bool kvm_msi_via_irqfd_allowed; 134 bool kvm_gsi_routing_allowed; 135 bool kvm_gsi_direct_mapping; 136 bool kvm_allowed; 137 bool kvm_readonly_mem_allowed; 138 bool kvm_vm_attributes_allowed; 139 bool kvm_direct_msi_allowed; 140 bool kvm_ioeventfd_any_length_allowed; 141 bool kvm_msi_use_devid; 142 static bool kvm_immediate_exit; 143 144 static const KVMCapabilityInfo kvm_required_capabilites[] = { 145 KVM_CAP_INFO(USER_MEMORY), 146 KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS), 147 KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS), 148 KVM_CAP_LAST_INFO 149 }; 150 151 #define kvm_slots_lock(kml) qemu_mutex_lock(&(kml)->slots_lock) 152 #define kvm_slots_unlock(kml) qemu_mutex_unlock(&(kml)->slots_lock) 153 154 int kvm_get_max_memslots(void) 155 { 156 KVMState *s = KVM_STATE(current_machine->accelerator); 157 158 return s->nr_slots; 159 } 160 161 bool kvm_memcrypt_enabled(void) 162 { 163 if (kvm_state && kvm_state->memcrypt_handle) { 164 return true; 165 } 166 167 return false; 168 } 169 170 int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len) 171 { 172 if (kvm_state->memcrypt_handle && 173 kvm_state->memcrypt_encrypt_data) { 174 return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle, 175 ptr, len); 176 } 177 178 return 1; 179 } 180 181 /* Called with KVMMemoryListener.slots_lock held */ 182 static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml) 183 { 184 KVMState *s = kvm_state; 185 int i; 186 187 for (i = 0; i < s->nr_slots; i++) { 188 if (kml->slots[i].memory_size == 0) { 189 return &kml->slots[i]; 190 } 191 } 192 193 return NULL; 194 } 195 196 bool kvm_has_free_slot(MachineState *ms) 197 { 198 KVMState *s = KVM_STATE(ms->accelerator); 199 bool result; 200 KVMMemoryListener *kml = &s->memory_listener; 201 202 kvm_slots_lock(kml); 203 result = !!kvm_get_free_slot(kml); 204 kvm_slots_unlock(kml); 205 206 return result; 207 } 208 209 /* Called with KVMMemoryListener.slots_lock held */ 210 static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml) 211 { 212 KVMSlot *slot = kvm_get_free_slot(kml); 213 214 if (slot) { 215 return slot; 216 } 217 218 fprintf(stderr, "%s: no free slot available\n", __func__); 219 abort(); 220 } 221 222 static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml, 223 hwaddr start_addr, 224 hwaddr size) 225 { 226 KVMState *s = kvm_state; 227 int i; 228 229 for (i = 0; i < s->nr_slots; i++) { 230 KVMSlot *mem = &kml->slots[i]; 231 232 if (start_addr == mem->start_addr && size == 
mem->memory_size) { 233 return mem; 234 } 235 } 236 237 return NULL; 238 } 239 240 /* 241 * Calculate and align the start address and the size of the section. 242 * Return the size. If the size is 0, the aligned section is empty. 243 */ 244 static hwaddr kvm_align_section(MemoryRegionSection *section, 245 hwaddr *start) 246 { 247 hwaddr size = int128_get64(section->size); 248 hwaddr delta, aligned; 249 250 /* kvm works in page size chunks, but the function may be called 251 with sub-page size and unaligned start address. Pad the start 252 address to next and truncate size to previous page boundary. */ 253 aligned = ROUND_UP(section->offset_within_address_space, 254 qemu_real_host_page_size); 255 delta = aligned - section->offset_within_address_space; 256 *start = aligned; 257 if (delta > size) { 258 return 0; 259 } 260 261 return (size - delta) & qemu_real_host_page_mask; 262 } 263 264 int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, 265 hwaddr *phys_addr) 266 { 267 KVMMemoryListener *kml = &s->memory_listener; 268 int i, ret = 0; 269 270 kvm_slots_lock(kml); 271 for (i = 0; i < s->nr_slots; i++) { 272 KVMSlot *mem = &kml->slots[i]; 273 274 if (ram >= mem->ram && ram < mem->ram + mem->memory_size) { 275 *phys_addr = mem->start_addr + (ram - mem->ram); 276 ret = 1; 277 break; 278 } 279 } 280 kvm_slots_unlock(kml); 281 282 return ret; 283 } 284 285 static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) 286 { 287 KVMState *s = kvm_state; 288 struct kvm_userspace_memory_region mem; 289 int ret; 290 291 mem.slot = slot->slot | (kml->as_id << 16); 292 mem.guest_phys_addr = slot->start_addr; 293 mem.userspace_addr = (unsigned long)slot->ram; 294 mem.flags = slot->flags; 295 296 if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { 297 /* Set the slot size to 0 before setting the slot to the desired 298 * value. This is needed based on KVM commit 75d61fbc. 
*/ 299 mem.memory_size = 0; 300 kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 301 } 302 mem.memory_size = slot->memory_size; 303 ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); 304 slot->old_flags = mem.flags; 305 trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, 306 mem.memory_size, mem.userspace_addr, ret); 307 return ret; 308 } 309 310 int kvm_destroy_vcpu(CPUState *cpu) 311 { 312 KVMState *s = kvm_state; 313 long mmap_size; 314 struct KVMParkedVcpu *vcpu = NULL; 315 int ret = 0; 316 317 DPRINTF("kvm_destroy_vcpu\n"); 318 319 ret = kvm_arch_destroy_vcpu(cpu); 320 if (ret < 0) { 321 goto err; 322 } 323 324 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 325 if (mmap_size < 0) { 326 ret = mmap_size; 327 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 328 goto err; 329 } 330 331 ret = munmap(cpu->kvm_run, mmap_size); 332 if (ret < 0) { 333 goto err; 334 } 335 336 vcpu = g_malloc0(sizeof(*vcpu)); 337 vcpu->vcpu_id = kvm_arch_vcpu_id(cpu); 338 vcpu->kvm_fd = cpu->kvm_fd; 339 QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node); 340 err: 341 return ret; 342 } 343 344 static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id) 345 { 346 struct KVMParkedVcpu *cpu; 347 348 QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) { 349 if (cpu->vcpu_id == vcpu_id) { 350 int kvm_fd; 351 352 QLIST_REMOVE(cpu, node); 353 kvm_fd = cpu->kvm_fd; 354 g_free(cpu); 355 return kvm_fd; 356 } 357 } 358 359 return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id); 360 } 361 362 int kvm_init_vcpu(CPUState *cpu) 363 { 364 KVMState *s = kvm_state; 365 long mmap_size; 366 int ret; 367 368 DPRINTF("kvm_init_vcpu\n"); 369 370 ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu)); 371 if (ret < 0) { 372 DPRINTF("kvm_create_vcpu failed\n"); 373 goto err; 374 } 375 376 cpu->kvm_fd = ret; 377 cpu->kvm_state = s; 378 cpu->vcpu_dirty = true; 379 380 mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0); 381 if (mmap_size < 0) { 382 ret = mmap_size; 383 DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n"); 384 goto err; 385 } 386 387 cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, 388 cpu->kvm_fd, 0); 389 if (cpu->kvm_run == MAP_FAILED) { 390 ret = -errno; 391 DPRINTF("mmap'ing vcpu state failed\n"); 392 goto err; 393 } 394 395 if (s->coalesced_mmio && !s->coalesced_mmio_ring) { 396 s->coalesced_mmio_ring = 397 (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; 398 } 399 400 ret = kvm_arch_init_vcpu(cpu); 401 err: 402 return ret; 403 } 404 405 /* 406 * dirty pages logging control 407 */ 408 409 static int kvm_mem_flags(MemoryRegion *mr) 410 { 411 bool readonly = mr->readonly || memory_region_is_romd(mr); 412 int flags = 0; 413 414 if (memory_region_get_dirty_log_mask(mr) != 0) { 415 flags |= KVM_MEM_LOG_DIRTY_PAGES; 416 } 417 if (readonly && kvm_readonly_mem_allowed) { 418 flags |= KVM_MEM_READONLY; 419 } 420 return flags; 421 } 422 423 /* Called with KVMMemoryListener.slots_lock held */ 424 static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem, 425 MemoryRegion *mr) 426 { 427 mem->flags = kvm_mem_flags(mr); 428 429 /* If nothing changed effectively, no need to issue ioctl */ 430 if (mem->flags == mem->old_flags) { 431 return 0; 432 } 433 434 return kvm_set_user_memory_region(kml, mem, false); 435 } 436 437 static int kvm_section_update_flags(KVMMemoryListener *kml, 438 MemoryRegionSection *section) 439 { 440 hwaddr start_addr, size; 441 KVMSlot *mem; 442 int ret = 0; 443 444 size = kvm_align_section(section, &start_addr); 445 if (!size) { 446 return 0; 447 } 448 449 
kvm_slots_lock(kml); 450 451 mem = kvm_lookup_matching_slot(kml, start_addr, size); 452 if (!mem) { 453 /* We don't have a slot if we want to trap every access. */ 454 goto out; 455 } 456 457 ret = kvm_slot_update_flags(kml, mem, section->mr); 458 459 out: 460 kvm_slots_unlock(kml); 461 return ret; 462 } 463 464 static void kvm_log_start(MemoryListener *listener, 465 MemoryRegionSection *section, 466 int old, int new) 467 { 468 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 469 int r; 470 471 if (old != 0) { 472 return; 473 } 474 475 r = kvm_section_update_flags(kml, section); 476 if (r < 0) { 477 abort(); 478 } 479 } 480 481 static void kvm_log_stop(MemoryListener *listener, 482 MemoryRegionSection *section, 483 int old, int new) 484 { 485 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 486 int r; 487 488 if (new != 0) { 489 return; 490 } 491 492 r = kvm_section_update_flags(kml, section); 493 if (r < 0) { 494 abort(); 495 } 496 } 497 498 /* get kvm's dirty pages bitmap and update qemu's */ 499 static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section, 500 unsigned long *bitmap) 501 { 502 ram_addr_t start = section->offset_within_region + 503 memory_region_get_ram_addr(section->mr); 504 ram_addr_t pages = int128_get64(section->size) / getpagesize(); 505 506 cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages); 507 return 0; 508 } 509 510 #define ALIGN(x, y) (((x)+(y)-1) & ~((y)-1)) 511 512 /** 513 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space 514 * 515 * This function will first try to fetch dirty bitmap from the kernel, 516 * and then updates qemu's dirty bitmap. 517 * 518 * NOTE: caller must be with kml->slots_lock held. 519 * 520 * @kml: the KVM memory listener object 521 * @section: the memory section to sync the dirty bitmap with 522 */ 523 static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml, 524 MemoryRegionSection *section) 525 { 526 KVMState *s = kvm_state; 527 struct kvm_dirty_log d = {}; 528 KVMSlot *mem; 529 hwaddr start_addr, size; 530 int ret = 0; 531 532 size = kvm_align_section(section, &start_addr); 533 if (size) { 534 mem = kvm_lookup_matching_slot(kml, start_addr, size); 535 if (!mem) { 536 /* We don't have a slot if we want to trap every access. */ 537 goto out; 538 } 539 540 /* XXX bad kernel interface alert 541 * For dirty bitmap, kernel allocates array of size aligned to 542 * bits-per-long. But for case when the kernel is 64bits and 543 * the userspace is 32bits, userspace can't align to the same 544 * bits-per-long, since sizeof(long) is different between kernel 545 * and user space. This way, userspace will provide buffer which 546 * may be 4 bytes less than the kernel will use, resulting in 547 * userspace memory corruption (which is not detectable by valgrind 548 * too, in most cases). 549 * So for now, let's align to 64 instead of HOST_LONG_BITS here, in 550 * a hope that sizeof(long) won't become >8 any time soon. 
551 */ 552 size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS), 553 /*HOST_LONG_BITS*/ 64) / 8; 554 if (!mem->dirty_bmap) { 555 /* Allocate on the first log_sync, once and for all */ 556 mem->dirty_bmap = g_malloc0(size); 557 } 558 559 d.dirty_bitmap = mem->dirty_bmap; 560 d.slot = mem->slot | (kml->as_id << 16); 561 if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) { 562 DPRINTF("ioctl failed %d\n", errno); 563 ret = -1; 564 goto out; 565 } 566 567 kvm_get_dirty_pages_log_range(section, d.dirty_bitmap); 568 } 569 out: 570 return ret; 571 } 572 573 /* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */ 574 #define KVM_CLEAR_LOG_SHIFT 6 575 #define KVM_CLEAR_LOG_ALIGN (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT) 576 #define KVM_CLEAR_LOG_MASK (-KVM_CLEAR_LOG_ALIGN) 577 578 /** 579 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range 580 * 581 * NOTE: this will be a no-op if we haven't enabled manual dirty log 582 * protection in the host kernel because in that case this operation 583 * will be done within log_sync(). 584 * 585 * @kml: the kvm memory listener 586 * @section: the memory range to clear dirty bitmap 587 */ 588 static int kvm_physical_log_clear(KVMMemoryListener *kml, 589 MemoryRegionSection *section) 590 { 591 KVMState *s = kvm_state; 592 struct kvm_clear_dirty_log d; 593 uint64_t start, end, bmap_start, start_delta, bmap_npages, size; 594 unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size; 595 KVMSlot *mem = NULL; 596 int ret, i; 597 598 if (!s->manual_dirty_log_protect) { 599 /* No need to do explicit clear */ 600 return 0; 601 } 602 603 start = section->offset_within_address_space; 604 size = int128_get64(section->size); 605 606 if (!size) { 607 /* Nothing more we can do... */ 608 return 0; 609 } 610 611 kvm_slots_lock(kml); 612 613 /* Find any possible slot that covers the section */ 614 for (i = 0; i < s->nr_slots; i++) { 615 mem = &kml->slots[i]; 616 if (mem->start_addr <= start && 617 start + size <= mem->start_addr + mem->memory_size) { 618 break; 619 } 620 } 621 622 /* 623 * We should always find one memslot until this point, otherwise 624 * there could be something wrong from the upper layer 625 */ 626 assert(mem && i != s->nr_slots); 627 628 /* 629 * We need to extend either the start or the size or both to 630 * satisfy the KVM interface requirement. Firstly, do the start 631 * page alignment on 64 host pages 632 */ 633 bmap_start = (start - mem->start_addr) & KVM_CLEAR_LOG_MASK; 634 start_delta = start - mem->start_addr - bmap_start; 635 bmap_start /= psize; 636 637 /* 638 * The kernel interface has restriction on the size too, that either: 639 * 640 * (1) the size is 64 host pages aligned (just like the start), or 641 * (2) the size fills up until the end of the KVM memslot. 642 */ 643 bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN) 644 << KVM_CLEAR_LOG_SHIFT; 645 end = mem->memory_size / psize; 646 if (bmap_npages > end - bmap_start) { 647 bmap_npages = end - bmap_start; 648 } 649 start_delta /= psize; 650 651 /* 652 * Prepare the bitmap to clear dirty bits. Here we must guarantee 653 * that we won't clear any unknown dirty bits otherwise we might 654 * accidentally clear some set bits which are not yet synced from 655 * the kernel into QEMU's bitmap, then we'll lose track of the 656 * guest modifications upon those pages (which can directly lead 657 * to guest data loss or panic after migration). 
658 * 659 * Layout of the KVMSlot.dirty_bmap: 660 * 661 * |<-------- bmap_npages -----------..>| 662 * [1] 663 * start_delta size 664 * |----------------|-------------|------------------|------------| 665 * ^ ^ ^ ^ 666 * | | | | 667 * start bmap_start (start) end 668 * of memslot of memslot 669 * 670 * [1] bmap_npages can be aligned to either 64 pages or the end of slot 671 */ 672 673 assert(bmap_start % BITS_PER_LONG == 0); 674 /* We should never do log_clear before log_sync */ 675 assert(mem->dirty_bmap); 676 if (start_delta) { 677 /* Slow path - we need to manipulate a temp bitmap */ 678 bmap_clear = bitmap_new(bmap_npages); 679 bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap, 680 bmap_start, start_delta + size / psize); 681 /* 682 * We need to fill the holes at start because that was not 683 * specified by the caller and we extended the bitmap only for 684 * 64 pages alignment 685 */ 686 bitmap_clear(bmap_clear, 0, start_delta); 687 d.dirty_bitmap = bmap_clear; 688 } else { 689 /* Fast path - start address aligns well with BITS_PER_LONG */ 690 d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start); 691 } 692 693 d.first_page = bmap_start; 694 /* It should never overflow. If it happens, say something */ 695 assert(bmap_npages <= UINT32_MAX); 696 d.num_pages = bmap_npages; 697 d.slot = mem->slot | (kml->as_id << 16); 698 699 if (kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d) == -1) { 700 ret = -errno; 701 error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, " 702 "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d", 703 __func__, d.slot, (uint64_t)d.first_page, 704 (uint32_t)d.num_pages, ret); 705 } else { 706 ret = 0; 707 trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages); 708 } 709 710 /* 711 * After we have updated the remote dirty bitmap, we update the 712 * cached bitmap as well for the memslot, then if another user 713 * clears the same region we know we shouldn't clear it again on 714 * the remote otherwise it's data loss as well. 
715 */ 716 bitmap_clear(mem->dirty_bmap, bmap_start + start_delta, 717 size / psize); 718 /* This handles the NULL case well */ 719 g_free(bmap_clear); 720 721 kvm_slots_unlock(kml); 722 723 return ret; 724 } 725 726 static void kvm_coalesce_mmio_region(MemoryListener *listener, 727 MemoryRegionSection *secion, 728 hwaddr start, hwaddr size) 729 { 730 KVMState *s = kvm_state; 731 732 if (s->coalesced_mmio) { 733 struct kvm_coalesced_mmio_zone zone; 734 735 zone.addr = start; 736 zone.size = size; 737 zone.pad = 0; 738 739 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 740 } 741 } 742 743 static void kvm_uncoalesce_mmio_region(MemoryListener *listener, 744 MemoryRegionSection *secion, 745 hwaddr start, hwaddr size) 746 { 747 KVMState *s = kvm_state; 748 749 if (s->coalesced_mmio) { 750 struct kvm_coalesced_mmio_zone zone; 751 752 zone.addr = start; 753 zone.size = size; 754 zone.pad = 0; 755 756 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 757 } 758 } 759 760 static void kvm_coalesce_pio_add(MemoryListener *listener, 761 MemoryRegionSection *section, 762 hwaddr start, hwaddr size) 763 { 764 KVMState *s = kvm_state; 765 766 if (s->coalesced_pio) { 767 struct kvm_coalesced_mmio_zone zone; 768 769 zone.addr = start; 770 zone.size = size; 771 zone.pio = 1; 772 773 (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone); 774 } 775 } 776 777 static void kvm_coalesce_pio_del(MemoryListener *listener, 778 MemoryRegionSection *section, 779 hwaddr start, hwaddr size) 780 { 781 KVMState *s = kvm_state; 782 783 if (s->coalesced_pio) { 784 struct kvm_coalesced_mmio_zone zone; 785 786 zone.addr = start; 787 zone.size = size; 788 zone.pio = 1; 789 790 (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone); 791 } 792 } 793 794 static MemoryListener kvm_coalesced_pio_listener = { 795 .coalesced_io_add = kvm_coalesce_pio_add, 796 .coalesced_io_del = kvm_coalesce_pio_del, 797 }; 798 799 int kvm_check_extension(KVMState *s, unsigned int extension) 800 { 801 int ret; 802 803 ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension); 804 if (ret < 0) { 805 ret = 0; 806 } 807 808 return ret; 809 } 810 811 int kvm_vm_check_extension(KVMState *s, unsigned int extension) 812 { 813 int ret; 814 815 ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension); 816 if (ret < 0) { 817 /* VM wide version not implemented, use global one instead */ 818 ret = kvm_check_extension(s, extension); 819 } 820 821 return ret; 822 } 823 824 static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size) 825 { 826 #if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN) 827 /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN 828 * endianness, but the memory core hands them in target endianness. 829 * For example, PPC is always treated as big-endian even if running 830 * on KVM and on PPC64LE. Correct here. 831 */ 832 switch (size) { 833 case 2: 834 val = bswap16(val); 835 break; 836 case 4: 837 val = bswap32(val); 838 break; 839 } 840 #endif 841 return val; 842 } 843 844 static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val, 845 bool assign, uint32_t size, bool datamatch) 846 { 847 int ret; 848 struct kvm_ioeventfd iofd = { 849 .datamatch = datamatch ? 
adjust_ioeventfd_endianness(val, size) : 0, 850 .addr = addr, 851 .len = size, 852 .flags = 0, 853 .fd = fd, 854 }; 855 856 trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size, 857 datamatch); 858 if (!kvm_enabled()) { 859 return -ENOSYS; 860 } 861 862 if (datamatch) { 863 iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 864 } 865 if (!assign) { 866 iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 867 } 868 869 ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd); 870 871 if (ret < 0) { 872 return -errno; 873 } 874 875 return 0; 876 } 877 878 static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val, 879 bool assign, uint32_t size, bool datamatch) 880 { 881 struct kvm_ioeventfd kick = { 882 .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0, 883 .addr = addr, 884 .flags = KVM_IOEVENTFD_FLAG_PIO, 885 .len = size, 886 .fd = fd, 887 }; 888 int r; 889 trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch); 890 if (!kvm_enabled()) { 891 return -ENOSYS; 892 } 893 if (datamatch) { 894 kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH; 895 } 896 if (!assign) { 897 kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN; 898 } 899 r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick); 900 if (r < 0) { 901 return r; 902 } 903 return 0; 904 } 905 906 907 static int kvm_check_many_ioeventfds(void) 908 { 909 /* Userspace can use ioeventfd for io notification. This requires a host 910 * that supports eventfd(2) and an I/O thread; since eventfd does not 911 * support SIGIO it cannot interrupt the vcpu. 912 * 913 * Older kernels have a 6 device limit on the KVM io bus. Find out so we 914 * can avoid creating too many ioeventfds. 915 */ 916 #if defined(CONFIG_EVENTFD) 917 int ioeventfds[7]; 918 int i, ret = 0; 919 for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) { 920 ioeventfds[i] = eventfd(0, EFD_CLOEXEC); 921 if (ioeventfds[i] < 0) { 922 break; 923 } 924 ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true); 925 if (ret < 0) { 926 close(ioeventfds[i]); 927 break; 928 } 929 } 930 931 /* Decide whether many devices are supported or not */ 932 ret = i == ARRAY_SIZE(ioeventfds); 933 934 while (i-- > 0) { 935 kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true); 936 close(ioeventfds[i]); 937 } 938 return ret; 939 #else 940 return 0; 941 #endif 942 } 943 944 static const KVMCapabilityInfo * 945 kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list) 946 { 947 while (list->name) { 948 if (!kvm_check_extension(s, list->value)) { 949 return list; 950 } 951 list++; 952 } 953 return NULL; 954 } 955 956 static void kvm_set_phys_mem(KVMMemoryListener *kml, 957 MemoryRegionSection *section, bool add) 958 { 959 KVMSlot *mem; 960 int err; 961 MemoryRegion *mr = section->mr; 962 bool writeable = !mr->readonly && !mr->rom_device; 963 hwaddr start_addr, size; 964 void *ram; 965 966 if (!memory_region_is_ram(mr)) { 967 if (writeable || !kvm_readonly_mem_allowed) { 968 return; 969 } else if (!mr->romd_mode) { 970 /* If the memory device is not in romd_mode, then we actually want 971 * to remove the kvm memory slot so all accesses will trap. 
*/ 972 add = false; 973 } 974 } 975 976 size = kvm_align_section(section, &start_addr); 977 if (!size) { 978 return; 979 } 980 981 /* use aligned delta to align the ram address */ 982 ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + 983 (start_addr - section->offset_within_address_space); 984 985 kvm_slots_lock(kml); 986 987 if (!add) { 988 mem = kvm_lookup_matching_slot(kml, start_addr, size); 989 if (!mem) { 990 goto out; 991 } 992 if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) { 993 kvm_physical_sync_dirty_bitmap(kml, section); 994 } 995 996 /* unregister the slot */ 997 g_free(mem->dirty_bmap); 998 mem->dirty_bmap = NULL; 999 mem->memory_size = 0; 1000 mem->flags = 0; 1001 err = kvm_set_user_memory_region(kml, mem, false); 1002 if (err) { 1003 fprintf(stderr, "%s: error unregistering slot: %s\n", 1004 __func__, strerror(-err)); 1005 abort(); 1006 } 1007 goto out; 1008 } 1009 1010 /* register the new slot */ 1011 mem = kvm_alloc_slot(kml); 1012 mem->memory_size = size; 1013 mem->start_addr = start_addr; 1014 mem->ram = ram; 1015 mem->flags = kvm_mem_flags(mr); 1016 1017 err = kvm_set_user_memory_region(kml, mem, true); 1018 if (err) { 1019 fprintf(stderr, "%s: error registering slot: %s\n", __func__, 1020 strerror(-err)); 1021 abort(); 1022 } 1023 1024 out: 1025 kvm_slots_unlock(kml); 1026 } 1027 1028 static void kvm_region_add(MemoryListener *listener, 1029 MemoryRegionSection *section) 1030 { 1031 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1032 1033 memory_region_ref(section->mr); 1034 kvm_set_phys_mem(kml, section, true); 1035 } 1036 1037 static void kvm_region_del(MemoryListener *listener, 1038 MemoryRegionSection *section) 1039 { 1040 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1041 1042 kvm_set_phys_mem(kml, section, false); 1043 memory_region_unref(section->mr); 1044 } 1045 1046 static void kvm_log_sync(MemoryListener *listener, 1047 MemoryRegionSection *section) 1048 { 1049 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1050 int r; 1051 1052 kvm_slots_lock(kml); 1053 r = kvm_physical_sync_dirty_bitmap(kml, section); 1054 kvm_slots_unlock(kml); 1055 if (r < 0) { 1056 abort(); 1057 } 1058 } 1059 1060 static void kvm_log_clear(MemoryListener *listener, 1061 MemoryRegionSection *section) 1062 { 1063 KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener); 1064 int r; 1065 1066 r = kvm_physical_log_clear(kml, section); 1067 if (r < 0) { 1068 error_report_once("%s: kvm log clear failed: mr=%s " 1069 "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__, 1070 section->mr->name, section->offset_within_region, 1071 int128_get64(section->size)); 1072 abort(); 1073 } 1074 } 1075 1076 static void kvm_mem_ioeventfd_add(MemoryListener *listener, 1077 MemoryRegionSection *section, 1078 bool match_data, uint64_t data, 1079 EventNotifier *e) 1080 { 1081 int fd = event_notifier_get_fd(e); 1082 int r; 1083 1084 r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space, 1085 data, true, int128_get64(section->size), 1086 match_data); 1087 if (r < 0) { 1088 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1089 __func__, strerror(-r), -r); 1090 abort(); 1091 } 1092 } 1093 1094 static void kvm_mem_ioeventfd_del(MemoryListener *listener, 1095 MemoryRegionSection *section, 1096 bool match_data, uint64_t data, 1097 EventNotifier *e) 1098 { 1099 int fd = event_notifier_get_fd(e); 1100 int r; 1101 1102 r = kvm_set_ioeventfd_mmio(fd, 
section->offset_within_address_space, 1103 data, false, int128_get64(section->size), 1104 match_data); 1105 if (r < 0) { 1106 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1107 __func__, strerror(-r), -r); 1108 abort(); 1109 } 1110 } 1111 1112 static void kvm_io_ioeventfd_add(MemoryListener *listener, 1113 MemoryRegionSection *section, 1114 bool match_data, uint64_t data, 1115 EventNotifier *e) 1116 { 1117 int fd = event_notifier_get_fd(e); 1118 int r; 1119 1120 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1121 data, true, int128_get64(section->size), 1122 match_data); 1123 if (r < 0) { 1124 fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n", 1125 __func__, strerror(-r), -r); 1126 abort(); 1127 } 1128 } 1129 1130 static void kvm_io_ioeventfd_del(MemoryListener *listener, 1131 MemoryRegionSection *section, 1132 bool match_data, uint64_t data, 1133 EventNotifier *e) 1134 1135 { 1136 int fd = event_notifier_get_fd(e); 1137 int r; 1138 1139 r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space, 1140 data, false, int128_get64(section->size), 1141 match_data); 1142 if (r < 0) { 1143 fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n", 1144 __func__, strerror(-r), -r); 1145 abort(); 1146 } 1147 } 1148 1149 void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml, 1150 AddressSpace *as, int as_id) 1151 { 1152 int i; 1153 1154 qemu_mutex_init(&kml->slots_lock); 1155 kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot)); 1156 kml->as_id = as_id; 1157 1158 for (i = 0; i < s->nr_slots; i++) { 1159 kml->slots[i].slot = i; 1160 } 1161 1162 kml->listener.region_add = kvm_region_add; 1163 kml->listener.region_del = kvm_region_del; 1164 kml->listener.log_start = kvm_log_start; 1165 kml->listener.log_stop = kvm_log_stop; 1166 kml->listener.log_sync = kvm_log_sync; 1167 kml->listener.log_clear = kvm_log_clear; 1168 kml->listener.priority = 10; 1169 1170 memory_listener_register(&kml->listener, as); 1171 1172 for (i = 0; i < s->nr_as; ++i) { 1173 if (!s->as[i].as) { 1174 s->as[i].as = as; 1175 s->as[i].ml = kml; 1176 break; 1177 } 1178 } 1179 } 1180 1181 static MemoryListener kvm_io_listener = { 1182 .eventfd_add = kvm_io_ioeventfd_add, 1183 .eventfd_del = kvm_io_ioeventfd_del, 1184 .priority = 10, 1185 }; 1186 1187 int kvm_set_irq(KVMState *s, int irq, int level) 1188 { 1189 struct kvm_irq_level event; 1190 int ret; 1191 1192 assert(kvm_async_interrupts_enabled()); 1193 1194 event.level = level; 1195 event.irq = irq; 1196 ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event); 1197 if (ret < 0) { 1198 perror("kvm_set_irq"); 1199 abort(); 1200 } 1201 1202 return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 
1 : event.status; 1203 } 1204 1205 #ifdef KVM_CAP_IRQ_ROUTING 1206 typedef struct KVMMSIRoute { 1207 struct kvm_irq_routing_entry kroute; 1208 QTAILQ_ENTRY(KVMMSIRoute) entry; 1209 } KVMMSIRoute; 1210 1211 static void set_gsi(KVMState *s, unsigned int gsi) 1212 { 1213 set_bit(gsi, s->used_gsi_bitmap); 1214 } 1215 1216 static void clear_gsi(KVMState *s, unsigned int gsi) 1217 { 1218 clear_bit(gsi, s->used_gsi_bitmap); 1219 } 1220 1221 void kvm_init_irq_routing(KVMState *s) 1222 { 1223 int gsi_count, i; 1224 1225 gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1; 1226 if (gsi_count > 0) { 1227 /* Round up so we can search ints using ffs */ 1228 s->used_gsi_bitmap = bitmap_new(gsi_count); 1229 s->gsi_count = gsi_count; 1230 } 1231 1232 s->irq_routes = g_malloc0(sizeof(*s->irq_routes)); 1233 s->nr_allocated_irq_routes = 0; 1234 1235 if (!kvm_direct_msi_allowed) { 1236 for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) { 1237 QTAILQ_INIT(&s->msi_hashtab[i]); 1238 } 1239 } 1240 1241 kvm_arch_init_irq_routing(s); 1242 } 1243 1244 void kvm_irqchip_commit_routes(KVMState *s) 1245 { 1246 int ret; 1247 1248 if (kvm_gsi_direct_mapping()) { 1249 return; 1250 } 1251 1252 if (!kvm_gsi_routing_enabled()) { 1253 return; 1254 } 1255 1256 s->irq_routes->flags = 0; 1257 trace_kvm_irqchip_commit_routes(); 1258 ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes); 1259 assert(ret == 0); 1260 } 1261 1262 static void kvm_add_routing_entry(KVMState *s, 1263 struct kvm_irq_routing_entry *entry) 1264 { 1265 struct kvm_irq_routing_entry *new; 1266 int n, size; 1267 1268 if (s->irq_routes->nr == s->nr_allocated_irq_routes) { 1269 n = s->nr_allocated_irq_routes * 2; 1270 if (n < 64) { 1271 n = 64; 1272 } 1273 size = sizeof(struct kvm_irq_routing); 1274 size += n * sizeof(*new); 1275 s->irq_routes = g_realloc(s->irq_routes, size); 1276 s->nr_allocated_irq_routes = n; 1277 } 1278 n = s->irq_routes->nr++; 1279 new = &s->irq_routes->entries[n]; 1280 1281 *new = *entry; 1282 1283 set_gsi(s, entry->gsi); 1284 } 1285 1286 static int kvm_update_routing_entry(KVMState *s, 1287 struct kvm_irq_routing_entry *new_entry) 1288 { 1289 struct kvm_irq_routing_entry *entry; 1290 int n; 1291 1292 for (n = 0; n < s->irq_routes->nr; n++) { 1293 entry = &s->irq_routes->entries[n]; 1294 if (entry->gsi != new_entry->gsi) { 1295 continue; 1296 } 1297 1298 if(!memcmp(entry, new_entry, sizeof *entry)) { 1299 return 0; 1300 } 1301 1302 *entry = *new_entry; 1303 1304 return 0; 1305 } 1306 1307 return -ESRCH; 1308 } 1309 1310 void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin) 1311 { 1312 struct kvm_irq_routing_entry e = {}; 1313 1314 assert(pin < s->gsi_count); 1315 1316 e.gsi = irq; 1317 e.type = KVM_IRQ_ROUTING_IRQCHIP; 1318 e.flags = 0; 1319 e.u.irqchip.irqchip = irqchip; 1320 e.u.irqchip.pin = pin; 1321 kvm_add_routing_entry(s, &e); 1322 } 1323 1324 void kvm_irqchip_release_virq(KVMState *s, int virq) 1325 { 1326 struct kvm_irq_routing_entry *e; 1327 int i; 1328 1329 if (kvm_gsi_direct_mapping()) { 1330 return; 1331 } 1332 1333 for (i = 0; i < s->irq_routes->nr; i++) { 1334 e = &s->irq_routes->entries[i]; 1335 if (e->gsi == virq) { 1336 s->irq_routes->nr--; 1337 *e = s->irq_routes->entries[s->irq_routes->nr]; 1338 } 1339 } 1340 clear_gsi(s, virq); 1341 kvm_arch_release_virq_post(virq); 1342 trace_kvm_irqchip_release_virq(virq); 1343 } 1344 1345 static unsigned int kvm_hash_msi(uint32_t data) 1346 { 1347 /* This is optimized for IA32 MSI layout. 
However, no other arch shall 1348 * repeat the mistake of not providing a direct MSI injection API. */ 1349 return data & 0xff; 1350 } 1351 1352 static void kvm_flush_dynamic_msi_routes(KVMState *s) 1353 { 1354 KVMMSIRoute *route, *next; 1355 unsigned int hash; 1356 1357 for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) { 1358 QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) { 1359 kvm_irqchip_release_virq(s, route->kroute.gsi); 1360 QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry); 1361 g_free(route); 1362 } 1363 } 1364 } 1365 1366 static int kvm_irqchip_get_virq(KVMState *s) 1367 { 1368 int next_virq; 1369 1370 /* 1371 * PIC and IOAPIC share the first 16 GSI numbers, thus the available 1372 * GSI numbers are more than the number of IRQ route. Allocating a GSI 1373 * number can succeed even though a new route entry cannot be added. 1374 * When this happens, flush dynamic MSI entries to free IRQ route entries. 1375 */ 1376 if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) { 1377 kvm_flush_dynamic_msi_routes(s); 1378 } 1379 1380 /* Return the lowest unused GSI in the bitmap */ 1381 next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count); 1382 if (next_virq >= s->gsi_count) { 1383 return -ENOSPC; 1384 } else { 1385 return next_virq; 1386 } 1387 } 1388 1389 static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg) 1390 { 1391 unsigned int hash = kvm_hash_msi(msg.data); 1392 KVMMSIRoute *route; 1393 1394 QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) { 1395 if (route->kroute.u.msi.address_lo == (uint32_t)msg.address && 1396 route->kroute.u.msi.address_hi == (msg.address >> 32) && 1397 route->kroute.u.msi.data == le32_to_cpu(msg.data)) { 1398 return route; 1399 } 1400 } 1401 return NULL; 1402 } 1403 1404 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1405 { 1406 struct kvm_msi msi; 1407 KVMMSIRoute *route; 1408 1409 if (kvm_direct_msi_allowed) { 1410 msi.address_lo = (uint32_t)msg.address; 1411 msi.address_hi = msg.address >> 32; 1412 msi.data = le32_to_cpu(msg.data); 1413 msi.flags = 0; 1414 memset(msi.pad, 0, sizeof(msi.pad)); 1415 1416 return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi); 1417 } 1418 1419 route = kvm_lookup_msi_route(s, msg); 1420 if (!route) { 1421 int virq; 1422 1423 virq = kvm_irqchip_get_virq(s); 1424 if (virq < 0) { 1425 return virq; 1426 } 1427 1428 route = g_malloc0(sizeof(KVMMSIRoute)); 1429 route->kroute.gsi = virq; 1430 route->kroute.type = KVM_IRQ_ROUTING_MSI; 1431 route->kroute.flags = 0; 1432 route->kroute.u.msi.address_lo = (uint32_t)msg.address; 1433 route->kroute.u.msi.address_hi = msg.address >> 32; 1434 route->kroute.u.msi.data = le32_to_cpu(msg.data); 1435 1436 kvm_add_routing_entry(s, &route->kroute); 1437 kvm_irqchip_commit_routes(s); 1438 1439 QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route, 1440 entry); 1441 } 1442 1443 assert(route->kroute.type == KVM_IRQ_ROUTING_MSI); 1444 1445 return kvm_set_irq(s, route->kroute.gsi, 1); 1446 } 1447 1448 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1449 { 1450 struct kvm_irq_routing_entry kroute = {}; 1451 int virq; 1452 MSIMessage msg = {0, 0}; 1453 1454 if (pci_available && dev) { 1455 msg = pci_get_msi_message(dev, vector); 1456 } 1457 1458 if (kvm_gsi_direct_mapping()) { 1459 return kvm_arch_msi_data_to_gsi(msg.data); 1460 } 1461 1462 if (!kvm_gsi_routing_enabled()) { 1463 return -ENOSYS; 1464 } 1465 1466 virq = kvm_irqchip_get_virq(s); 1467 if (virq < 0) { 1468 return virq; 1469 } 1470 1471 kroute.gsi = virq; 
1472 kroute.type = KVM_IRQ_ROUTING_MSI; 1473 kroute.flags = 0; 1474 kroute.u.msi.address_lo = (uint32_t)msg.address; 1475 kroute.u.msi.address_hi = msg.address >> 32; 1476 kroute.u.msi.data = le32_to_cpu(msg.data); 1477 if (pci_available && kvm_msi_devid_required()) { 1478 kroute.flags = KVM_MSI_VALID_DEVID; 1479 kroute.u.msi.devid = pci_requester_id(dev); 1480 } 1481 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1482 kvm_irqchip_release_virq(s, virq); 1483 return -EINVAL; 1484 } 1485 1486 trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A", 1487 vector, virq); 1488 1489 kvm_add_routing_entry(s, &kroute); 1490 kvm_arch_add_msi_route_post(&kroute, vector, dev); 1491 kvm_irqchip_commit_routes(s); 1492 1493 return virq; 1494 } 1495 1496 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg, 1497 PCIDevice *dev) 1498 { 1499 struct kvm_irq_routing_entry kroute = {}; 1500 1501 if (kvm_gsi_direct_mapping()) { 1502 return 0; 1503 } 1504 1505 if (!kvm_irqchip_in_kernel()) { 1506 return -ENOSYS; 1507 } 1508 1509 kroute.gsi = virq; 1510 kroute.type = KVM_IRQ_ROUTING_MSI; 1511 kroute.flags = 0; 1512 kroute.u.msi.address_lo = (uint32_t)msg.address; 1513 kroute.u.msi.address_hi = msg.address >> 32; 1514 kroute.u.msi.data = le32_to_cpu(msg.data); 1515 if (pci_available && kvm_msi_devid_required()) { 1516 kroute.flags = KVM_MSI_VALID_DEVID; 1517 kroute.u.msi.devid = pci_requester_id(dev); 1518 } 1519 if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) { 1520 return -EINVAL; 1521 } 1522 1523 trace_kvm_irqchip_update_msi_route(virq); 1524 1525 return kvm_update_routing_entry(s, &kroute); 1526 } 1527 1528 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq, 1529 bool assign) 1530 { 1531 struct kvm_irqfd irqfd = { 1532 .fd = fd, 1533 .gsi = virq, 1534 .flags = assign ? 
0 : KVM_IRQFD_FLAG_DEASSIGN, 1535 }; 1536 1537 if (rfd != -1) { 1538 irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE; 1539 irqfd.resamplefd = rfd; 1540 } 1541 1542 if (!kvm_irqfds_enabled()) { 1543 return -ENOSYS; 1544 } 1545 1546 return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd); 1547 } 1548 1549 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1550 { 1551 struct kvm_irq_routing_entry kroute = {}; 1552 int virq; 1553 1554 if (!kvm_gsi_routing_enabled()) { 1555 return -ENOSYS; 1556 } 1557 1558 virq = kvm_irqchip_get_virq(s); 1559 if (virq < 0) { 1560 return virq; 1561 } 1562 1563 kroute.gsi = virq; 1564 kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER; 1565 kroute.flags = 0; 1566 kroute.u.adapter.summary_addr = adapter->summary_addr; 1567 kroute.u.adapter.ind_addr = adapter->ind_addr; 1568 kroute.u.adapter.summary_offset = adapter->summary_offset; 1569 kroute.u.adapter.ind_offset = adapter->ind_offset; 1570 kroute.u.adapter.adapter_id = adapter->adapter_id; 1571 1572 kvm_add_routing_entry(s, &kroute); 1573 1574 return virq; 1575 } 1576 1577 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1578 { 1579 struct kvm_irq_routing_entry kroute = {}; 1580 int virq; 1581 1582 if (!kvm_gsi_routing_enabled()) { 1583 return -ENOSYS; 1584 } 1585 if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) { 1586 return -ENOSYS; 1587 } 1588 virq = kvm_irqchip_get_virq(s); 1589 if (virq < 0) { 1590 return virq; 1591 } 1592 1593 kroute.gsi = virq; 1594 kroute.type = KVM_IRQ_ROUTING_HV_SINT; 1595 kroute.flags = 0; 1596 kroute.u.hv_sint.vcpu = vcpu; 1597 kroute.u.hv_sint.sint = sint; 1598 1599 kvm_add_routing_entry(s, &kroute); 1600 kvm_irqchip_commit_routes(s); 1601 1602 return virq; 1603 } 1604 1605 #else /* !KVM_CAP_IRQ_ROUTING */ 1606 1607 void kvm_init_irq_routing(KVMState *s) 1608 { 1609 } 1610 1611 void kvm_irqchip_release_virq(KVMState *s, int virq) 1612 { 1613 } 1614 1615 int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg) 1616 { 1617 abort(); 1618 } 1619 1620 int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev) 1621 { 1622 return -ENOSYS; 1623 } 1624 1625 int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter) 1626 { 1627 return -ENOSYS; 1628 } 1629 1630 int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint) 1631 { 1632 return -ENOSYS; 1633 } 1634 1635 static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign) 1636 { 1637 abort(); 1638 } 1639 1640 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg) 1641 { 1642 return -ENOSYS; 1643 } 1644 #endif /* !KVM_CAP_IRQ_ROUTING */ 1645 1646 int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1647 EventNotifier *rn, int virq) 1648 { 1649 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), 1650 rn ? 
event_notifier_get_fd(rn) : -1, virq, true); 1651 } 1652 1653 int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n, 1654 int virq) 1655 { 1656 return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq, 1657 false); 1658 } 1659 1660 int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, 1661 EventNotifier *rn, qemu_irq irq) 1662 { 1663 gpointer key, gsi; 1664 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1665 1666 if (!found) { 1667 return -ENXIO; 1668 } 1669 return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi)); 1670 } 1671 1672 int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, 1673 qemu_irq irq) 1674 { 1675 gpointer key, gsi; 1676 gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi); 1677 1678 if (!found) { 1679 return -ENXIO; 1680 } 1681 return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi)); 1682 } 1683 1684 void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi) 1685 { 1686 g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi)); 1687 } 1688 1689 static void kvm_irqchip_create(MachineState *machine, KVMState *s) 1690 { 1691 int ret; 1692 1693 if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) { 1694 ; 1695 } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) { 1696 ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0); 1697 if (ret < 0) { 1698 fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret)); 1699 exit(1); 1700 } 1701 } else { 1702 return; 1703 } 1704 1705 /* First probe and see if there's a arch-specific hook to create the 1706 * in-kernel irqchip for us */ 1707 ret = kvm_arch_irqchip_create(machine, s); 1708 if (ret == 0) { 1709 if (machine_kernel_irqchip_split(machine)) { 1710 perror("Split IRQ chip mode not supported."); 1711 exit(1); 1712 } else { 1713 ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP); 1714 } 1715 } 1716 if (ret < 0) { 1717 fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret)); 1718 exit(1); 1719 } 1720 1721 kvm_kernel_irqchip = true; 1722 /* If we have an in-kernel IRQ chip then we must have asynchronous 1723 * interrupt delivery (though the reverse is not necessarily true) 1724 */ 1725 kvm_async_interrupts_allowed = true; 1726 kvm_halt_in_kernel_allowed = true; 1727 1728 kvm_init_irq_routing(s); 1729 1730 s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal); 1731 } 1732 1733 /* Find number of supported CPUs using the recommended 1734 * procedure from the kernel API documentation to cope with 1735 * older kernels that may be missing capabilities. 1736 */ 1737 static int kvm_recommended_vcpus(KVMState *s) 1738 { 1739 int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS); 1740 return (ret) ? ret : 4; 1741 } 1742 1743 static int kvm_max_vcpus(KVMState *s) 1744 { 1745 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS); 1746 return (ret) ? ret : kvm_recommended_vcpus(s); 1747 } 1748 1749 static int kvm_max_vcpu_id(KVMState *s) 1750 { 1751 int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID); 1752 return (ret) ? 
ret : kvm_max_vcpus(s); 1753 } 1754 1755 bool kvm_vcpu_id_is_valid(int vcpu_id) 1756 { 1757 KVMState *s = KVM_STATE(current_machine->accelerator); 1758 return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s); 1759 } 1760 1761 static int kvm_init(MachineState *ms) 1762 { 1763 MachineClass *mc = MACHINE_GET_CLASS(ms); 1764 static const char upgrade_note[] = 1765 "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n" 1766 "(see http://sourceforge.net/projects/kvm).\n"; 1767 struct { 1768 const char *name; 1769 int num; 1770 } num_cpus[] = { 1771 { "SMP", ms->smp.cpus }, 1772 { "hotpluggable", ms->smp.max_cpus }, 1773 { NULL, } 1774 }, *nc = num_cpus; 1775 int soft_vcpus_limit, hard_vcpus_limit; 1776 KVMState *s; 1777 const KVMCapabilityInfo *missing_cap; 1778 int ret; 1779 int type = 0; 1780 const char *kvm_type; 1781 1782 s = KVM_STATE(ms->accelerator); 1783 1784 /* 1785 * On systems where the kernel can support different base page 1786 * sizes, host page size may be different from TARGET_PAGE_SIZE, 1787 * even with KVM. TARGET_PAGE_SIZE is assumed to be the minimum 1788 * page size for the system though. 1789 */ 1790 assert(TARGET_PAGE_SIZE <= getpagesize()); 1791 1792 s->sigmask_len = 8; 1793 1794 #ifdef KVM_CAP_SET_GUEST_DEBUG 1795 QTAILQ_INIT(&s->kvm_sw_breakpoints); 1796 #endif 1797 QLIST_INIT(&s->kvm_parked_vcpus); 1798 s->vmfd = -1; 1799 s->fd = qemu_open("/dev/kvm", O_RDWR); 1800 if (s->fd == -1) { 1801 fprintf(stderr, "Could not access KVM kernel module: %m\n"); 1802 ret = -errno; 1803 goto err; 1804 } 1805 1806 ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0); 1807 if (ret < KVM_API_VERSION) { 1808 if (ret >= 0) { 1809 ret = -EINVAL; 1810 } 1811 fprintf(stderr, "kvm version too old\n"); 1812 goto err; 1813 } 1814 1815 if (ret > KVM_API_VERSION) { 1816 ret = -EINVAL; 1817 fprintf(stderr, "kvm version not supported\n"); 1818 goto err; 1819 } 1820 1821 kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT); 1822 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS); 1823 1824 /* If unspecified, use the default value */ 1825 if (!s->nr_slots) { 1826 s->nr_slots = 32; 1827 } 1828 1829 s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE); 1830 if (s->nr_as <= 1) { 1831 s->nr_as = 1; 1832 } 1833 s->as = g_new0(struct KVMAs, s->nr_as); 1834 1835 kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type"); 1836 if (mc->kvm_type) { 1837 type = mc->kvm_type(ms, kvm_type); 1838 } else if (kvm_type) { 1839 ret = -EINVAL; 1840 fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type); 1841 goto err; 1842 } 1843 1844 do { 1845 ret = kvm_ioctl(s, KVM_CREATE_VM, type); 1846 } while (ret == -EINTR); 1847 1848 if (ret < 0) { 1849 fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret, 1850 strerror(-ret)); 1851 1852 #ifdef TARGET_S390X 1853 if (ret == -EINVAL) { 1854 fprintf(stderr, 1855 "Host kernel setup problem detected. 
Please verify:\n"); 1856 fprintf(stderr, "- for kernels supporting the switch_amode or" 1857 " user_mode parameters, whether\n"); 1858 fprintf(stderr, 1859 " user space is running in primary address space\n"); 1860 fprintf(stderr, 1861 "- for kernels supporting the vm.allocate_pgste sysctl, " 1862 "whether it is enabled\n"); 1863 } 1864 #endif 1865 goto err; 1866 } 1867 1868 s->vmfd = ret; 1869 1870 /* check the vcpu limits */ 1871 soft_vcpus_limit = kvm_recommended_vcpus(s); 1872 hard_vcpus_limit = kvm_max_vcpus(s); 1873 1874 while (nc->name) { 1875 if (nc->num > soft_vcpus_limit) { 1876 warn_report("Number of %s cpus requested (%d) exceeds " 1877 "the recommended cpus supported by KVM (%d)", 1878 nc->name, nc->num, soft_vcpus_limit); 1879 1880 if (nc->num > hard_vcpus_limit) { 1881 fprintf(stderr, "Number of %s cpus requested (%d) exceeds " 1882 "the maximum cpus supported by KVM (%d)\n", 1883 nc->name, nc->num, hard_vcpus_limit); 1884 exit(1); 1885 } 1886 } 1887 nc++; 1888 } 1889 1890 missing_cap = kvm_check_extension_list(s, kvm_required_capabilites); 1891 if (!missing_cap) { 1892 missing_cap = 1893 kvm_check_extension_list(s, kvm_arch_required_capabilities); 1894 } 1895 if (missing_cap) { 1896 ret = -EINVAL; 1897 fprintf(stderr, "kvm does not support %s\n%s", 1898 missing_cap->name, upgrade_note); 1899 goto err; 1900 } 1901 1902 s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO); 1903 s->coalesced_pio = s->coalesced_mmio && 1904 kvm_check_extension(s, KVM_CAP_COALESCED_PIO); 1905 1906 s->manual_dirty_log_protect = 1907 kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); 1908 if (s->manual_dirty_log_protect) { 1909 ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1); 1910 if (ret) { 1911 warn_report("Trying to enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 " 1912 "but failed. Falling back to the legacy mode. "); 1913 s->manual_dirty_log_protect = false; 1914 } 1915 } 1916 1917 #ifdef KVM_CAP_VCPU_EVENTS 1918 s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS); 1919 #endif 1920 1921 s->robust_singlestep = 1922 kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP); 1923 1924 #ifdef KVM_CAP_DEBUGREGS 1925 s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS); 1926 #endif 1927 1928 s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE); 1929 1930 #ifdef KVM_CAP_IRQ_ROUTING 1931 kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0); 1932 #endif 1933 1934 s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3); 1935 1936 s->irq_set_ioctl = KVM_IRQ_LINE; 1937 if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) { 1938 s->irq_set_ioctl = KVM_IRQ_LINE_STATUS; 1939 } 1940 1941 kvm_readonly_mem_allowed = 1942 (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0); 1943 1944 kvm_eventfds_allowed = 1945 (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0); 1946 1947 kvm_irqfds_allowed = 1948 (kvm_check_extension(s, KVM_CAP_IRQFD) > 0); 1949 1950 kvm_resamplefds_allowed = 1951 (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0); 1952 1953 kvm_vm_attributes_allowed = 1954 (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0); 1955 1956 kvm_ioeventfd_any_length_allowed = 1957 (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0); 1958 1959 kvm_state = s; 1960 1961 /* 1962 * if memory encryption object is specified then initialize the memory 1963 * encryption context. 
1964 */ 1965 if (ms->memory_encryption) { 1966 kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption); 1967 if (!kvm_state->memcrypt_handle) { 1968 ret = -1; 1969 goto err; 1970 } 1971 1972 kvm_state->memcrypt_encrypt_data = sev_encrypt_data; 1973 } 1974 1975 ret = kvm_arch_init(ms, s); 1976 if (ret < 0) { 1977 goto err; 1978 } 1979 1980 if (machine_kernel_irqchip_allowed(ms)) { 1981 kvm_irqchip_create(ms, s); 1982 } 1983 1984 if (kvm_eventfds_allowed) { 1985 s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add; 1986 s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del; 1987 } 1988 s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region; 1989 s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region; 1990 1991 kvm_memory_listener_register(s, &s->memory_listener, 1992 &address_space_memory, 0); 1993 memory_listener_register(&kvm_io_listener, 1994 &address_space_io); 1995 memory_listener_register(&kvm_coalesced_pio_listener, 1996 &address_space_io); 1997 1998 s->many_ioeventfds = kvm_check_many_ioeventfds(); 1999 2000 s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU); 2001 if (!s->sync_mmu) { 2002 qemu_balloon_inhibit(true); 2003 } 2004 2005 return 0; 2006 2007 err: 2008 assert(ret < 0); 2009 if (s->vmfd >= 0) { 2010 close(s->vmfd); 2011 } 2012 if (s->fd != -1) { 2013 close(s->fd); 2014 } 2015 g_free(s->memory_listener.slots); 2016 2017 return ret; 2018 } 2019 2020 void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len) 2021 { 2022 s->sigmask_len = sigmask_len; 2023 } 2024 2025 static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction, 2026 int size, uint32_t count) 2027 { 2028 int i; 2029 uint8_t *ptr = data; 2030 2031 for (i = 0; i < count; i++) { 2032 address_space_rw(&address_space_io, port, attrs, 2033 ptr, size, 2034 direction == KVM_EXIT_IO_OUT); 2035 ptr += size; 2036 } 2037 } 2038 2039 static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run) 2040 { 2041 fprintf(stderr, "KVM internal error. Suberror: %d\n", 2042 run->internal.suberror); 2043 2044 if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) { 2045 int i; 2046 2047 for (i = 0; i < run->internal.ndata; ++i) { 2048 fprintf(stderr, "extra data[%d]: %"PRIx64"\n", 2049 i, (uint64_t)run->internal.data[i]); 2050 } 2051 } 2052 if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) { 2053 fprintf(stderr, "emulation failure\n"); 2054 if (!kvm_arch_stop_on_emulation_error(cpu)) { 2055 cpu_dump_state(cpu, stderr, CPU_DUMP_CODE); 2056 return EXCP_INTERRUPT; 2057 } 2058 } 2059 /* FIXME: Should trigger a qmp message to let management know 2060 * something went wrong. 
2061 */ 2062 return -1; 2063 } 2064 2065 void kvm_flush_coalesced_mmio_buffer(void) 2066 { 2067 KVMState *s = kvm_state; 2068 2069 if (s->coalesced_flush_in_progress) { 2070 return; 2071 } 2072 2073 s->coalesced_flush_in_progress = true; 2074 2075 if (s->coalesced_mmio_ring) { 2076 struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring; 2077 while (ring->first != ring->last) { 2078 struct kvm_coalesced_mmio *ent; 2079 2080 ent = &ring->coalesced_mmio[ring->first]; 2081 2082 if (ent->pio == 1) { 2083 address_space_rw(&address_space_io, ent->phys_addr, 2084 MEMTXATTRS_UNSPECIFIED, ent->data, 2085 ent->len, true); 2086 } else { 2087 cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len); 2088 } 2089 smp_wmb(); 2090 ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX; 2091 } 2092 } 2093 2094 s->coalesced_flush_in_progress = false; 2095 } 2096 2097 static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg) 2098 { 2099 if (!cpu->vcpu_dirty) { 2100 kvm_arch_get_registers(cpu); 2101 cpu->vcpu_dirty = true; 2102 } 2103 } 2104 2105 void kvm_cpu_synchronize_state(CPUState *cpu) 2106 { 2107 if (!cpu->vcpu_dirty) { 2108 run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL); 2109 } 2110 } 2111 2112 static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg) 2113 { 2114 kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE); 2115 cpu->vcpu_dirty = false; 2116 } 2117 2118 void kvm_cpu_synchronize_post_reset(CPUState *cpu) 2119 { 2120 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL); 2121 } 2122 2123 static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg) 2124 { 2125 kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE); 2126 cpu->vcpu_dirty = false; 2127 } 2128 2129 void kvm_cpu_synchronize_post_init(CPUState *cpu) 2130 { 2131 run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL); 2132 } 2133 2134 static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg) 2135 { 2136 cpu->vcpu_dirty = true; 2137 } 2138 2139 void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu) 2140 { 2141 run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL); 2142 } 2143 2144 #ifdef KVM_HAVE_MCE_INJECTION 2145 static __thread void *pending_sigbus_addr; 2146 static __thread int pending_sigbus_code; 2147 static __thread bool have_sigbus_pending; 2148 #endif 2149 2150 static void kvm_cpu_kick(CPUState *cpu) 2151 { 2152 atomic_set(&cpu->kvm_run->immediate_exit, 1); 2153 } 2154 2155 static void kvm_cpu_kick_self(void) 2156 { 2157 if (kvm_immediate_exit) { 2158 kvm_cpu_kick(current_cpu); 2159 } else { 2160 qemu_cpu_kick_self(); 2161 } 2162 } 2163 2164 static void kvm_eat_signals(CPUState *cpu) 2165 { 2166 struct timespec ts = { 0, 0 }; 2167 siginfo_t siginfo; 2168 sigset_t waitset; 2169 sigset_t chkset; 2170 int r; 2171 2172 if (kvm_immediate_exit) { 2173 atomic_set(&cpu->kvm_run->immediate_exit, 0); 2174 /* Write kvm_run->immediate_exit before the cpu->exit_request 2175 * write in kvm_cpu_exec. 
2176 */ 2177 smp_wmb(); 2178 return; 2179 } 2180 2181 sigemptyset(&waitset); 2182 sigaddset(&waitset, SIG_IPI); 2183 2184 do { 2185 r = sigtimedwait(&waitset, &siginfo, &ts); 2186 if (r == -1 && !(errno == EAGAIN || errno == EINTR)) { 2187 perror("sigtimedwait"); 2188 exit(1); 2189 } 2190 2191 r = sigpending(&chkset); 2192 if (r == -1) { 2193 perror("sigpending"); 2194 exit(1); 2195 } 2196 } while (sigismember(&chkset, SIG_IPI)); 2197 } 2198 2199 int kvm_cpu_exec(CPUState *cpu) 2200 { 2201 struct kvm_run *run = cpu->kvm_run; 2202 int ret, run_ret; 2203 2204 DPRINTF("kvm_cpu_exec()\n"); 2205 2206 if (kvm_arch_process_async_events(cpu)) { 2207 atomic_set(&cpu->exit_request, 0); 2208 return EXCP_HLT; 2209 } 2210 2211 qemu_mutex_unlock_iothread(); 2212 cpu_exec_start(cpu); 2213 2214 do { 2215 MemTxAttrs attrs; 2216 2217 if (cpu->vcpu_dirty) { 2218 kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE); 2219 cpu->vcpu_dirty = false; 2220 } 2221 2222 kvm_arch_pre_run(cpu, run); 2223 if (atomic_read(&cpu->exit_request)) { 2224 DPRINTF("interrupt exit requested\n"); 2225 /* 2226 * KVM requires us to reenter the kernel after IO exits to complete 2227 * instruction emulation. This self-signal will ensure that we 2228 * leave ASAP again. 2229 */ 2230 kvm_cpu_kick_self(); 2231 } 2232 2233 /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit. 2234 * Matching barrier in kvm_eat_signals. 2235 */ 2236 smp_rmb(); 2237 2238 run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0); 2239 2240 attrs = kvm_arch_post_run(cpu, run); 2241 2242 #ifdef KVM_HAVE_MCE_INJECTION 2243 if (unlikely(have_sigbus_pending)) { 2244 qemu_mutex_lock_iothread(); 2245 kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code, 2246 pending_sigbus_addr); 2247 have_sigbus_pending = false; 2248 qemu_mutex_unlock_iothread(); 2249 } 2250 #endif 2251 2252 if (run_ret < 0) { 2253 if (run_ret == -EINTR || run_ret == -EAGAIN) { 2254 DPRINTF("io window exit\n"); 2255 kvm_eat_signals(cpu); 2256 ret = EXCP_INTERRUPT; 2257 break; 2258 } 2259 fprintf(stderr, "error: kvm run failed %s\n", 2260 strerror(-run_ret)); 2261 #ifdef TARGET_PPC 2262 if (run_ret == -EBUSY) { 2263 fprintf(stderr, 2264 "This is probably because your SMT is enabled.\n" 2265 "VCPU can only run on primary threads with all " 2266 "secondary threads offline.\n"); 2267 } 2268 #endif 2269 ret = -1; 2270 break; 2271 } 2272 2273 trace_kvm_run_exit(cpu->cpu_index, run->exit_reason); 2274 switch (run->exit_reason) { 2275 case KVM_EXIT_IO: 2276 DPRINTF("handle_io\n"); 2277 /* Called outside BQL */ 2278 kvm_handle_io(run->io.port, attrs, 2279 (uint8_t *)run + run->io.data_offset, 2280 run->io.direction, 2281 run->io.size, 2282 run->io.count); 2283 ret = 0; 2284 break; 2285 case KVM_EXIT_MMIO: 2286 DPRINTF("handle_mmio\n"); 2287 /* Called outside BQL */ 2288 address_space_rw(&address_space_memory, 2289 run->mmio.phys_addr, attrs, 2290 run->mmio.data, 2291 run->mmio.len, 2292 run->mmio.is_write); 2293 ret = 0; 2294 break; 2295 case KVM_EXIT_IRQ_WINDOW_OPEN: 2296 DPRINTF("irq_window_open\n"); 2297 ret = EXCP_INTERRUPT; 2298 break; 2299 case KVM_EXIT_SHUTDOWN: 2300 DPRINTF("shutdown\n"); 2301 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 2302 ret = EXCP_INTERRUPT; 2303 break; 2304 case KVM_EXIT_UNKNOWN: 2305 fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n", 2306 (uint64_t)run->hw.hardware_exit_reason); 2307 ret = -1; 2308 break; 2309 case KVM_EXIT_INTERNAL_ERROR: 2310 ret = kvm_handle_internal_error(cpu, run); 2311 break; 2312 case KVM_EXIT_SYSTEM_EVENT: 2313 switch 
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        atomic_set(&cpu->exit_request, 0);
        return EXCP_HLT;
    }

    qemu_mutex_unlock_iothread();
    cpu_exec_start(cpu);

    do {
        MemTxAttrs attrs;

        if (cpu->vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (atomic_read(&cpu->exit_request)) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            kvm_cpu_kick_self();
        }

        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
         * Matching barrier in kvm_eat_signals.
         */
        smp_rmb();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        attrs = kvm_arch_post_run(cpu, run);

#ifdef KVM_HAVE_MCE_INJECTION
        if (unlikely(have_sigbus_pending)) {
            qemu_mutex_lock_iothread();
            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
                                    pending_sigbus_addr);
            have_sigbus_pending = false;
            qemu_mutex_unlock_iothread();
        }
#endif

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                kvm_eat_signals(cpu);
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
#ifdef TARGET_PPC
            if (run_ret == -EBUSY) {
                fprintf(stderr,
                        "This is probably because your SMT is enabled.\n"
                        "VCPU can only run on primary threads with all "
                        "secondary threads offline.\n");
            }
#endif
            ret = -1;
            break;
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* Called outside BQL */
            kvm_handle_io(run->io.port, attrs,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            /* Called outside BQL */
            address_space_rw(&address_space_memory,
                             run->mmio.phys_addr, attrs,
                             run->mmio.data,
                             run->mmio.len,
                             run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        case KVM_EXIT_SYSTEM_EVENT:
            switch (run->system_event.type) {
            case KVM_SYSTEM_EVENT_SHUTDOWN:
                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_RESET:
                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
                ret = EXCP_INTERRUPT;
                break;
            case KVM_SYSTEM_EVENT_CRASH:
                kvm_cpu_synchronize_state(cpu);
                qemu_mutex_lock_iothread();
                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
                qemu_mutex_unlock_iothread();
                ret = 0;
                break;
            default:
                DPRINTF("kvm_arch_handle_exit\n");
                ret = kvm_arch_handle_exit(cpu, run);
                break;
            }
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    cpu_exec_end(cpu);
    qemu_mutex_lock_iothread();

    if (ret < 0) {
        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    atomic_set(&cpu->exit_request, 0);
    return ret;
}

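/*
 * Thin wrappers around ioctl() for the /dev/kvm, VM, vCPU and device file
 * descriptors.  They trace the request and convert the raw ioctl()
 * convention (-1 with errno set) into a negative errno return value, which
 * is what callers in this file test for (ret < 0) and report with
 * strerror(-ret).
 */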
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_ioctl(type, arg);
    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vm_ioctl(type, arg);
    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
    ret = ioctl(cpu->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_device_ioctl(int fd, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    trace_kvm_device_ioctl(fd, type, arg);
    ret = ioctl(fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
{
    int ret;
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
    };

    if (!kvm_vm_attributes_allowed) {
        return 0;
    }

    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
    return ret ? 0 : 1;
}

int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
{
    struct kvm_device_attr attribute = {
        .group = group,
        .attr = attr,
        .flags = 0,
    };

    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
}

int kvm_device_access(int fd, int group, uint64_t attr,
                      void *val, bool write, Error **errp)
{
    struct kvm_device_attr kvmattr;
    int err;

    kvmattr.flags = 0;
    kvmattr.group = group;
    kvmattr.attr = attr;
    kvmattr.addr = (uintptr_t)val;

    err = kvm_device_ioctl(fd,
                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
                           &kvmattr);
    if (err < 0) {
        error_setg_errno(errp, -err,
                         "KVM_%s_DEVICE_ATTR failed: Group %d "
                         "attr 0x%016" PRIx64,
                         write ? "SET" : "GET", group, attr);
    }
    return err;
}

bool kvm_has_sync_mmu(void)
{
    return kvm_state->sync_mmu;
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_max_nested_state_length(void)
{
    return kvm_state->max_nested_state_len;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

int kvm_has_gsi_routing(void)
{
#ifdef KVM_CAP_IRQ_ROUTING
    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#else
    return false;
#endif
}

int kvm_has_intx_set_mask(void)
{
    return kvm_state->intx_set_mask;
}

bool kvm_arm_supports_user_irq(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
}

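/*
 * Guest debug support.  Software breakpoints are tracked per VM on
 * kvm_state->kvm_sw_breakpoints and installed/removed through the
 * per-architecture kvm_arch_insert/remove_sw_breakpoint() hooks (typically
 * by patching a trap instruction into guest memory); hardware breakpoints
 * and watchpoints are handled entirely by the architecture code.  Every
 * change is pushed to all vCPUs via kvm_update_guest_debug(), which issues
 * KVM_SET_GUEST_DEBUG on each of them.
 */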
#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *cpu)
{
    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    int err;
};

static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
{
    struct kvm_set_guest_debug_data *dbg_data =
        (struct kvm_set_guest_debug_data *) data.host_ptr;

    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
                                   &dbg_data->dbg);
}

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (cpu->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(cpu, &data.dbg);

    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
               RUN_ON_CPU_HOST_PTR(&data));
    return data.err;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
        if (err) {
            g_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(cpu, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    CPU_FOREACH(cpu) {
        err = kvm_update_guest_debug(cpu, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = cpu->kvm_state;
    CPUState *tmpcpu;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            CPU_FOREACH(tmpcpu) {
                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
                    break;
                }
            }
        }
        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
        g_free(bp);
    }
    kvm_arch_remove_all_hw_breakpoints();

    CPU_FOREACH(cpu) {
        kvm_update_guest_debug(cpu, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
{
    KVMState *s = kvm_state;
    struct kvm_signal_mask *sigmask;
    int r;

    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = s->sigmask_len;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
    g_free(sigmask);

    return r;
}

static void kvm_ipi_signal(int sig)
{
    if (current_cpu) {
        assert(kvm_immediate_exit);
        kvm_cpu_kick(current_cpu);
    }
}

void kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = kvm_ipi_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
#if defined KVM_HAVE_MCE_INJECTION
    sigdelset(&set, SIGBUS);
    pthread_sigmask(SIG_SETMASK, &set, NULL);
#endif
    sigdelset(&set, SIG_IPI);
    if (kvm_immediate_exit) {
        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
    } else {
        r = kvm_set_signal_mask(cpu, &set);
    }
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

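/*
 * SIGBUS handling for machine-check (MCE) injection.  A SIGBUS delivered to
 * a vCPU thread is recorded in the per-thread pending_sigbus_* variables
 * and cpu->exit_request is set; kvm_cpu_exec() then hands it to
 * kvm_arch_on_sigbus_vcpu() with the BQL held once KVM_RUN returns.  A
 * SIGBUS seen by the main thread is forwarded directly via kvm_on_sigbus().
 */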
/* Called asynchronously in VCPU thread.  */
int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    if (have_sigbus_pending) {
        return 1;
    }
    have_sigbus_pending = true;
    pending_sigbus_addr = addr;
    pending_sigbus_code = code;
    atomic_set(&cpu->exit_request, 1);
    return 0;
#else
    return 1;
#endif
}

/* Called synchronously (via signalfd) in main thread.  */
int kvm_on_sigbus(int code, void *addr)
{
#ifdef KVM_HAVE_MCE_INJECTION
    /* Action required MCE kills the process if SIGBUS is blocked.  Because
     * that's what happens in the I/O thread, where we handle MCE via signalfd,
     * we can only get action optional here.
     */
    assert(code != BUS_MCEERR_AR);
    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
    return 0;
#else
    return 1;
#endif
}

int kvm_create_device(KVMState *s, uint64_t type, bool test)
{
    int ret;
    struct kvm_create_device create_dev;

    create_dev.type = type;
    create_dev.fd = -1;
    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;

    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
        return -ENOTSUP;
    }

    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
    if (ret) {
        return ret;
    }

    return test ? 0 : create_dev.fd;
}

bool kvm_device_supported(int vmfd, uint64_t type)
{
    struct kvm_create_device create_dev = {
        .type = type,
        .fd = -1,
        .flags = KVM_CREATE_DEVICE_TEST,
    };

    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
        return false;
    }

    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
}

int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) source;
    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_set(id, strerror(-r));
    }
    return r;
}

int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
{
    struct kvm_one_reg reg;
    int r;

    reg.id = id;
    reg.addr = (uintptr_t) target;
    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
    if (r) {
        trace_kvm_failed_reg_get(id, strerror(-r));
    }
    return r;
}

static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
                                 hwaddr start_addr, hwaddr size)
{
    KVMState *kvm = KVM_STATE(ms->accelerator);
    int i;

    for (i = 0; i < kvm->nr_as; ++i) {
        if (kvm->as[i].as == as && kvm->as[i].ml) {
            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
                                                    start_addr, size);
        }
    }

    return false;
}

static void kvm_accel_class_init(ObjectClass *oc, void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);
    ac->name = "KVM";
    ac->init_machine = kvm_init;
    ac->has_memory = kvm_accel_has_memory;
    ac->allowed = &kvm_allowed;
}

static const TypeInfo kvm_accel_type = {
    .name = TYPE_KVM_ACCEL,
    .parent = TYPE_ACCEL,
    .class_init = kvm_accel_class_init,
    .instance_size = sizeof(KVMState),
};

static void kvm_type_init(void)
{
    type_register_static(&kvm_accel_type);
}

type_init(kvm_type_init);