/*
 * Xen HVM emulation support in KVM
 *
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "hw/xen/xen.h"
#include "sysemu/kvm_int.h"
#include "sysemu/kvm_xen.h"
#include "kvm/kvm_i386.h"
#include "exec/address-spaces.h"
#include "xen-emu.h"
#include "trace.h"
#include "sysemu/runstate.h"

#include "hw/pci/msi.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/e820_memory_layout.h"
#include "hw/i386/kvm/xen_overlay.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "hw/i386/kvm/xen_primary_console.h"
#include "hw/i386/kvm/xen_xenstore.h"

#include "hw/xen/interface/version.h"
#include "hw/xen/interface/sched.h"
#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/hvm/hvm_op.h"
#include "hw/xen/interface/hvm/params.h"
#include "hw/xen/interface/vcpu.h"
#include "hw/xen/interface/event_channel.h"
#include "hw/xen/interface/grant_table.h"

#include "xen-compat.h"

static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);
static int vcpuop_stop_singleshot_timer(CPUState *cs);

#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif

static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
                           size_t *len, bool is_write)
{
    struct kvm_translation tr = {
        .linear_address = gva,
    };

    if (len) {
        *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
    }

    if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
        (is_write && !tr.writeable)) {
        return false;
    }
    *gpa = tr.physical_address;
    return true;
}

static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                      bool is_write)
{
    uint8_t *buf = (uint8_t *)_buf;
    uint64_t gpa;
    size_t len;

    while (sz) {
        if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
            return -EFAULT;
        }
        if (len > sz) {
            len = sz;
        }

        cpu_physical_memory_rw(gpa, buf, len, is_write);

        buf += len;
        sz -= len;
        gva += len;
    }

    return 0;
}

static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                    size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, false);
}

static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                  size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, true);
}

int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
{
    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
    struct kvm_xen_hvm_config cfg = {
        .msr = hypercall_msr,
        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
    };
    int xen_caps, ret;

    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
    if (required_caps & ~xen_caps) {
        error_report("kvm: Xen HVM guest support not present or insufficient");
        return -ENOSYS;
    }

    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
        struct kvm_xen_hvm_attr ha = {
            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = s->xen_version,
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later...
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
                         E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }

    /* The pages couldn't be overlaid until KVM was initialized */
    xen_primary_console_reset();
    xen_xenstore_reset();

    return 0;
}

int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    qemu_mutex_init(&env->xen_timers_lock);
    env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             xen_vcpu_singleshot_timer_event,
                                             cpu);
    if (!env->xen_singleshot_timer) {
        return -ENOMEM;
    }
    env->xen_singleshot_timer->opaque = cs;

    env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           xen_vcpu_periodic_timer_event,
                                           cpu);
    if (!env->xen_periodic_timer) {
        return -ENOMEM;
    }
    env->xen_periodic_timer->opaque = cs;

    return 0;
}

uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}

static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_hvm_callback_vector |
                         1 << XENFEAT_hvm_safe_pvclock |
                         1 << XENFEAT_hvm_pirqs;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}

static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
}

static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}

static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}

static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}

static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}

void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}

void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        qemu_mutex_lock_iothread();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        qemu_mutex_unlock_iothread();
    }
}

void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}

bool kvm_xen_has_vcpu_callback_vector(void)
{
    CPUState *cs = qemu_get_cpu(0);

    return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
}

void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector is injected via the local APIC.
         * Just deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS |
                       (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
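         * (The kick just forces an exit from KVM_RUN; the pending vector
         * is then delivered by KVM on the next vCPU entry.)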
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}

/* Must always be called with xen_timers_lock held */
static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}

static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
    kvm_xen_set_vcpu_timer(cs);
}

int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }
    return 0;
}

static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}

static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}

static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);

        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = 0;
        kvm_xen_set_vcpu_timer(cs);
    } else {
        vcpuop_stop_singleshot_timer(cs);
    }
}

static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    QEMU_IOTHREAD_LOCK_GUARD();

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM-specific).
     */
    err = xen_overlay_map_shinfo_page(gpa);
    if (err) {
        return err;
    }

    trace_kvm_xen_set_shared_info(gfn);

    for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
        CPUState *cpu = qemu_get_cpu(i);
        if (cpu) {
            async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                             RUN_ON_CPU_HOST_ULONG(gpa));
        }
        gpa += sizeof(vcpu_info_t);
    }

    return err;
}

static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
{
    switch (space) {
    case XENMAPSPACE_shared_info:
        if (idx > 0) {
            return -EINVAL;
        }
        return xen_set_shared_info(gfn);

    case XENMAPSPACE_grant_table:
        return xen_gnttab_map_page(idx, gfn);

    case XENMAPSPACE_gmfn:
    case XENMAPSPACE_gmfn_range:
        return -ENOTSUP;

    case XENMAPSPACE_gmfn_foreign:
    case XENMAPSPACE_dev_mmio:
        return -EPERM;

    default:
        return -EINVAL;
    }
}

static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    struct xen_add_to_physmap xatp;
    CPUState *cs = CPU(cpu);

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap xatp32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
            return -EFAULT;
        }
        xatp.domid = xatp32.domid;
        xatp.size = xatp32.size;
        xatp.space = xatp32.space;
        xatp.idx = xatp32.idx;
        xatp.gpfn = xatp32.gpfn;
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
            return -EFAULT;
        }
    }

    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
        return -ESRCH;
    }

    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
}

static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   uint64_t arg)
{
    struct xen_add_to_physmap_batch xatpb;
    unsigned long idxs_gva, gpfns_gva, errs_gva;
    CPUState *cs = CPU(cpu);
    size_t op_sz;

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap_batch xatpb32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
            return -EFAULT;
        }
        xatpb.domid = xatpb32.domid;
        xatpb.space = xatpb32.space;
        xatpb.size = xatpb32.size;

        idxs_gva = xatpb32.idxs.c;
        gpfns_gva = xatpb32.gpfns.c;
        errs_gva = xatpb32.errs.c;
        op_sz = sizeof(uint32_t);
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
            return -EFAULT;
        }
        op_sz = sizeof(unsigned long);
        idxs_gva = (unsigned long)xatpb.idxs.p;
        gpfns_gva = (unsigned long)xatpb.gpfns.p;
        errs_gva = (unsigned long)xatpb.errs.p;
    }

    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
        return -ESRCH;
    }

    /*
     * Explicitly invalid for the batch op. Not that we implement it anyway.
     */
    if (xatpb.space == XENMAPSPACE_gmfn_range) {
        return -EINVAL;
    }

    while (xatpb.size--) {
        unsigned long idx = 0;
        unsigned long gpfn = 0;
        int err;

        /* For 32-bit compat this only copies the low 32 bits of each */
        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
            return -EFAULT;
        }
        idxs_gva += op_sz;
        gpfns_gva += op_sz;

        err = add_to_physmap_one(xatpb.space, idx, gpfn);

        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
            return -EFAULT;
        }
        errs_gva += sizeof(err);
    }
    return 0;
}

static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    int err;

    switch (cmd) {
    case XENMEM_add_to_physmap:
        err = do_add_to_physmap(exit, cpu, arg);
        break;

    case XENMEM_add_to_physmap_batch:
        err = do_add_to_physmap_batch(exit, cpu, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
        qemu_mutex_lock_iothread();
        err = xen_evtchn_set_callback_param(hp.value);
        qemu_mutex_unlock_iothread();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_STORE_PFN:
        hp.value = XEN_SPECIAL_PFN(XENSTORE);
        break;
    case HVM_PARAM_STORE_EVTCHN:
        hp.value = xen_xenstore_get_port();
        break;
    case HVM_PARAM_CONSOLE_PFN:
        hp.value = xen_primary_console_get_pfn();
        if (!hp.value) {
            err = -EINVAL;
        }
        break;
    case HVM_PARAM_CONSOLE_EVTCHN:
        hp.value = xen_primary_console_get_port();
        if (!hp.value) {
            err = -EINVAL;
        }
        break;
    default:
        return false;
    }

    if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
    }
out:
    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                              X86CPU *cpu, uint64_t arg)
{
    struct xen_hvm_evtchn_upcall_vector up;
    CPUState *target_cs;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(up) == 8);

    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
        return -EFAULT;
    }

    if (up.vector < 0x10) {
        return -EINVAL;
    }

    target_cs = qemu_get_cpu(up.vcpu);
    if (!target_cs) {
        return -EINVAL;
    }

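    /*
     * The vector lives in the target vCPU's CPUX86State and the KVM
     * attribute is per-vCPU, so defer the update to the target vCPU's
     * own thread via async_run_on_cpu() rather than poking it from here.
     */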
    async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
                     RUN_ON_CPU_HOST_INT(up.vector));
    return 0;
}

static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                 int cmd, uint64_t arg)
{
    int ret = -ENOSYS;
    switch (cmd) {
    case HVMOP_set_evtchn_upcall_vector:
        ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
        break;

    case HVMOP_pagetable_dying:
        ret = -ENOSYS;
        break;

    case HVMOP_set_param:
        return handle_set_param(exit, cpu, arg);

    case HVMOP_get_param:
        return handle_get_param(exit, cpu, arg);

    default:
        return false;
    }

    exit->u.hcall.result = ret;
    return true;
}

static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_register_vcpu_info rvi;
    uint64_t gpa;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rvi) == 16);
    qemu_build_assert(sizeof(struct vcpu_info) == 64);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
        return -EFAULT;
    }

    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
        return -EINVAL;
    }

    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
                                          uint64_t arg)
{
    struct vcpu_register_time_memory_area tma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(tma) == 8);
    qemu_build_assert(sizeof(struct vcpu_time_info) == 32);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
        return -EFAULT;
    }

    /*
     * Xen actually uses the GVA and does the translation through the guest
     * page tables each time. But Linux/KVM uses the GPA, on the assumption
     * that guests only ever use *global* addresses (kernel virtual addresses)
     * for it. If Linux is changed to redo the GVA→GPA translation each time,
     * it will offer a new vCPU attribute for that, and we'll use it instead.
     */
    if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
        len < sizeof(struct vcpu_time_info)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
                                         uint64_t arg)
{
    struct vcpu_register_runstate_memory_area rma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rma) == 8);
    /* The runstate area actually does change size, but Linux copes. */

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
        return -EFAULT;
    }

    /*
     * As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't.
     */
    if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static uint64_t kvm_get_current_ns(void)
{
    struct kvm_clock_data data;
    int ret;

    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
    if (ret < 0) {
        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(-ret));
        abort();
    }

    return data.clock;
}

static void xen_vcpu_singleshot_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);
    env->xen_singleshot_timer_ns = 0;
    qemu_mutex_unlock(&env->xen_timers_lock);
}

static void xen_vcpu_periodic_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];
    int64_t qemu_now;

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(env->xen_periodic_timer,
                 qemu_now + env->xen_periodic_timer_period);

    qemu_mutex_unlock(&env->xen_timers_lock);
}

static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
{
    CPUX86State *tenv = &X86_CPU(target)->env;
    int64_t qemu_now;

    timer_del(tenv->xen_periodic_timer);

    qemu_mutex_lock(&tenv->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
    tenv->xen_periodic_timer_period = period_ns;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

#define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us) ((int64_t)((_us) * 1000ULL))
#define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))

static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_set_periodic_timer spt;

    qemu_build_assert(sizeof(spt) == 8);
    if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
        return -EFAULT;
    }

    if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
        return -EINVAL;
    }

    return do_set_periodic_timer(target, spt.period_ns);
}

static int vcpuop_stop_periodic_timer(CPUState *target)
{
    CPUX86State *tenv = &X86_CPU(target)->env;

    qemu_mutex_lock(&tenv->xen_timers_lock);

    timer_del(tenv->xen_periodic_timer);
    tenv->xen_periodic_timer_period = 0;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

/*
 * Userspace handling of timer, for older kernels.
 * Must always be called with xen_timers_lock held.
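 * (The lock protects both the QEMU timer and xen_singleshot_timer_ns, which
 * kvm_put_xen_state()/kvm_get_xen_state() also access under it.)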
 */
static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
                                   bool linux_wa)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    int64_t now = kvm_get_current_ns();
    int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    int64_t delta = timeout_abs - now;

    if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
                             (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
        /*
         * Xen has a 'Linux workaround' in do_set_timer_op() which checks
         * for negative absolute timeout values (caused by integer
         * overflow), and for values about 13 days in the future (2^50ns)
         * which would be caused by jiffies overflow. For those cases, it
         * sets the timeout 100ms in the future (not *too* soon, since if
         * a guest really did set a long timeout on purpose we don't want
         * to keep churning CPU time by waking it up).
         */
        delta = (100 * SCALE_MS);
        timeout_abs = now + delta;
    }

    timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
    env->xen_singleshot_timer_ns = now + delta;
    return 0;
}

static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
{
    struct vcpu_set_singleshot_timer sst = { 0 };

    /*
     * The struct is a uint64_t followed by a uint32_t. On 32-bit that
     * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
     * that get used are identical, and there's four bytes of padding
     * unused at the end. For true Xen compatibility we should attempt
     * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
     * if we can't get the padding too. But that's daft. Just copy what
     * we need.
     */
    qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
    qemu_build_assert(sizeof(sst) >= 12);

    if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
        return -EFAULT;
    }

    QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);

    /*
     * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does.
     * The only guest that ever used it, got it wrong.
     * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909
     */
    return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false);
}

static int vcpuop_stop_singleshot_timer(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_del(env->xen_singleshot_timer);
    env->xen_singleshot_timer_ns = 0;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}

static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                       uint64_t timeout)
{
    int err;

    if (unlikely(timeout == 0)) {
        err = vcpuop_stop_singleshot_timer(CPU(cpu));
    } else {
        QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
        err = do_set_singleshot_timer(CPU(cpu), timeout, true);
    }
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, int vcpu_id, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
    int err;

    if (!dest) {
        err = -ENOENT;
        goto out;
    }

    switch (cmd) {
    case VCPUOP_register_runstate_memory_area:
        err = vcpuop_register_runstate_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_time_memory_area:
        err = vcpuop_register_vcpu_time_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_info:
        err = vcpuop_register_vcpu_info(cs, dest, arg);
        break;
    case VCPUOP_set_singleshot_timer: {
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_set_singleshot_timer(dest, arg);
        } else {
            err = -EINVAL;
        }
        break;
    }
    case VCPUOP_stop_singleshot_timer:
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_stop_singleshot_timer(dest);
        } else {
            err = -EINVAL;
        }
        break;
    case VCPUOP_set_periodic_timer: {
        err = vcpuop_set_periodic_timer(cs, dest, arg);
        break;
    }
    case VCPUOP_stop_periodic_timer:
        err = vcpuop_stop_periodic_timer(dest);
        break;

    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case EVTCHNOP_init_control:
    case EVTCHNOP_expand_array:
    case EVTCHNOP_set_priority:
        /* We do not support FIFO channels at this point */
        err = -ENOSYS;
        break;

    case EVTCHNOP_status: {
        struct evtchn_status status;

        qemu_build_assert(sizeof(status) == 24);
        if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_status_op(&status);
        if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_close: {
        struct evtchn_close close;

        qemu_build_assert(sizeof(close) == 4);
        if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_close_op(&close);
        break;
    }
    case EVTCHNOP_unmask: {
        struct evtchn_unmask unmask;

        qemu_build_assert(sizeof(unmask) == 4);
        if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_unmask_op(&unmask);
        break;
    }
    case EVTCHNOP_bind_virq: {
        struct evtchn_bind_virq virq;

        qemu_build_assert(sizeof(virq) == 12);
        if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_virq_op(&virq);
        if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_pirq: {
        struct evtchn_bind_pirq pirq;

        qemu_build_assert(sizeof(pirq) == 12);
        if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_pirq_op(&pirq);
        if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_ipi: {
        struct evtchn_bind_ipi ipi;

        qemu_build_assert(sizeof(ipi) == 8);
        if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_ipi_op(&ipi);
        if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
     * deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_primary_console_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
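         * Note the deliberate fall-through into SCHEDOP_yield below.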
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_copy:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                return -EFAULT;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base,
                    sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
         */
        if (!err && kvm_copy_to_gva(cs, arg, &map,
                                    sizeof(struct compat_physdev_map_pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_unmap_pirq: {
        struct physdev_unmap_pirq unmap;

        qemu_build_assert(sizeof(unmap) == 8);
        if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_unmap_pirq(&unmap);
        if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_eoi: {
        struct physdev_eoi eoi;

        qemu_build_assert(sizeof(eoi) == 4);
        if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_eoi_pirq(&eoi);
        if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_irq_status_query: {
        struct physdev_irq_status_query query;

        qemu_build_assert(sizeof(query) == 8);
        if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_query_pirq(&query);
        if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_get_free_pirq: {
        struct physdev_get_free_pirq get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_get_free_pirq(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
        err = -ENOSYS;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_set_timer_op:
        if (exit->u.hcall.longmode) {
            return kvm_xen_hcall_set_timer_op(exit, cpu,
                                              exit->u.hcall.params[0]);
        } else {
            /*
             * In 32-bit mode, the 64-bit timer value is in two args.
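             * (params[0] carries the low 32 bits, params[1] the high 32.)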
             */
            uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
                           (uint32_t)exit->u.hcall.params[0];
            return kvm_xen_hcall_set_timer_op(exit, cpu, val);
        }
    case __HYPERVISOR_grant_table_op:
        return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1],
                                       exit->u.hcall.params[2]);
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_physdev_op:
        return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
                                        exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}

int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    if (exit->type != KVM_EXIT_XEN_HCALL) {
        return -1;
    }

    /*
     * The kernel latches the guest 32/64 mode when the MSR is used to fill
     * the hypercall page. So if we see a hypercall in a mode that doesn't
     * match our own idea of the guest mode, fetch the kernel's idea of the
     * "long mode" to remain in sync.
     */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }

    if (!do_kvm_xen_handle_exit(cpu, exit)) {
        /*
         * Some hypercalls will be deliberately "implemented" by returning
         * -ENOSYS. This case is for hypercalls which are unexpected.
         */
        exit->u.hcall.result = -ENOSYS;
        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                      PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                      (uint64_t)exit->u.hcall.input,
                      (uint64_t)exit->u.hcall.params[0],
                      (uint64_t)exit->u.hcall.params[1],
                      (uint64_t)exit->u.hcall.params[2]);
    }

    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                            exit->u.hcall.input, exit->u.hcall.params[0],
                            exit->u.hcall.params[1], exit->u.hcall.params[2],
                            exit->u.hcall.result);
    return 0;
}

uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}

uint16_t kvm_xen_get_evtchn_max_pirq(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_evtchn_max_pirq;
}

int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_periodic_timer_period) {
        ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        /*
         * If the kernel has EVTCHN_SEND support then it handles timers too,
         * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
         */
        QEMU_LOCK_GUARD(&env->xen_timers_lock);
        if (env->xen_singleshot_timer_ns) {
            ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
                                          false);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        do_set_vcpu_timer_virq(cs,
                               RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
    }
    return 0;
}

int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }

        /*
         * This locking is fairly pointless, and is here to appease Coverity.
         * There is an unavoidable race condition if a different vCPU sets a
         * timer for this vCPU after the value has been read out. But that's
         * OK in practice because *all* the vCPUs need to be stopped before
         * we set about migrating their state.
         */
        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}