/*
 * Xen HVM emulation support in KVM
 *
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/error-report.h"
#include "hw/xen/xen.h"
#include "sysemu/kvm_int.h"
#include "sysemu/kvm_xen.h"
#include "kvm/kvm_i386.h"
#include "exec/address-spaces.h"
#include "xen-emu.h"
#include "trace.h"
#include "sysemu/runstate.h"

#include "hw/pci/msi.h"
#include "hw/i386/apic-msidef.h"
#include "hw/i386/e820_memory_layout.h"
#include "hw/i386/kvm/xen_overlay.h"
#include "hw/i386/kvm/xen_evtchn.h"
#include "hw/i386/kvm/xen_gnttab.h"
#include "hw/i386/kvm/xen_xenstore.h"

#include "hw/xen/interface/version.h"
#include "hw/xen/interface/sched.h"
#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/hvm/hvm_op.h"
#include "hw/xen/interface/hvm/params.h"
#include "hw/xen/interface/vcpu.h"
#include "hw/xen/interface/event_channel.h"
#include "hw/xen/interface/grant_table.h"

#include "xen-compat.h"

static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);

#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif

static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
                           size_t *len, bool is_write)
{
    struct kvm_translation tr = {
        .linear_address = gva,
    };

    if (len) {
        *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
    }

    if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
        (is_write && !tr.writeable)) {
        return false;
    }
    *gpa = tr.physical_address;
    return true;
}

static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
                      bool is_write)
{
    uint8_t *buf = (uint8_t *)_buf;
    uint64_t gpa;
    size_t len;

    while (sz) {
        if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
            return -EFAULT;
        }
        if (len > sz) {
            len = sz;
        }

        cpu_physical_memory_rw(gpa, buf, len, is_write);

        buf += len;
        sz -= len;
        gva += len;
    }

    return 0;
}

static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
                                    size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, false);
}

static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
                                  size_t sz)
{
    return kvm_gva_rw(cs, gva, buf, sz, true);
}

int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
{
    const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
        KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
    struct kvm_xen_hvm_config cfg = {
        .msr = hypercall_msr,
        .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
    };
    int xen_caps, ret;

    xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
    if (required_caps & ~xen_caps) {
        error_report("kvm: Xen HVM guest support not present or insufficient");
        return -ENOSYS;
    }

    if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
        struct kvm_xen_hvm_attr ha = {
            .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
            .u.xen_version = s->xen_version,
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

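        /*
         * The kernel can also handle event channel delivery (and with it
         * the per-vCPU timer and upcall vector) itself; opt in to that
         * in the hypercall config flags too.
         */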
        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later...
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
                         E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }

    /* The page couldn't be overlaid until KVM was initialized */
    xen_xenstore_reset();

    return 0;
}

int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    qemu_mutex_init(&env->xen_timers_lock);
    env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             xen_vcpu_singleshot_timer_event,
                                             cpu);
    if (!env->xen_singleshot_timer) {
        return -ENOMEM;
    }
    env->xen_singleshot_timer->opaque = cs;

    env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           xen_vcpu_periodic_timer_event,
                                           cpu);
    if (!env->xen_periodic_timer) {
        return -ENOMEM;
    }
    env->xen_periodic_timer->opaque = cs;

    return 0;
}

uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}

static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_supervisor_mode_kernel |
                         1 << XENFEAT_hvm_callback_vector |
                         1 << XENFEAT_hvm_safe_pvclock |
                         1 << XENFEAT_hvm_pirqs;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}

static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
}

static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}

static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
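    /*
     * Keep a host mapping of the vcpu_info so that the fast path in
     * kvm_xen_maybe_deassert_callback() can check evtchn_upcall_pending
     * without a guest memory access.
     */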
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}

static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}

static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}

void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}

void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        qemu_mutex_lock_iothread();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        qemu_mutex_unlock_iothread();
    }
}

void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}

void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector injected via lapic. Just
         * deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}

static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}

static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    kvm_xen_set_vcpu_timer(cs);
}

int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }
    return 0;
}

static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}

static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}

static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    env->xen_singleshot_timer_ns = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
        kvm_xen_set_vcpu_timer(cs);
    }
}

static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    QEMU_IOTHREAD_LOCK_GUARD();

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM-specific).
     */
    err = xen_overlay_map_shinfo_page(gpa);
    if (err) {
        return err;
    }

    trace_kvm_xen_set_shared_info(gfn);

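    /*
     * The vcpu_info structures for the first XEN_LEGACY_MAX_VCPUS vCPUs
     * live within the shared_info page itself, so point each vCPU's
     * default vcpu_info at its slot there.
     */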
    for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
        CPUState *cpu = qemu_get_cpu(i);
        if (cpu) {
            async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
                             RUN_ON_CPU_HOST_ULONG(gpa));
        }
        gpa += sizeof(vcpu_info_t);
    }

    return err;
}

static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
{
    switch (space) {
    case XENMAPSPACE_shared_info:
        if (idx > 0) {
            return -EINVAL;
        }
        return xen_set_shared_info(gfn);

    case XENMAPSPACE_grant_table:
        return xen_gnttab_map_page(idx, gfn);

    case XENMAPSPACE_gmfn:
    case XENMAPSPACE_gmfn_range:
        return -ENOTSUP;

    case XENMAPSPACE_gmfn_foreign:
    case XENMAPSPACE_dev_mmio:
        return -EPERM;

    default:
        return -EINVAL;
    }
}

static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    struct xen_add_to_physmap xatp;
    CPUState *cs = CPU(cpu);

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap xatp32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
        if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
            return -EFAULT;
        }
        xatp.domid = xatp32.domid;
        xatp.size = xatp32.size;
        xatp.space = xatp32.space;
        xatp.idx = xatp32.idx;
        xatp.gpfn = xatp32.gpfn;
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
            return -EFAULT;
        }
    }

    if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
        return -ESRCH;
    }

    return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
}

static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   uint64_t arg)
{
    struct xen_add_to_physmap_batch xatpb;
    unsigned long idxs_gva, gpfns_gva, errs_gva;
    CPUState *cs = CPU(cpu);
    size_t op_sz;

    if (hypercall_compat32(exit->u.hcall.longmode)) {
        struct compat_xen_add_to_physmap_batch xatpb32;

        qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
        if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
            return -EFAULT;
        }
        xatpb.domid = xatpb32.domid;
        xatpb.space = xatpb32.space;
        xatpb.size = xatpb32.size;

        idxs_gva = xatpb32.idxs.c;
        gpfns_gva = xatpb32.gpfns.c;
        errs_gva = xatpb32.errs.c;
        op_sz = sizeof(uint32_t);
    } else {
        if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
            return -EFAULT;
        }
        op_sz = sizeof(unsigned long);
        idxs_gva = (unsigned long)xatpb.idxs.p;
        gpfns_gva = (unsigned long)xatpb.gpfns.p;
        errs_gva = (unsigned long)xatpb.errs.p;
    }

    if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
        return -ESRCH;
    }

    /* Explicitly invalid for the batch op. Not that we implement it anyway. */
    if (xatpb.space == XENMAPSPACE_gmfn_range) {
        return -EINVAL;
    }

    while (xatpb.size--) {
        unsigned long idx = 0;
        unsigned long gpfn = 0;
        int err;

        /* For 32-bit compat this only copies the low 32 bits of each */
        if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
            kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
            return -EFAULT;
        }
        idxs_gva += op_sz;
        gpfns_gva += op_sz;

        err = add_to_physmap_one(xatpb.space, idx, gpfn);

        if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
            return -EFAULT;
        }
        errs_gva += sizeof(err);
    }
    return 0;
}

static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    int err;

    switch (cmd) {
    case XENMEM_add_to_physmap:
        err = do_add_to_physmap(exit, cpu, arg);
        break;

    case XENMEM_add_to_physmap_batch:
        err = do_add_to_physmap_batch(exit, cpu, arg);
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_CALLBACK_IRQ:
        qemu_mutex_lock_iothread();
        err = xen_evtchn_set_callback_param(hp.value);
        qemu_mutex_unlock_iothread();
        xen_set_long_mode(exit->u.hcall.longmode);
        break;
    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
                             uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    struct xen_hvm_param hp;
    int err = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(hp) == 16);

    if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
        goto out;
    }

    if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
        err = -ESRCH;
        goto out;
    }

    switch (hp.index) {
    case HVM_PARAM_STORE_PFN:
        hp.value = XEN_SPECIAL_PFN(XENSTORE);
        break;
    case HVM_PARAM_STORE_EVTCHN:
        hp.value = xen_xenstore_get_port();
        break;
    default:
        return false;
    }

    if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
        err = -EFAULT;
    }
out:
    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
                                              X86CPU *cpu, uint64_t arg)
{
    struct xen_hvm_evtchn_upcall_vector up;
    CPUState *target_cs;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(up) == 8);

    if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
        return -EFAULT;
    }

    if (up.vector < 0x10) {
        return -EINVAL;
    }

    target_cs = qemu_get_cpu(up.vcpu);
    if (!target_cs) {
        return -EINVAL;
    }

    async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
                     RUN_ON_CPU_HOST_INT(up.vector));
    return 0;
}

static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                 int cmd, uint64_t arg)
{
    int ret = -ENOSYS;
    switch (cmd) {
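    /*
     * HVMOP_set_evtchn_upcall_vector registers a per-vCPU upcall vector,
     * which is used in preference to the single HVM_PARAM_CALLBACK_IRQ.
     */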
    case HVMOP_set_evtchn_upcall_vector:
        ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
                                                 exit->u.hcall.params[0]);
        break;

    case HVMOP_pagetable_dying:
        ret = -ENOSYS;
        break;

    case HVMOP_set_param:
        return handle_set_param(exit, cpu, arg);

    case HVMOP_get_param:
        return handle_get_param(exit, cpu, arg);

    default:
        return false;
    }

    exit->u.hcall.result = ret;
    return true;
}

static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_register_vcpu_info rvi;
    uint64_t gpa;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rvi) == 16);
    qemu_build_assert(sizeof(struct vcpu_info) == 64);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
        return -EFAULT;
    }

    if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
        return -EINVAL;
    }

    gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
    async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
                                          uint64_t arg)
{
    struct vcpu_register_time_memory_area tma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(tma) == 8);
    qemu_build_assert(sizeof(struct vcpu_time_info) == 32);

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
        return -EFAULT;
    }

    /*
     * Xen actually uses the GVA and does the translation through the guest
     * page tables each time. But Linux/KVM uses the GPA, on the assumption
     * that guests only ever use *global* addresses (kernel virtual addresses)
     * for it. If Linux is changed to redo the GVA→GPA translation each time,
     * it will offer a new vCPU attribute for that, and we'll use it instead.
     */
    if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
        len < sizeof(struct vcpu_time_info)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
                                         uint64_t arg)
{
    struct vcpu_register_runstate_memory_area rma;
    uint64_t gpa;
    size_t len;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(rma) == 8);
    /* The runstate area actually does change size, but Linux copes. */

    if (!target) {
        return -ENOENT;
    }

    if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
        return -EFAULT;
    }

    /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
    if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
        return -EFAULT;
    }

    async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
                     RUN_ON_CPU_HOST_ULONG(gpa));
    return 0;
}

static uint64_t kvm_get_current_ns(void)
{
    struct kvm_clock_data data;
    int ret;

    ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
    if (ret < 0) {
        fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
        abort();
    }

    return data.clock;
}

static void xen_vcpu_singleshot_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);
    env->xen_singleshot_timer_ns = 0;
    qemu_mutex_unlock(&env->xen_timers_lock);
}

static void xen_vcpu_periodic_timer_event(void *opaque)
{
    CPUState *cpu = opaque;
    CPUX86State *env = &X86_CPU(cpu)->env;
    uint16_t port = env->xen_virq[VIRQ_TIMER];
    int64_t qemu_now;

    if (likely(port)) {
        xen_evtchn_set_port(port);
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(env->xen_periodic_timer,
                 qemu_now + env->xen_periodic_timer_period);

    qemu_mutex_unlock(&env->xen_timers_lock);
}

static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
{
    CPUX86State *tenv = &X86_CPU(target)->env;
    int64_t qemu_now;

    timer_del(tenv->xen_periodic_timer);

    qemu_mutex_lock(&tenv->xen_timers_lock);

    qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
    tenv->xen_periodic_timer_period = period_ns;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

#define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us) ((int64_t)((_us) * 1000ULL))
#define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))

static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
                                     uint64_t arg)
{
    struct vcpu_set_periodic_timer spt;

    qemu_build_assert(sizeof(spt) == 8);
    if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
        return -EFAULT;
    }

    if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
        return -EINVAL;
    }

    return do_set_periodic_timer(target, spt.period_ns);
}

static int vcpuop_stop_periodic_timer(CPUState *target)
{
    CPUX86State *tenv = &X86_CPU(target)->env;

    qemu_mutex_lock(&tenv->xen_timers_lock);

    timer_del(tenv->xen_periodic_timer);
    tenv->xen_periodic_timer_period = 0;

    qemu_mutex_unlock(&tenv->xen_timers_lock);
    return 0;
}

static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
                                   bool future, bool linux_wa)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    int64_t now = kvm_get_current_ns();
    int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    int64_t delta = timeout_abs - now;

    if (future && timeout_abs < now) {
        return -ETIME;
    }

    if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
                             (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
        /*
         * Xen has a 'Linux workaround' in do_set_timer_op() which checks
         * for negative absolute timeout values (caused by integer
         * overflow), and for values about 13 days in the future (2^50ns)
         * which would be caused by jiffies overflow. For those cases, it
         * sets the timeout 100ms in the future (not *too* soon, since if
         * a guest really did set a long timeout on purpose we don't want
         * to keep churning CPU time by waking it up).
         */
        delta = (100 * SCALE_MS);
        timeout_abs = now + delta;
    }

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
    env->xen_singleshot_timer_ns = now + delta;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}

static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
{
    struct vcpu_set_singleshot_timer sst = { 0 };

    /*
     * The struct is a uint64_t followed by a uint32_t. On 32-bit that
     * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
     * that get used are identical, and there's four bytes of padding
     * unused at the end. For true Xen compatibility we should attempt
     * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
     * if we can't get the padding too. But that's daft. Just copy what
     * we need.
     */
    qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
    qemu_build_assert(sizeof(sst) >= 12);

    if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
        return -EFAULT;
    }

    return do_set_singleshot_timer(cs, sst.timeout_abs_ns,
                                   !!(sst.flags & VCPU_SSHOTTMR_future),
                                   false);
}

static int vcpuop_stop_singleshot_timer(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;

    qemu_mutex_lock(&env->xen_timers_lock);

    timer_del(env->xen_singleshot_timer);
    env->xen_singleshot_timer_ns = 0;

    qemu_mutex_unlock(&env->xen_timers_lock);
    return 0;
}

static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                       uint64_t timeout)
{
    int err;

    if (unlikely(timeout == 0)) {
        err = vcpuop_stop_singleshot_timer(CPU(cpu));
    } else {
        err = do_set_singleshot_timer(CPU(cpu), timeout, false, true);
    }
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                  int cmd, int vcpu_id, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
    int err;

    if (!dest) {
        err = -ENOENT;
        goto out;
    }

    switch (cmd) {
    case VCPUOP_register_runstate_memory_area:
        err = vcpuop_register_runstate_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_time_memory_area:
        err = vcpuop_register_vcpu_time_info(cs, dest, arg);
        break;
    case VCPUOP_register_vcpu_info:
        err = vcpuop_register_vcpu_info(cs, dest, arg);
        break;
    case VCPUOP_set_singleshot_timer: {
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_set_singleshot_timer(dest, arg);
        } else {
            err = -EINVAL;
        }
        break;
    }
    case VCPUOP_stop_singleshot_timer:
        if (cs->cpu_index == vcpu_id) {
            err = vcpuop_stop_singleshot_timer(dest);
        } else {
            err = -EINVAL;
        }
        break;
    case VCPUOP_set_periodic_timer: {
        err = vcpuop_set_periodic_timer(cs, dest, arg);
        break;
    }
    case VCPUOP_stop_periodic_timer:
        err = vcpuop_stop_periodic_timer(dest);
        break;

    default:
        return false;
    }

out:
    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case EVTCHNOP_init_control:
    case EVTCHNOP_expand_array:
    case EVTCHNOP_set_priority:
        /* We do not support FIFO channels at this point */
        err = -ENOSYS;
        break;

    case EVTCHNOP_status: {
        struct evtchn_status status;

        qemu_build_assert(sizeof(status) == 24);
        if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_status_op(&status);
        if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_close: {
        struct evtchn_close close;

        qemu_build_assert(sizeof(close) == 4);
        if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_close_op(&close);
        break;
    }
    case EVTCHNOP_unmask: {
        struct evtchn_unmask unmask;

        qemu_build_assert(sizeof(unmask) == 4);
        if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_unmask_op(&unmask);
        break;
    }
    case EVTCHNOP_bind_virq: {
        struct evtchn_bind_virq virq;

        qemu_build_assert(sizeof(virq) == 12);
        if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_virq_op(&virq);
        if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_pirq: {
        struct evtchn_bind_pirq pirq;

        qemu_build_assert(sizeof(pirq) == 12);
        if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_pirq_op(&pirq);
        if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_ipi: {
        struct evtchn_bind_ipi ipi;

        qemu_build_assert(sizeof(ipi) == 8);
        if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_ipi_op(&ipi);
        if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
     * deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

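    /*
     * Reset the per-vCPU Xen state (vcpu_info, timers, callback vector,
     * virq bindings) on each vCPU's own thread.
     */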
    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
         */
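        /* Fall through and treat SCHEDOP_poll as a plain yield. */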
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_copy:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                return -EFAULT;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
         */
        if (!err && kvm_copy_to_gva(cs, arg, &map,
                                    sizeof(struct compat_physdev_map_pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_unmap_pirq: {
        struct physdev_unmap_pirq unmap;

        qemu_build_assert(sizeof(unmap) == 8);
        if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_unmap_pirq(&unmap);
        if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_eoi: {
        struct physdev_eoi eoi;

        qemu_build_assert(sizeof(eoi) == 4);
        if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_eoi_pirq(&eoi);
        if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_irq_status_query: {
        struct physdev_irq_status_query query;

        qemu_build_assert(sizeof(query) == 8);
        if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_query_pirq(&query);
        if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_get_free_pirq: {
        struct physdev_get_free_pirq get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_get_free_pirq(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
        err = -ENOSYS;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_set_timer_op:
        if (exit->u.hcall.longmode) {
            return kvm_xen_hcall_set_timer_op(exit, cpu,
                                              exit->u.hcall.params[0]);
        } else {
            /* In 32-bit mode, the 64-bit timer value is in two args. */
            uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
                (uint32_t)exit->u.hcall.params[0];
            return kvm_xen_hcall_set_timer_op(exit, cpu, val);
        }
    case __HYPERVISOR_grant_table_op:
        return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1],
                                       exit->u.hcall.params[2]);
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_physdev_op:
        return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
                                        exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}

int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    if (exit->type != KVM_EXIT_XEN_HCALL) {
        return -1;
    }

    /*
     * The kernel latches the guest 32/64 mode when the MSR is used to fill
     * the hypercall page. So if we see a hypercall in a mode that doesn't
     * match our own idea of the guest mode, fetch the kernel's idea of the
     * "long mode" to remain in sync.
     */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }

    if (!do_kvm_xen_handle_exit(cpu, exit)) {
        /*
         * Some hypercalls will be deliberately "implemented" by returning
         * -ENOSYS. This case is for hypercalls which are unexpected.
         */
        exit->u.hcall.result = -ENOSYS;
        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                      PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                      (uint64_t)exit->u.hcall.input,
                      (uint64_t)exit->u.hcall.params[0],
                      (uint64_t)exit->u.hcall.params[1],
                      (uint64_t)exit->u.hcall.params[2]);
    }

    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                            exit->u.hcall.input, exit->u.hcall.params[0],
                            exit->u.hcall.params[1], exit->u.hcall.params[2],
                            exit->u.hcall.result);
    return 0;
}

uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}

uint16_t kvm_xen_get_evtchn_max_pirq(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_evtchn_max_pirq;
}

int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_periodic_timer_period) {
        ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        /*
         * If the kernel has EVTCHN_SEND support then it handles timers too,
         * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
         */
        if (env->xen_singleshot_timer_ns) {
            ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
                                          false, false);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        ret = kvm_xen_set_vcpu_timer(cs);
        if (ret < 0) {
            return ret;
        }
    }
    return 0;
}

int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}