1 /* 2 * Xen HVM emulation support in KVM 3 * 4 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. 5 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 6 * 7 * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 * See the COPYING file in the top-level directory. 9 * 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/log.h" 14 #include "qemu/main-loop.h" 15 #include "hw/xen/xen.h" 16 #include "sysemu/kvm_int.h" 17 #include "sysemu/kvm_xen.h" 18 #include "kvm/kvm_i386.h" 19 #include "exec/address-spaces.h" 20 #include "xen-emu.h" 21 #include "trace.h" 22 #include "sysemu/runstate.h" 23 24 #include "hw/pci/msi.h" 25 #include "hw/i386/apic-msidef.h" 26 #include "hw/i386/e820_memory_layout.h" 27 #include "hw/i386/kvm/xen_overlay.h" 28 #include "hw/i386/kvm/xen_evtchn.h" 29 #include "hw/i386/kvm/xen_gnttab.h" 30 #include "hw/i386/kvm/xen_xenstore.h" 31 32 #include "hw/xen/interface/version.h" 33 #include "hw/xen/interface/sched.h" 34 #include "hw/xen/interface/memory.h" 35 #include "hw/xen/interface/hvm/hvm_op.h" 36 #include "hw/xen/interface/hvm/params.h" 37 #include "hw/xen/interface/vcpu.h" 38 #include "hw/xen/interface/event_channel.h" 39 #include "hw/xen/interface/grant_table.h" 40 41 #include "xen-compat.h" 42 43 static void xen_vcpu_singleshot_timer_event(void *opaque); 44 static void xen_vcpu_periodic_timer_event(void *opaque); 45 46 #ifdef TARGET_X86_64 47 #define hypercall_compat32(longmode) (!(longmode)) 48 #else 49 #define hypercall_compat32(longmode) (false) 50 #endif 51 52 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa, 53 size_t *len, bool is_write) 54 { 55 struct kvm_translation tr = { 56 .linear_address = gva, 57 }; 58 59 if (len) { 60 *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK); 61 } 62 63 if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid || 64 (is_write && !tr.writeable)) { 65 return false; 66 } 67 *gpa = tr.physical_address; 68 return true; 69 } 70 71 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz, 72 bool is_write) 73 { 74 uint8_t *buf = (uint8_t *)_buf; 75 uint64_t gpa; 76 size_t len; 77 78 while (sz) { 79 if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) { 80 return -EFAULT; 81 } 82 if (len > sz) { 83 len = sz; 84 } 85 86 cpu_physical_memory_rw(gpa, buf, len, is_write); 87 88 buf += len; 89 sz -= len; 90 gva += len; 91 } 92 93 return 0; 94 } 95 96 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf, 97 size_t sz) 98 { 99 return kvm_gva_rw(cs, gva, buf, sz, false); 100 } 101 102 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf, 103 size_t sz) 104 { 105 return kvm_gva_rw(cs, gva, buf, sz, true); 106 } 107 108 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr) 109 { 110 const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | 111 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO; 112 struct kvm_xen_hvm_config cfg = { 113 .msr = hypercall_msr, 114 .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, 115 }; 116 int xen_caps, ret; 117 118 xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM); 119 if (required_caps & ~xen_caps) { 120 error_report("kvm: Xen HVM guest support not present or insufficient"); 121 return -ENOSYS; 122 } 123 124 if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) { 125 struct kvm_xen_hvm_attr ha = { 126 .type = KVM_XEN_ATTR_TYPE_XEN_VERSION, 127 .u.xen_version = s->xen_version, 128 }; 129 (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha); 130 131 cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND; 132 } 133 134 ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg); 135 if (ret < 0) { 136 error_report("kvm: Failed to enable Xen HVM support: %s", 137 strerror(-ret)); 138 return ret; 139 } 140 141 /* If called a second time, don't repeat the rest of the setup. */ 142 if (s->xen_caps) { 143 return 0; 144 } 145 146 /* 147 * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info 148 * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared. 149 * 150 * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows 151 * such things to be polled at precisely the right time. We *could* do 152 * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at 153 * the moment the IRQ is acked, and see if it should be reasserted. 154 * 155 * But the in-kernel irqchip is deprecated, so we're unlikely to add 156 * that support in the kernel. Insist on using the split irqchip mode 157 * instead. 158 * 159 * This leaves us polling for the level going low in QEMU, which lacks 160 * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a 161 * spurious 'ack' to an INTX IRQ every time there's any MMIO access to 162 * the device (for which it has to unmap the device and trap access, for 163 * some period after an IRQ!!). In the Xen case, we do it on exit from 164 * KVM_RUN, if the flag is set to say that the GSI is currently asserted. 165 * Which is kind of icky, but less so than the VFIO one. I may fix them 166 * both later... 167 */ 168 if (!kvm_kernel_irqchip_split()) { 169 error_report("kvm: Xen support requires kernel-irqchip=split"); 170 return -EINVAL; 171 } 172 173 s->xen_caps = xen_caps; 174 175 /* Tell fw_cfg to notify the BIOS to reserve the range. */ 176 ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE, 177 E820_RESERVED); 178 if (ret < 0) { 179 fprintf(stderr, "e820_add_entry() table is full\n"); 180 return ret; 181 } 182 183 /* The page couldn't be overlaid until KVM was initialized */ 184 xen_xenstore_reset(); 185 186 return 0; 187 } 188 189 int kvm_xen_init_vcpu(CPUState *cs) 190 { 191 X86CPU *cpu = X86_CPU(cs); 192 CPUX86State *env = &cpu->env; 193 int err; 194 195 /* 196 * The kernel needs to know the Xen/ACPI vCPU ID because that's 197 * what the guest uses in hypercalls such as timers. It doesn't 198 * match the APIC ID which is generally used for talking to the 199 * kernel about vCPUs. And if vCPU threads race with creating 200 * their KVM vCPUs out of order, it doesn't necessarily match 201 * with the kernel's internal vCPU indices either. 202 */ 203 if (kvm_xen_has_cap(EVTCHN_SEND)) { 204 struct kvm_xen_vcpu_attr va = { 205 .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID, 206 .u.vcpu_id = cs->cpu_index, 207 }; 208 err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va); 209 if (err) { 210 error_report("kvm: Failed to set Xen vCPU ID attribute: %s", 211 strerror(-err)); 212 return err; 213 } 214 } 215 216 env->xen_vcpu_info_gpa = INVALID_GPA; 217 env->xen_vcpu_info_default_gpa = INVALID_GPA; 218 env->xen_vcpu_time_info_gpa = INVALID_GPA; 219 env->xen_vcpu_runstate_gpa = INVALID_GPA; 220 221 qemu_mutex_init(&env->xen_timers_lock); 222 env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 223 xen_vcpu_singleshot_timer_event, 224 cpu); 225 if (!env->xen_singleshot_timer) { 226 return -ENOMEM; 227 } 228 env->xen_singleshot_timer->opaque = cs; 229 230 env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 231 xen_vcpu_periodic_timer_event, 232 cpu); 233 if (!env->xen_periodic_timer) { 234 return -ENOMEM; 235 } 236 env->xen_periodic_timer->opaque = cs; 237 238 return 0; 239 } 240 241 uint32_t kvm_xen_get_caps(void) 242 { 243 return kvm_state->xen_caps; 244 } 245 246 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu, 247 int cmd, uint64_t arg) 248 { 249 int err = 0; 250 251 switch (cmd) { 252 case XENVER_get_features: { 253 struct xen_feature_info fi; 254 255 /* No need for 32/64 compat handling */ 256 qemu_build_assert(sizeof(fi) == 8); 257 258 err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi)); 259 if (err) { 260 break; 261 } 262 263 fi.submap = 0; 264 if (fi.submap_idx == 0) { 265 fi.submap |= 1 << XENFEAT_writable_page_tables | 266 1 << XENFEAT_writable_descriptor_tables | 267 1 << XENFEAT_auto_translated_physmap | 268 1 << XENFEAT_supervisor_mode_kernel | 269 1 << XENFEAT_hvm_callback_vector | 270 1 << XENFEAT_hvm_safe_pvclock | 271 1 << XENFEAT_hvm_pirqs; 272 } 273 274 err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi)); 275 break; 276 } 277 278 default: 279 return false; 280 } 281 282 exit->u.hcall.result = err; 283 return true; 284 } 285 286 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa) 287 { 288 struct kvm_xen_vcpu_attr xhsi; 289 290 xhsi.type = type; 291 xhsi.u.gpa = gpa; 292 293 trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa); 294 295 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi); 296 } 297 298 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs) 299 { 300 uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector; 301 struct kvm_xen_vcpu_attr xva; 302 303 xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR; 304 xva.u.vector = vector; 305 306 trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector); 307 308 return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva); 309 } 310 311 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data) 312 { 313 X86CPU *cpu = X86_CPU(cs); 314 CPUX86State *env = &cpu->env; 315 316 env->xen_vcpu_callback_vector = data.host_int; 317 318 if (kvm_xen_has_cap(EVTCHN_SEND)) { 319 kvm_xen_set_vcpu_callback_vector(cs); 320 } 321 } 322 323 static int set_vcpu_info(CPUState *cs, uint64_t gpa) 324 { 325 X86CPU *cpu = X86_CPU(cs); 326 CPUX86State *env = &cpu->env; 327 MemoryRegionSection mrs = { .mr = NULL }; 328 void *vcpu_info_hva = NULL; 329 int ret; 330 331 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa); 332 if (ret || gpa == INVALID_GPA) { 333 goto out; 334 } 335 336 mrs = memory_region_find(get_system_memory(), gpa, 337 sizeof(struct vcpu_info)); 338 if (mrs.mr && mrs.mr->ram_block && 339 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) { 340 vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block, 341 mrs.offset_within_region); 342 } 343 if (!vcpu_info_hva) { 344 if (mrs.mr) { 345 memory_region_unref(mrs.mr); 346 mrs.mr = NULL; 347 } 348 ret = -EINVAL; 349 } 350 351 out: 352 if (env->xen_vcpu_info_mr) { 353 memory_region_unref(env->xen_vcpu_info_mr); 354 } 355 env->xen_vcpu_info_hva = vcpu_info_hva; 356 env->xen_vcpu_info_mr = mrs.mr; 357 return ret; 358 } 359 360 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data) 361 { 362 X86CPU *cpu = X86_CPU(cs); 363 CPUX86State *env = &cpu->env; 364 365 env->xen_vcpu_info_default_gpa = data.host_ulong; 366 367 /* Changing the default does nothing if a vcpu_info was explicitly set. */ 368 if (env->xen_vcpu_info_gpa == INVALID_GPA) { 369 set_vcpu_info(cs, env->xen_vcpu_info_default_gpa); 370 } 371 } 372 373 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data) 374 { 375 X86CPU *cpu = X86_CPU(cs); 376 CPUX86State *env = &cpu->env; 377 378 env->xen_vcpu_info_gpa = data.host_ulong; 379 380 set_vcpu_info(cs, env->xen_vcpu_info_gpa); 381 } 382 383 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id) 384 { 385 CPUState *cs = qemu_get_cpu(vcpu_id); 386 if (!cs) { 387 return NULL; 388 } 389 390 return X86_CPU(cs)->env.xen_vcpu_info_hva; 391 } 392 393 void kvm_xen_maybe_deassert_callback(CPUState *cs) 394 { 395 CPUX86State *env = &X86_CPU(cs)->env; 396 struct vcpu_info *vi = env->xen_vcpu_info_hva; 397 if (!vi) { 398 return; 399 } 400 401 /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */ 402 if (!vi->evtchn_upcall_pending) { 403 qemu_mutex_lock_iothread(); 404 /* 405 * Check again now we have the lock, because it may have been 406 * asserted in the interim. And we don't want to take the lock 407 * every time because this is a fast path. 408 */ 409 if (!vi->evtchn_upcall_pending) { 410 X86_CPU(cs)->env.xen_callback_asserted = false; 411 xen_evtchn_set_callback_level(0); 412 } 413 qemu_mutex_unlock_iothread(); 414 } 415 } 416 417 void kvm_xen_set_callback_asserted(void) 418 { 419 CPUState *cs = qemu_get_cpu(0); 420 421 if (cs) { 422 X86_CPU(cs)->env.xen_callback_asserted = true; 423 } 424 } 425 426 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type) 427 { 428 CPUState *cs = qemu_get_cpu(vcpu_id); 429 uint8_t vector; 430 431 if (!cs) { 432 return; 433 } 434 435 vector = X86_CPU(cs)->env.xen_vcpu_callback_vector; 436 if (vector) { 437 /* 438 * The per-vCPU callback vector injected via lapic. Just 439 * deliver it as an MSI. 440 */ 441 MSIMessage msg = { 442 .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id, 443 .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT), 444 }; 445 kvm_irqchip_send_msi(kvm_state, msg); 446 return; 447 } 448 449 switch (type) { 450 case HVM_PARAM_CALLBACK_TYPE_VECTOR: 451 /* 452 * If the evtchn_upcall_pending field in the vcpu_info is set, then 453 * KVM will automatically deliver the vector on entering the vCPU 454 * so all we have to do is kick it out. 455 */ 456 qemu_cpu_kick(cs); 457 break; 458 459 case HVM_PARAM_CALLBACK_TYPE_GSI: 460 case HVM_PARAM_CALLBACK_TYPE_PCI_INTX: 461 if (vcpu_id == 0) { 462 xen_evtchn_set_callback_level(1); 463 } 464 break; 465 } 466 } 467 468 static int kvm_xen_set_vcpu_timer(CPUState *cs) 469 { 470 X86CPU *cpu = X86_CPU(cs); 471 CPUX86State *env = &cpu->env; 472 473 struct kvm_xen_vcpu_attr va = { 474 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, 475 .u.timer.port = env->xen_virq[VIRQ_TIMER], 476 .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, 477 .u.timer.expires_ns = env->xen_singleshot_timer_ns, 478 }; 479 480 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va); 481 } 482 483 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data) 484 { 485 kvm_xen_set_vcpu_timer(cs); 486 } 487 488 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port) 489 { 490 CPUState *cs = qemu_get_cpu(vcpu_id); 491 492 if (!cs) { 493 return -ENOENT; 494 } 495 496 /* cpu.h doesn't include the actual Xen header. */ 497 qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS); 498 499 if (virq >= NR_VIRQS) { 500 return -EINVAL; 501 } 502 503 if (port && X86_CPU(cs)->env.xen_virq[virq]) { 504 return -EEXIST; 505 } 506 507 X86_CPU(cs)->env.xen_virq[virq] = port; 508 if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) { 509 async_run_on_cpu(cs, do_set_vcpu_timer_virq, 510 RUN_ON_CPU_HOST_INT(port)); 511 } 512 return 0; 513 } 514 515 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data) 516 { 517 X86CPU *cpu = X86_CPU(cs); 518 CPUX86State *env = &cpu->env; 519 520 env->xen_vcpu_time_info_gpa = data.host_ulong; 521 522 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 523 env->xen_vcpu_time_info_gpa); 524 } 525 526 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data) 527 { 528 X86CPU *cpu = X86_CPU(cs); 529 CPUX86State *env = &cpu->env; 530 531 env->xen_vcpu_runstate_gpa = data.host_ulong; 532 533 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 534 env->xen_vcpu_runstate_gpa); 535 } 536 537 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data) 538 { 539 X86CPU *cpu = X86_CPU(cs); 540 CPUX86State *env = &cpu->env; 541 542 env->xen_vcpu_info_gpa = INVALID_GPA; 543 env->xen_vcpu_info_default_gpa = INVALID_GPA; 544 env->xen_vcpu_time_info_gpa = INVALID_GPA; 545 env->xen_vcpu_runstate_gpa = INVALID_GPA; 546 env->xen_vcpu_callback_vector = 0; 547 env->xen_singleshot_timer_ns = 0; 548 memset(env->xen_virq, 0, sizeof(env->xen_virq)); 549 550 set_vcpu_info(cs, INVALID_GPA); 551 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 552 INVALID_GPA); 553 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 554 INVALID_GPA); 555 if (kvm_xen_has_cap(EVTCHN_SEND)) { 556 kvm_xen_set_vcpu_callback_vector(cs); 557 kvm_xen_set_vcpu_timer(cs); 558 } 559 560 } 561 562 static int xen_set_shared_info(uint64_t gfn) 563 { 564 uint64_t gpa = gfn << TARGET_PAGE_BITS; 565 int i, err; 566 567 QEMU_IOTHREAD_LOCK_GUARD(); 568 569 /* 570 * The xen_overlay device tells KVM about it too, since it had to 571 * do that on migration load anyway (unless we're going to jump 572 * through lots of hoops to maintain the fiction that this isn't 573 * KVM-specific. 574 */ 575 err = xen_overlay_map_shinfo_page(gpa); 576 if (err) { 577 return err; 578 } 579 580 trace_kvm_xen_set_shared_info(gfn); 581 582 for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) { 583 CPUState *cpu = qemu_get_cpu(i); 584 if (cpu) { 585 async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa, 586 RUN_ON_CPU_HOST_ULONG(gpa)); 587 } 588 gpa += sizeof(vcpu_info_t); 589 } 590 591 return err; 592 } 593 594 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn) 595 { 596 switch (space) { 597 case XENMAPSPACE_shared_info: 598 if (idx > 0) { 599 return -EINVAL; 600 } 601 return xen_set_shared_info(gfn); 602 603 case XENMAPSPACE_grant_table: 604 return xen_gnttab_map_page(idx, gfn); 605 606 case XENMAPSPACE_gmfn: 607 case XENMAPSPACE_gmfn_range: 608 return -ENOTSUP; 609 610 case XENMAPSPACE_gmfn_foreign: 611 case XENMAPSPACE_dev_mmio: 612 return -EPERM; 613 614 default: 615 return -EINVAL; 616 } 617 } 618 619 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu, 620 uint64_t arg) 621 { 622 struct xen_add_to_physmap xatp; 623 CPUState *cs = CPU(cpu); 624 625 if (hypercall_compat32(exit->u.hcall.longmode)) { 626 struct compat_xen_add_to_physmap xatp32; 627 628 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16); 629 if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) { 630 return -EFAULT; 631 } 632 xatp.domid = xatp32.domid; 633 xatp.size = xatp32.size; 634 xatp.space = xatp32.space; 635 xatp.idx = xatp32.idx; 636 xatp.gpfn = xatp32.gpfn; 637 } else { 638 if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) { 639 return -EFAULT; 640 } 641 } 642 643 if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) { 644 return -ESRCH; 645 } 646 647 return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn); 648 } 649 650 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu, 651 uint64_t arg) 652 { 653 struct xen_add_to_physmap_batch xatpb; 654 unsigned long idxs_gva, gpfns_gva, errs_gva; 655 CPUState *cs = CPU(cpu); 656 size_t op_sz; 657 658 if (hypercall_compat32(exit->u.hcall.longmode)) { 659 struct compat_xen_add_to_physmap_batch xatpb32; 660 661 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20); 662 if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) { 663 return -EFAULT; 664 } 665 xatpb.domid = xatpb32.domid; 666 xatpb.space = xatpb32.space; 667 xatpb.size = xatpb32.size; 668 669 idxs_gva = xatpb32.idxs.c; 670 gpfns_gva = xatpb32.gpfns.c; 671 errs_gva = xatpb32.errs.c; 672 op_sz = sizeof(uint32_t); 673 } else { 674 if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) { 675 return -EFAULT; 676 } 677 op_sz = sizeof(unsigned long); 678 idxs_gva = (unsigned long)xatpb.idxs.p; 679 gpfns_gva = (unsigned long)xatpb.gpfns.p; 680 errs_gva = (unsigned long)xatpb.errs.p; 681 } 682 683 if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) { 684 return -ESRCH; 685 } 686 687 /* Explicitly invalid for the batch op. Not that we implement it anyway. */ 688 if (xatpb.space == XENMAPSPACE_gmfn_range) { 689 return -EINVAL; 690 } 691 692 while (xatpb.size--) { 693 unsigned long idx = 0; 694 unsigned long gpfn = 0; 695 int err; 696 697 /* For 32-bit compat this only copies the low 32 bits of each */ 698 if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) || 699 kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) { 700 return -EFAULT; 701 } 702 idxs_gva += op_sz; 703 gpfns_gva += op_sz; 704 705 err = add_to_physmap_one(xatpb.space, idx, gpfn); 706 707 if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) { 708 return -EFAULT; 709 } 710 errs_gva += sizeof(err); 711 } 712 return 0; 713 } 714 715 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu, 716 int cmd, uint64_t arg) 717 { 718 int err; 719 720 switch (cmd) { 721 case XENMEM_add_to_physmap: 722 err = do_add_to_physmap(exit, cpu, arg); 723 break; 724 725 case XENMEM_add_to_physmap_batch: 726 err = do_add_to_physmap_batch(exit, cpu, arg); 727 break; 728 729 default: 730 return false; 731 } 732 733 exit->u.hcall.result = err; 734 return true; 735 } 736 737 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu, 738 uint64_t arg) 739 { 740 CPUState *cs = CPU(cpu); 741 struct xen_hvm_param hp; 742 int err = 0; 743 744 /* No need for 32/64 compat handling */ 745 qemu_build_assert(sizeof(hp) == 16); 746 747 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 748 err = -EFAULT; 749 goto out; 750 } 751 752 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 753 err = -ESRCH; 754 goto out; 755 } 756 757 switch (hp.index) { 758 case HVM_PARAM_CALLBACK_IRQ: 759 qemu_mutex_lock_iothread(); 760 err = xen_evtchn_set_callback_param(hp.value); 761 qemu_mutex_unlock_iothread(); 762 xen_set_long_mode(exit->u.hcall.longmode); 763 break; 764 default: 765 return false; 766 } 767 768 out: 769 exit->u.hcall.result = err; 770 return true; 771 } 772 773 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu, 774 uint64_t arg) 775 { 776 CPUState *cs = CPU(cpu); 777 struct xen_hvm_param hp; 778 int err = 0; 779 780 /* No need for 32/64 compat handling */ 781 qemu_build_assert(sizeof(hp) == 16); 782 783 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 784 err = -EFAULT; 785 goto out; 786 } 787 788 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 789 err = -ESRCH; 790 goto out; 791 } 792 793 switch (hp.index) { 794 case HVM_PARAM_STORE_PFN: 795 hp.value = XEN_SPECIAL_PFN(XENSTORE); 796 break; 797 case HVM_PARAM_STORE_EVTCHN: 798 hp.value = xen_xenstore_get_port(); 799 break; 800 default: 801 return false; 802 } 803 804 if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) { 805 err = -EFAULT; 806 } 807 out: 808 exit->u.hcall.result = err; 809 return true; 810 } 811 812 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit, 813 X86CPU *cpu, uint64_t arg) 814 { 815 struct xen_hvm_evtchn_upcall_vector up; 816 CPUState *target_cs; 817 818 /* No need for 32/64 compat handling */ 819 qemu_build_assert(sizeof(up) == 8); 820 821 if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) { 822 return -EFAULT; 823 } 824 825 if (up.vector < 0x10) { 826 return -EINVAL; 827 } 828 829 target_cs = qemu_get_cpu(up.vcpu); 830 if (!target_cs) { 831 return -EINVAL; 832 } 833 834 async_run_on_cpu(target_cs, do_set_vcpu_callback_vector, 835 RUN_ON_CPU_HOST_INT(up.vector)); 836 return 0; 837 } 838 839 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu, 840 int cmd, uint64_t arg) 841 { 842 int ret = -ENOSYS; 843 switch (cmd) { 844 case HVMOP_set_evtchn_upcall_vector: 845 ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, 846 exit->u.hcall.params[0]); 847 break; 848 849 case HVMOP_pagetable_dying: 850 ret = -ENOSYS; 851 break; 852 853 case HVMOP_set_param: 854 return handle_set_param(exit, cpu, arg); 855 856 case HVMOP_get_param: 857 return handle_get_param(exit, cpu, arg); 858 859 default: 860 return false; 861 } 862 863 exit->u.hcall.result = ret; 864 return true; 865 } 866 867 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target, 868 uint64_t arg) 869 { 870 struct vcpu_register_vcpu_info rvi; 871 uint64_t gpa; 872 873 /* No need for 32/64 compat handling */ 874 qemu_build_assert(sizeof(rvi) == 16); 875 qemu_build_assert(sizeof(struct vcpu_info) == 64); 876 877 if (!target) { 878 return -ENOENT; 879 } 880 881 if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) { 882 return -EFAULT; 883 } 884 885 if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) { 886 return -EINVAL; 887 } 888 889 gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset); 890 async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa)); 891 return 0; 892 } 893 894 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target, 895 uint64_t arg) 896 { 897 struct vcpu_register_time_memory_area tma; 898 uint64_t gpa; 899 size_t len; 900 901 /* No need for 32/64 compat handling */ 902 qemu_build_assert(sizeof(tma) == 8); 903 qemu_build_assert(sizeof(struct vcpu_time_info) == 32); 904 905 if (!target) { 906 return -ENOENT; 907 } 908 909 if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) { 910 return -EFAULT; 911 } 912 913 /* 914 * Xen actually uses the GVA and does the translation through the guest 915 * page tables each time. But Linux/KVM uses the GPA, on the assumption 916 * that guests only ever use *global* addresses (kernel virtual addresses) 917 * for it. If Linux is changed to redo the GVA→GPA translation each time, 918 * it will offer a new vCPU attribute for that, and we'll use it instead. 919 */ 920 if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) || 921 len < sizeof(struct vcpu_time_info)) { 922 return -EFAULT; 923 } 924 925 async_run_on_cpu(target, do_set_vcpu_time_info_gpa, 926 RUN_ON_CPU_HOST_ULONG(gpa)); 927 return 0; 928 } 929 930 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target, 931 uint64_t arg) 932 { 933 struct vcpu_register_runstate_memory_area rma; 934 uint64_t gpa; 935 size_t len; 936 937 /* No need for 32/64 compat handling */ 938 qemu_build_assert(sizeof(rma) == 8); 939 /* The runstate area actually does change size, but Linux copes. */ 940 941 if (!target) { 942 return -ENOENT; 943 } 944 945 if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) { 946 return -EFAULT; 947 } 948 949 /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */ 950 if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) { 951 return -EFAULT; 952 } 953 954 async_run_on_cpu(target, do_set_vcpu_runstate_gpa, 955 RUN_ON_CPU_HOST_ULONG(gpa)); 956 return 0; 957 } 958 959 static uint64_t kvm_get_current_ns(void) 960 { 961 struct kvm_clock_data data; 962 int ret; 963 964 ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); 965 if (ret < 0) { 966 fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret)); 967 abort(); 968 } 969 970 return data.clock; 971 } 972 973 static void xen_vcpu_singleshot_timer_event(void *opaque) 974 { 975 CPUState *cpu = opaque; 976 CPUX86State *env = &X86_CPU(cpu)->env; 977 uint16_t port = env->xen_virq[VIRQ_TIMER]; 978 979 if (likely(port)) { 980 xen_evtchn_set_port(port); 981 } 982 983 qemu_mutex_lock(&env->xen_timers_lock); 984 env->xen_singleshot_timer_ns = 0; 985 qemu_mutex_unlock(&env->xen_timers_lock); 986 } 987 988 static void xen_vcpu_periodic_timer_event(void *opaque) 989 { 990 CPUState *cpu = opaque; 991 CPUX86State *env = &X86_CPU(cpu)->env; 992 uint16_t port = env->xen_virq[VIRQ_TIMER]; 993 int64_t qemu_now; 994 995 if (likely(port)) { 996 xen_evtchn_set_port(port); 997 } 998 999 qemu_mutex_lock(&env->xen_timers_lock); 1000 1001 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1002 timer_mod_ns(env->xen_periodic_timer, 1003 qemu_now + env->xen_periodic_timer_period); 1004 1005 qemu_mutex_unlock(&env->xen_timers_lock); 1006 } 1007 1008 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns) 1009 { 1010 CPUX86State *tenv = &X86_CPU(target)->env; 1011 int64_t qemu_now; 1012 1013 timer_del(tenv->xen_periodic_timer); 1014 1015 qemu_mutex_lock(&tenv->xen_timers_lock); 1016 1017 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1018 timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns); 1019 tenv->xen_periodic_timer_period = period_ns; 1020 1021 qemu_mutex_unlock(&tenv->xen_timers_lock); 1022 return 0; 1023 } 1024 1025 #define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL)) 1026 #define MICROSECS(_us) ((int64_t)((_us) * 1000ULL)) 1027 #define STIME_MAX ((time_t)((int64_t)~0ull >> 1)) 1028 /* Chosen so (NOW() + delta) wont overflow without an uptime of 200 years */ 1029 #define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2)) 1030 1031 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target, 1032 uint64_t arg) 1033 { 1034 struct vcpu_set_periodic_timer spt; 1035 1036 qemu_build_assert(sizeof(spt) == 8); 1037 if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) { 1038 return -EFAULT; 1039 } 1040 1041 if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) { 1042 return -EINVAL; 1043 } 1044 1045 return do_set_periodic_timer(target, spt.period_ns); 1046 } 1047 1048 static int vcpuop_stop_periodic_timer(CPUState *target) 1049 { 1050 CPUX86State *tenv = &X86_CPU(target)->env; 1051 1052 qemu_mutex_lock(&tenv->xen_timers_lock); 1053 1054 timer_del(tenv->xen_periodic_timer); 1055 tenv->xen_periodic_timer_period = 0; 1056 1057 qemu_mutex_unlock(&tenv->xen_timers_lock); 1058 return 0; 1059 } 1060 1061 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs, 1062 bool future, bool linux_wa) 1063 { 1064 CPUX86State *env = &X86_CPU(cs)->env; 1065 int64_t now = kvm_get_current_ns(); 1066 int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1067 int64_t delta = timeout_abs - now; 1068 1069 if (future && timeout_abs < now) { 1070 return -ETIME; 1071 } 1072 1073 if (linux_wa && unlikely((int64_t)timeout_abs < 0 || 1074 (delta > 0 && (uint32_t)(delta >> 50) != 0))) { 1075 /* 1076 * Xen has a 'Linux workaround' in do_set_timer_op() which checks 1077 * for negative absolute timeout values (caused by integer 1078 * overflow), and for values about 13 days in the future (2^50ns) 1079 * which would be caused by jiffies overflow. For those cases, it 1080 * sets the timeout 100ms in the future (not *too* soon, since if 1081 * a guest really did set a long timeout on purpose we don't want 1082 * to keep churning CPU time by waking it up). 1083 */ 1084 delta = (100 * SCALE_MS); 1085 timeout_abs = now + delta; 1086 } 1087 1088 qemu_mutex_lock(&env->xen_timers_lock); 1089 1090 timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta); 1091 env->xen_singleshot_timer_ns = now + delta; 1092 1093 qemu_mutex_unlock(&env->xen_timers_lock); 1094 return 0; 1095 } 1096 1097 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg) 1098 { 1099 struct vcpu_set_singleshot_timer sst = { 0 }; 1100 1101 /* 1102 * The struct is a uint64_t followed by a uint32_t. On 32-bit that 1103 * makes it 12 bytes. On 64-bit it gets padded to 16. The parts 1104 * that get used are identical, and there's four bytes of padding 1105 * unused at the end. For true Xen compatibility we should attempt 1106 * to copy the full 16 bytes from 64-bit guests, and return -EFAULT 1107 * if we can't get the padding too. But that's daft. Just copy what 1108 * we need. 1109 */ 1110 qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8); 1111 qemu_build_assert(sizeof(sst) >= 12); 1112 1113 if (kvm_copy_from_gva(cs, arg, &sst, 12)) { 1114 return -EFAULT; 1115 } 1116 1117 return do_set_singleshot_timer(cs, sst.timeout_abs_ns, 1118 !!(sst.flags & VCPU_SSHOTTMR_future), 1119 false); 1120 } 1121 1122 static int vcpuop_stop_singleshot_timer(CPUState *cs) 1123 { 1124 CPUX86State *env = &X86_CPU(cs)->env; 1125 1126 qemu_mutex_lock(&env->xen_timers_lock); 1127 1128 timer_del(env->xen_singleshot_timer); 1129 env->xen_singleshot_timer_ns = 0; 1130 1131 qemu_mutex_unlock(&env->xen_timers_lock); 1132 return 0; 1133 } 1134 1135 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1136 uint64_t timeout) 1137 { 1138 int err; 1139 1140 if (unlikely(timeout == 0)) { 1141 err = vcpuop_stop_singleshot_timer(CPU(cpu)); 1142 } else { 1143 err = do_set_singleshot_timer(CPU(cpu), timeout, false, true); 1144 } 1145 exit->u.hcall.result = err; 1146 return true; 1147 } 1148 1149 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1150 int cmd, int vcpu_id, uint64_t arg) 1151 { 1152 CPUState *cs = CPU(cpu); 1153 CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id); 1154 int err; 1155 1156 if (!dest) { 1157 err = -ENOENT; 1158 goto out; 1159 } 1160 1161 switch (cmd) { 1162 case VCPUOP_register_runstate_memory_area: 1163 err = vcpuop_register_runstate_info(cs, dest, arg); 1164 break; 1165 case VCPUOP_register_vcpu_time_memory_area: 1166 err = vcpuop_register_vcpu_time_info(cs, dest, arg); 1167 break; 1168 case VCPUOP_register_vcpu_info: 1169 err = vcpuop_register_vcpu_info(cs, dest, arg); 1170 break; 1171 case VCPUOP_set_singleshot_timer: { 1172 if (cs->cpu_index == vcpu_id) { 1173 err = vcpuop_set_singleshot_timer(dest, arg); 1174 } else { 1175 err = -EINVAL; 1176 } 1177 break; 1178 } 1179 case VCPUOP_stop_singleshot_timer: 1180 if (cs->cpu_index == vcpu_id) { 1181 err = vcpuop_stop_singleshot_timer(dest); 1182 } else { 1183 err = -EINVAL; 1184 } 1185 break; 1186 case VCPUOP_set_periodic_timer: { 1187 err = vcpuop_set_periodic_timer(cs, dest, arg); 1188 break; 1189 } 1190 case VCPUOP_stop_periodic_timer: 1191 err = vcpuop_stop_periodic_timer(dest); 1192 break; 1193 1194 default: 1195 return false; 1196 } 1197 1198 out: 1199 exit->u.hcall.result = err; 1200 return true; 1201 } 1202 1203 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1204 int cmd, uint64_t arg) 1205 { 1206 CPUState *cs = CPU(cpu); 1207 int err = -ENOSYS; 1208 1209 switch (cmd) { 1210 case EVTCHNOP_init_control: 1211 case EVTCHNOP_expand_array: 1212 case EVTCHNOP_set_priority: 1213 /* We do not support FIFO channels at this point */ 1214 err = -ENOSYS; 1215 break; 1216 1217 case EVTCHNOP_status: { 1218 struct evtchn_status status; 1219 1220 qemu_build_assert(sizeof(status) == 24); 1221 if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) { 1222 err = -EFAULT; 1223 break; 1224 } 1225 1226 err = xen_evtchn_status_op(&status); 1227 if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) { 1228 err = -EFAULT; 1229 } 1230 break; 1231 } 1232 case EVTCHNOP_close: { 1233 struct evtchn_close close; 1234 1235 qemu_build_assert(sizeof(close) == 4); 1236 if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) { 1237 err = -EFAULT; 1238 break; 1239 } 1240 1241 err = xen_evtchn_close_op(&close); 1242 break; 1243 } 1244 case EVTCHNOP_unmask: { 1245 struct evtchn_unmask unmask; 1246 1247 qemu_build_assert(sizeof(unmask) == 4); 1248 if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) { 1249 err = -EFAULT; 1250 break; 1251 } 1252 1253 err = xen_evtchn_unmask_op(&unmask); 1254 break; 1255 } 1256 case EVTCHNOP_bind_virq: { 1257 struct evtchn_bind_virq virq; 1258 1259 qemu_build_assert(sizeof(virq) == 12); 1260 if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) { 1261 err = -EFAULT; 1262 break; 1263 } 1264 1265 err = xen_evtchn_bind_virq_op(&virq); 1266 if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) { 1267 err = -EFAULT; 1268 } 1269 break; 1270 } 1271 case EVTCHNOP_bind_pirq: { 1272 struct evtchn_bind_pirq pirq; 1273 1274 qemu_build_assert(sizeof(pirq) == 12); 1275 if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) { 1276 err = -EFAULT; 1277 break; 1278 } 1279 1280 err = xen_evtchn_bind_pirq_op(&pirq); 1281 if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) { 1282 err = -EFAULT; 1283 } 1284 break; 1285 } 1286 case EVTCHNOP_bind_ipi: { 1287 struct evtchn_bind_ipi ipi; 1288 1289 qemu_build_assert(sizeof(ipi) == 8); 1290 if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) { 1291 err = -EFAULT; 1292 break; 1293 } 1294 1295 err = xen_evtchn_bind_ipi_op(&ipi); 1296 if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) { 1297 err = -EFAULT; 1298 } 1299 break; 1300 } 1301 case EVTCHNOP_send: { 1302 struct evtchn_send send; 1303 1304 qemu_build_assert(sizeof(send) == 4); 1305 if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) { 1306 err = -EFAULT; 1307 break; 1308 } 1309 1310 err = xen_evtchn_send_op(&send); 1311 break; 1312 } 1313 case EVTCHNOP_alloc_unbound: { 1314 struct evtchn_alloc_unbound alloc; 1315 1316 qemu_build_assert(sizeof(alloc) == 8); 1317 if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) { 1318 err = -EFAULT; 1319 break; 1320 } 1321 1322 err = xen_evtchn_alloc_unbound_op(&alloc); 1323 if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) { 1324 err = -EFAULT; 1325 } 1326 break; 1327 } 1328 case EVTCHNOP_bind_interdomain: { 1329 struct evtchn_bind_interdomain interdomain; 1330 1331 qemu_build_assert(sizeof(interdomain) == 12); 1332 if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) { 1333 err = -EFAULT; 1334 break; 1335 } 1336 1337 err = xen_evtchn_bind_interdomain_op(&interdomain); 1338 if (!err && 1339 kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) { 1340 err = -EFAULT; 1341 } 1342 break; 1343 } 1344 case EVTCHNOP_bind_vcpu: { 1345 struct evtchn_bind_vcpu vcpu; 1346 1347 qemu_build_assert(sizeof(vcpu) == 8); 1348 if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) { 1349 err = -EFAULT; 1350 break; 1351 } 1352 1353 err = xen_evtchn_bind_vcpu_op(&vcpu); 1354 break; 1355 } 1356 case EVTCHNOP_reset: { 1357 struct evtchn_reset reset; 1358 1359 qemu_build_assert(sizeof(reset) == 2); 1360 if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) { 1361 err = -EFAULT; 1362 break; 1363 } 1364 1365 err = xen_evtchn_reset_op(&reset); 1366 break; 1367 } 1368 default: 1369 return false; 1370 } 1371 1372 exit->u.hcall.result = err; 1373 return true; 1374 } 1375 1376 int kvm_xen_soft_reset(void) 1377 { 1378 CPUState *cpu; 1379 int err; 1380 1381 assert(qemu_mutex_iothread_locked()); 1382 1383 trace_kvm_xen_soft_reset(); 1384 1385 err = xen_evtchn_soft_reset(); 1386 if (err) { 1387 return err; 1388 } 1389 1390 /* 1391 * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly, 1392 * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to 1393 * to deliver to the timer interrupt and treats that as 'disabled'. 1394 */ 1395 err = xen_evtchn_set_callback_param(0); 1396 if (err) { 1397 return err; 1398 } 1399 1400 CPU_FOREACH(cpu) { 1401 async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL); 1402 } 1403 1404 err = xen_overlay_map_shinfo_page(INVALID_GFN); 1405 if (err) { 1406 return err; 1407 } 1408 1409 err = xen_gnttab_reset(); 1410 if (err) { 1411 return err; 1412 } 1413 1414 err = xen_xenstore_reset(); 1415 if (err) { 1416 return err; 1417 } 1418 1419 return 0; 1420 } 1421 1422 static int schedop_shutdown(CPUState *cs, uint64_t arg) 1423 { 1424 struct sched_shutdown shutdown; 1425 int ret = 0; 1426 1427 /* No need for 32/64 compat handling */ 1428 qemu_build_assert(sizeof(shutdown) == 4); 1429 1430 if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) { 1431 return -EFAULT; 1432 } 1433 1434 switch (shutdown.reason) { 1435 case SHUTDOWN_crash: 1436 cpu_dump_state(cs, stderr, CPU_DUMP_CODE); 1437 qemu_system_guest_panicked(NULL); 1438 break; 1439 1440 case SHUTDOWN_reboot: 1441 qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET); 1442 break; 1443 1444 case SHUTDOWN_poweroff: 1445 qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN); 1446 break; 1447 1448 case SHUTDOWN_soft_reset: 1449 qemu_mutex_lock_iothread(); 1450 ret = kvm_xen_soft_reset(); 1451 qemu_mutex_unlock_iothread(); 1452 break; 1453 1454 default: 1455 ret = -EINVAL; 1456 break; 1457 } 1458 1459 return ret; 1460 } 1461 1462 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1463 int cmd, uint64_t arg) 1464 { 1465 CPUState *cs = CPU(cpu); 1466 int err = -ENOSYS; 1467 1468 switch (cmd) { 1469 case SCHEDOP_shutdown: 1470 err = schedop_shutdown(cs, arg); 1471 break; 1472 1473 case SCHEDOP_poll: 1474 /* 1475 * Linux will panic if this doesn't work. Just yield; it's not 1476 * worth overthinking it because with event channel handling 1477 * in KVM, the kernel will intercept this and it will never 1478 * reach QEMU anyway. The semantics of the hypercall explicltly 1479 * permit spurious wakeups. 1480 */ 1481 case SCHEDOP_yield: 1482 sched_yield(); 1483 err = 0; 1484 break; 1485 1486 default: 1487 return false; 1488 } 1489 1490 exit->u.hcall.result = err; 1491 return true; 1492 } 1493 1494 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1495 int cmd, uint64_t arg, int count) 1496 { 1497 CPUState *cs = CPU(cpu); 1498 int err; 1499 1500 switch (cmd) { 1501 case GNTTABOP_set_version: { 1502 struct gnttab_set_version set; 1503 1504 qemu_build_assert(sizeof(set) == 4); 1505 if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) { 1506 err = -EFAULT; 1507 break; 1508 } 1509 1510 err = xen_gnttab_set_version_op(&set); 1511 if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) { 1512 err = -EFAULT; 1513 } 1514 break; 1515 } 1516 case GNTTABOP_get_version: { 1517 struct gnttab_get_version get; 1518 1519 qemu_build_assert(sizeof(get) == 8); 1520 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) { 1521 err = -EFAULT; 1522 break; 1523 } 1524 1525 err = xen_gnttab_get_version_op(&get); 1526 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) { 1527 err = -EFAULT; 1528 } 1529 break; 1530 } 1531 case GNTTABOP_query_size: { 1532 struct gnttab_query_size size; 1533 1534 qemu_build_assert(sizeof(size) == 16); 1535 if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) { 1536 err = -EFAULT; 1537 break; 1538 } 1539 1540 err = xen_gnttab_query_size_op(&size); 1541 if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) { 1542 err = -EFAULT; 1543 } 1544 break; 1545 } 1546 case GNTTABOP_setup_table: 1547 case GNTTABOP_copy: 1548 case GNTTABOP_map_grant_ref: 1549 case GNTTABOP_unmap_grant_ref: 1550 case GNTTABOP_swap_grant_ref: 1551 return false; 1552 1553 default: 1554 /* Xen explicitly returns -ENOSYS to HVM guests for all others */ 1555 err = -ENOSYS; 1556 break; 1557 } 1558 1559 exit->u.hcall.result = err; 1560 return true; 1561 } 1562 1563 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1564 int cmd, uint64_t arg) 1565 { 1566 CPUState *cs = CPU(cpu); 1567 int err; 1568 1569 switch (cmd) { 1570 case PHYSDEVOP_map_pirq: { 1571 struct physdev_map_pirq map; 1572 1573 if (hypercall_compat32(exit->u.hcall.longmode)) { 1574 struct compat_physdev_map_pirq *map32 = (void *)↦ 1575 1576 if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) { 1577 return -EFAULT; 1578 } 1579 1580 /* 1581 * The only thing that's different is the alignment of the 1582 * uint64_t table_base at the end, which gets padding to make 1583 * it 64-bit aligned in the 64-bit version. 1584 */ 1585 qemu_build_assert(sizeof(*map32) == 36); 1586 qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) == 1587 offsetof(struct compat_physdev_map_pirq, entry_nr)); 1588 memmove(&map.table_base, &map32->table_base, sizeof(map.table_base)); 1589 } else { 1590 if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) { 1591 err = -EFAULT; 1592 break; 1593 } 1594 } 1595 err = xen_physdev_map_pirq(&map); 1596 /* 1597 * Since table_base is an IN parameter and won't be changed, just 1598 * copy the size of the compat structure back to the guest. 1599 */ 1600 if (!err && kvm_copy_to_gva(cs, arg, &map, 1601 sizeof(struct compat_physdev_map_pirq))) { 1602 err = -EFAULT; 1603 } 1604 break; 1605 } 1606 case PHYSDEVOP_unmap_pirq: { 1607 struct physdev_unmap_pirq unmap; 1608 1609 qemu_build_assert(sizeof(unmap) == 8); 1610 if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) { 1611 err = -EFAULT; 1612 break; 1613 } 1614 1615 err = xen_physdev_unmap_pirq(&unmap); 1616 if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) { 1617 err = -EFAULT; 1618 } 1619 break; 1620 } 1621 case PHYSDEVOP_eoi: { 1622 struct physdev_eoi eoi; 1623 1624 qemu_build_assert(sizeof(eoi) == 4); 1625 if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) { 1626 err = -EFAULT; 1627 break; 1628 } 1629 1630 err = xen_physdev_eoi_pirq(&eoi); 1631 if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) { 1632 err = -EFAULT; 1633 } 1634 break; 1635 } 1636 case PHYSDEVOP_irq_status_query: { 1637 struct physdev_irq_status_query query; 1638 1639 qemu_build_assert(sizeof(query) == 8); 1640 if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) { 1641 err = -EFAULT; 1642 break; 1643 } 1644 1645 err = xen_physdev_query_pirq(&query); 1646 if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) { 1647 err = -EFAULT; 1648 } 1649 break; 1650 } 1651 case PHYSDEVOP_get_free_pirq: { 1652 struct physdev_get_free_pirq get; 1653 1654 qemu_build_assert(sizeof(get) == 8); 1655 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) { 1656 err = -EFAULT; 1657 break; 1658 } 1659 1660 err = xen_physdev_get_free_pirq(&get); 1661 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) { 1662 err = -EFAULT; 1663 } 1664 break; 1665 } 1666 case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */ 1667 err = -ENOSYS; 1668 break; 1669 1670 default: 1671 return false; 1672 } 1673 1674 exit->u.hcall.result = err; 1675 return true; 1676 } 1677 1678 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit) 1679 { 1680 uint16_t code = exit->u.hcall.input; 1681 1682 if (exit->u.hcall.cpl > 0) { 1683 exit->u.hcall.result = -EPERM; 1684 return true; 1685 } 1686 1687 switch (code) { 1688 case __HYPERVISOR_set_timer_op: 1689 if (exit->u.hcall.longmode) { 1690 return kvm_xen_hcall_set_timer_op(exit, cpu, 1691 exit->u.hcall.params[0]); 1692 } else { 1693 /* In 32-bit mode, the 64-bit timer value is in two args. */ 1694 uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 | 1695 (uint32_t)exit->u.hcall.params[0]; 1696 return kvm_xen_hcall_set_timer_op(exit, cpu, val); 1697 } 1698 case __HYPERVISOR_grant_table_op: 1699 return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0], 1700 exit->u.hcall.params[1], 1701 exit->u.hcall.params[2]); 1702 case __HYPERVISOR_sched_op: 1703 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0], 1704 exit->u.hcall.params[1]); 1705 case __HYPERVISOR_event_channel_op: 1706 return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0], 1707 exit->u.hcall.params[1]); 1708 case __HYPERVISOR_vcpu_op: 1709 return kvm_xen_hcall_vcpu_op(exit, cpu, 1710 exit->u.hcall.params[0], 1711 exit->u.hcall.params[1], 1712 exit->u.hcall.params[2]); 1713 case __HYPERVISOR_hvm_op: 1714 return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0], 1715 exit->u.hcall.params[1]); 1716 case __HYPERVISOR_memory_op: 1717 return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0], 1718 exit->u.hcall.params[1]); 1719 case __HYPERVISOR_physdev_op: 1720 return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0], 1721 exit->u.hcall.params[1]); 1722 case __HYPERVISOR_xen_version: 1723 return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0], 1724 exit->u.hcall.params[1]); 1725 default: 1726 return false; 1727 } 1728 } 1729 1730 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit) 1731 { 1732 if (exit->type != KVM_EXIT_XEN_HCALL) { 1733 return -1; 1734 } 1735 1736 /* 1737 * The kernel latches the guest 32/64 mode when the MSR is used to fill 1738 * the hypercall page. So if we see a hypercall in a mode that doesn't 1739 * match our own idea of the guest mode, fetch the kernel's idea of the 1740 * "long mode" to remain in sync. 1741 */ 1742 if (exit->u.hcall.longmode != xen_is_long_mode()) { 1743 xen_sync_long_mode(); 1744 } 1745 1746 if (!do_kvm_xen_handle_exit(cpu, exit)) { 1747 /* 1748 * Some hypercalls will be deliberately "implemented" by returning 1749 * -ENOSYS. This case is for hypercalls which are unexpected. 1750 */ 1751 exit->u.hcall.result = -ENOSYS; 1752 qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %" 1753 PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n", 1754 (uint64_t)exit->u.hcall.input, 1755 (uint64_t)exit->u.hcall.params[0], 1756 (uint64_t)exit->u.hcall.params[1], 1757 (uint64_t)exit->u.hcall.params[2]); 1758 } 1759 1760 trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl, 1761 exit->u.hcall.input, exit->u.hcall.params[0], 1762 exit->u.hcall.params[1], exit->u.hcall.params[2], 1763 exit->u.hcall.result); 1764 return 0; 1765 } 1766 1767 uint16_t kvm_xen_get_gnttab_max_frames(void) 1768 { 1769 KVMState *s = KVM_STATE(current_accel()); 1770 return s->xen_gnttab_max_frames; 1771 } 1772 1773 uint16_t kvm_xen_get_evtchn_max_pirq(void) 1774 { 1775 KVMState *s = KVM_STATE(current_accel()); 1776 return s->xen_evtchn_max_pirq; 1777 } 1778 1779 int kvm_put_xen_state(CPUState *cs) 1780 { 1781 X86CPU *cpu = X86_CPU(cs); 1782 CPUX86State *env = &cpu->env; 1783 uint64_t gpa; 1784 int ret; 1785 1786 gpa = env->xen_vcpu_info_gpa; 1787 if (gpa == INVALID_GPA) { 1788 gpa = env->xen_vcpu_info_default_gpa; 1789 } 1790 1791 if (gpa != INVALID_GPA) { 1792 ret = set_vcpu_info(cs, gpa); 1793 if (ret < 0) { 1794 return ret; 1795 } 1796 } 1797 1798 gpa = env->xen_vcpu_time_info_gpa; 1799 if (gpa != INVALID_GPA) { 1800 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 1801 gpa); 1802 if (ret < 0) { 1803 return ret; 1804 } 1805 } 1806 1807 gpa = env->xen_vcpu_runstate_gpa; 1808 if (gpa != INVALID_GPA) { 1809 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 1810 gpa); 1811 if (ret < 0) { 1812 return ret; 1813 } 1814 } 1815 1816 if (env->xen_periodic_timer_period) { 1817 ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period); 1818 if (ret < 0) { 1819 return ret; 1820 } 1821 } 1822 1823 if (!kvm_xen_has_cap(EVTCHN_SEND)) { 1824 /* 1825 * If the kernel has EVTCHN_SEND support then it handles timers too, 1826 * so the timer will be restored by kvm_xen_set_vcpu_timer() below. 1827 */ 1828 if (env->xen_singleshot_timer_ns) { 1829 ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns, 1830 false, false); 1831 if (ret < 0) { 1832 return ret; 1833 } 1834 } 1835 return 0; 1836 } 1837 1838 if (env->xen_vcpu_callback_vector) { 1839 ret = kvm_xen_set_vcpu_callback_vector(cs); 1840 if (ret < 0) { 1841 return ret; 1842 } 1843 } 1844 1845 if (env->xen_virq[VIRQ_TIMER]) { 1846 ret = kvm_xen_set_vcpu_timer(cs); 1847 if (ret < 0) { 1848 return ret; 1849 } 1850 } 1851 return 0; 1852 } 1853 1854 int kvm_get_xen_state(CPUState *cs) 1855 { 1856 X86CPU *cpu = X86_CPU(cs); 1857 CPUX86State *env = &cpu->env; 1858 uint64_t gpa; 1859 int ret; 1860 1861 /* 1862 * The kernel does not mark vcpu_info as dirty when it delivers interrupts 1863 * to it. It's up to userspace to *assume* that any page shared thus is 1864 * always considered dirty. The shared_info page is different since it's 1865 * an overlay and migrated separately anyway. 1866 */ 1867 gpa = env->xen_vcpu_info_gpa; 1868 if (gpa == INVALID_GPA) { 1869 gpa = env->xen_vcpu_info_default_gpa; 1870 } 1871 if (gpa != INVALID_GPA) { 1872 MemoryRegionSection mrs = memory_region_find(get_system_memory(), 1873 gpa, 1874 sizeof(struct vcpu_info)); 1875 if (mrs.mr && 1876 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) { 1877 memory_region_set_dirty(mrs.mr, mrs.offset_within_region, 1878 sizeof(struct vcpu_info)); 1879 } 1880 } 1881 1882 if (!kvm_xen_has_cap(EVTCHN_SEND)) { 1883 return 0; 1884 } 1885 1886 /* 1887 * If the kernel is accelerating timers, read out the current value of the 1888 * singleshot timer deadline. 1889 */ 1890 if (env->xen_virq[VIRQ_TIMER]) { 1891 struct kvm_xen_vcpu_attr va = { 1892 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, 1893 }; 1894 ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va); 1895 if (ret < 0) { 1896 return ret; 1897 } 1898 env->xen_singleshot_timer_ns = va.u.timer.expires_ns; 1899 } 1900 1901 return 0; 1902 } 1903