1 /* 2 * Xen HVM emulation support in KVM 3 * 4 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. 5 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 6 * 7 * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 * See the COPYING file in the top-level directory. 9 * 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/log.h" 14 #include "qemu/main-loop.h" 15 #include "qemu/error-report.h" 16 #include "hw/xen/xen.h" 17 #include "sysemu/kvm_int.h" 18 #include "sysemu/kvm_xen.h" 19 #include "kvm/kvm_i386.h" 20 #include "exec/address-spaces.h" 21 #include "xen-emu.h" 22 #include "trace.h" 23 #include "sysemu/runstate.h" 24 25 #include "hw/pci/msi.h" 26 #include "hw/i386/apic-msidef.h" 27 #include "hw/i386/e820_memory_layout.h" 28 #include "hw/i386/kvm/xen_overlay.h" 29 #include "hw/i386/kvm/xen_evtchn.h" 30 #include "hw/i386/kvm/xen_gnttab.h" 31 #include "hw/i386/kvm/xen_xenstore.h" 32 33 #include "hw/xen/interface/version.h" 34 #include "hw/xen/interface/sched.h" 35 #include "hw/xen/interface/memory.h" 36 #include "hw/xen/interface/hvm/hvm_op.h" 37 #include "hw/xen/interface/hvm/params.h" 38 #include "hw/xen/interface/vcpu.h" 39 #include "hw/xen/interface/event_channel.h" 40 #include "hw/xen/interface/grant_table.h" 41 42 #include "xen-compat.h" 43 44 static void xen_vcpu_singleshot_timer_event(void *opaque); 45 static void xen_vcpu_periodic_timer_event(void *opaque); 46 static int vcpuop_stop_singleshot_timer(CPUState *cs); 47 48 #ifdef TARGET_X86_64 49 #define hypercall_compat32(longmode) (!(longmode)) 50 #else 51 #define hypercall_compat32(longmode) (false) 52 #endif 53 54 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa, 55 size_t *len, bool is_write) 56 { 57 struct kvm_translation tr = { 58 .linear_address = gva, 59 }; 60 61 if (len) { 62 *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK); 63 } 64 65 if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid || 66 (is_write && !tr.writeable)) { 67 return false; 68 } 69 *gpa = tr.physical_address; 70 return true; 71 } 72 73 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz, 74 bool is_write) 75 { 76 uint8_t *buf = (uint8_t *)_buf; 77 uint64_t gpa; 78 size_t len; 79 80 while (sz) { 81 if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) { 82 return -EFAULT; 83 } 84 if (len > sz) { 85 len = sz; 86 } 87 88 cpu_physical_memory_rw(gpa, buf, len, is_write); 89 90 buf += len; 91 sz -= len; 92 gva += len; 93 } 94 95 return 0; 96 } 97 98 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf, 99 size_t sz) 100 { 101 return kvm_gva_rw(cs, gva, buf, sz, false); 102 } 103 104 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf, 105 size_t sz) 106 { 107 return kvm_gva_rw(cs, gva, buf, sz, true); 108 } 109 110 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr) 111 { 112 const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | 113 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO; 114 struct kvm_xen_hvm_config cfg = { 115 .msr = hypercall_msr, 116 .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, 117 }; 118 int xen_caps, ret; 119 120 xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM); 121 if (required_caps & ~xen_caps) { 122 error_report("kvm: Xen HVM guest support not present or insufficient"); 123 return -ENOSYS; 124 } 125 126 if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) { 127 struct kvm_xen_hvm_attr ha = { 128 .type = KVM_XEN_ATTR_TYPE_XEN_VERSION, 129 .u.xen_version = s->xen_version, 
        };
        (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);

        cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
    }

    ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
    if (ret < 0) {
        error_report("kvm: Failed to enable Xen HVM support: %s",
                     strerror(-ret));
        return ret;
    }

    /* If called a second time, don't repeat the rest of the setup. */
    if (s->xen_caps) {
        return 0;
    }

    /*
     * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
     * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
     *
     * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
     * such things to be polled at precisely the right time. We *could* do
     * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
     * the moment the IRQ is acked, and see if it should be reasserted.
     *
     * But the in-kernel irqchip is deprecated, so we're unlikely to add
     * that support in the kernel. Insist on using the split irqchip mode
     * instead.
     *
     * This leaves us polling for the level going low in QEMU, which lacks
     * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
     * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
     * the device (for which it has to unmap the device and trap access, for
     * some period after an IRQ!!). In the Xen case, we do it on exit from
     * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
     * Which is kind of icky, but less so than the VFIO one. I may fix them
     * both later...
     */
    if (!kvm_kernel_irqchip_split()) {
        error_report("kvm: Xen support requires kernel-irqchip=split");
        return -EINVAL;
    }

    s->xen_caps = xen_caps;

    /* Tell fw_cfg to notify the BIOS to reserve the range. */
    ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
                         E820_RESERVED);
    if (ret < 0) {
        fprintf(stderr, "e820_add_entry() table is full\n");
        return ret;
    }

    /* The page couldn't be overlaid until KVM was initialized */
    xen_xenstore_reset();

    return 0;
}

int kvm_xen_init_vcpu(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    int err;

    /*
     * The kernel needs to know the Xen/ACPI vCPU ID because that's
     * what the guest uses in hypercalls such as timers. It doesn't
     * match the APIC ID which is generally used for talking to the
     * kernel about vCPUs. And if vCPU threads race with creating
     * their KVM vCPUs out of order, it doesn't necessarily match
     * with the kernel's internal vCPU indices either.
     */
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
            .u.vcpu_id = cs->cpu_index,
        };
        err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
        if (err) {
            error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
                         strerror(-err));
            return err;
        }
    }

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;

    qemu_mutex_init(&env->xen_timers_lock);
    env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                             xen_vcpu_singleshot_timer_event,
                                             cpu);
    if (!env->xen_singleshot_timer) {
        return -ENOMEM;
    }
    env->xen_singleshot_timer->opaque = cs;

    env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           xen_vcpu_periodic_timer_event,
                                           cpu);
    if (!env->xen_periodic_timer) {
        return -ENOMEM;
    }
    env->xen_periodic_timer->opaque = cs;

    return 0;
}

uint32_t kvm_xen_get_caps(void)
{
    return kvm_state->xen_caps;
}

static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
                                      int cmd, uint64_t arg)
{
    int err = 0;

    switch (cmd) {
    case XENVER_get_features: {
        struct xen_feature_info fi;

        /* No need for 32/64 compat handling */
        qemu_build_assert(sizeof(fi) == 8);

        err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
        if (err) {
            break;
        }

        fi.submap = 0;
        if (fi.submap_idx == 0) {
            fi.submap |= 1 << XENFEAT_writable_page_tables |
                         1 << XENFEAT_writable_descriptor_tables |
                         1 << XENFEAT_auto_translated_physmap |
                         1 << XENFEAT_supervisor_mode_kernel |
                         1 << XENFEAT_hvm_callback_vector |
                         1 << XENFEAT_hvm_safe_pvclock |
                         1 << XENFEAT_hvm_pirqs;
        }

        err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
        break;
    }

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
{
    struct kvm_xen_vcpu_attr xhsi;

    xhsi.type = type;
    xhsi.u.gpa = gpa;

    trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
}

static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
{
    uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    struct kvm_xen_vcpu_attr xva;

    xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
    xva.u.vector = vector;

    trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);

    /* This is a per-vCPU attribute, so it takes the vCPU (not the VM) ioctl. */
    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
}

static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_callback_vector = data.host_int;

    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);
    }
}

static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    if (mrs.mr &&
mrs.mr->ram_block && 341 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) { 342 vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block, 343 mrs.offset_within_region); 344 } 345 if (!vcpu_info_hva) { 346 if (mrs.mr) { 347 memory_region_unref(mrs.mr); 348 mrs.mr = NULL; 349 } 350 ret = -EINVAL; 351 } 352 353 out: 354 if (env->xen_vcpu_info_mr) { 355 memory_region_unref(env->xen_vcpu_info_mr); 356 } 357 env->xen_vcpu_info_hva = vcpu_info_hva; 358 env->xen_vcpu_info_mr = mrs.mr; 359 return ret; 360 } 361 362 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data) 363 { 364 X86CPU *cpu = X86_CPU(cs); 365 CPUX86State *env = &cpu->env; 366 367 env->xen_vcpu_info_default_gpa = data.host_ulong; 368 369 /* Changing the default does nothing if a vcpu_info was explicitly set. */ 370 if (env->xen_vcpu_info_gpa == INVALID_GPA) { 371 set_vcpu_info(cs, env->xen_vcpu_info_default_gpa); 372 } 373 } 374 375 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data) 376 { 377 X86CPU *cpu = X86_CPU(cs); 378 CPUX86State *env = &cpu->env; 379 380 env->xen_vcpu_info_gpa = data.host_ulong; 381 382 set_vcpu_info(cs, env->xen_vcpu_info_gpa); 383 } 384 385 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id) 386 { 387 CPUState *cs = qemu_get_cpu(vcpu_id); 388 if (!cs) { 389 return NULL; 390 } 391 392 return X86_CPU(cs)->env.xen_vcpu_info_hva; 393 } 394 395 void kvm_xen_maybe_deassert_callback(CPUState *cs) 396 { 397 CPUX86State *env = &X86_CPU(cs)->env; 398 struct vcpu_info *vi = env->xen_vcpu_info_hva; 399 if (!vi) { 400 return; 401 } 402 403 /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */ 404 if (!vi->evtchn_upcall_pending) { 405 qemu_mutex_lock_iothread(); 406 /* 407 * Check again now we have the lock, because it may have been 408 * asserted in the interim. And we don't want to take the lock 409 * every time because this is a fast path. 410 */ 411 if (!vi->evtchn_upcall_pending) { 412 X86_CPU(cs)->env.xen_callback_asserted = false; 413 xen_evtchn_set_callback_level(0); 414 } 415 qemu_mutex_unlock_iothread(); 416 } 417 } 418 419 void kvm_xen_set_callback_asserted(void) 420 { 421 CPUState *cs = qemu_get_cpu(0); 422 423 if (cs) { 424 X86_CPU(cs)->env.xen_callback_asserted = true; 425 } 426 } 427 428 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type) 429 { 430 CPUState *cs = qemu_get_cpu(vcpu_id); 431 uint8_t vector; 432 433 if (!cs) { 434 return; 435 } 436 437 vector = X86_CPU(cs)->env.xen_vcpu_callback_vector; 438 if (vector) { 439 /* 440 * The per-vCPU callback vector injected via lapic. Just 441 * deliver it as an MSI. 442 */ 443 MSIMessage msg = { 444 .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id, 445 .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT), 446 }; 447 kvm_irqchip_send_msi(kvm_state, msg); 448 return; 449 } 450 451 switch (type) { 452 case HVM_PARAM_CALLBACK_TYPE_VECTOR: 453 /* 454 * If the evtchn_upcall_pending field in the vcpu_info is set, then 455 * KVM will automatically deliver the vector on entering the vCPU 456 * so all we have to do is kick it out. 
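         *
         * (The kick itself just forces the vCPU thread out of KVM_RUN, or out
         * of halt; nothing is injected from here. On the next entry KVM
         * notices evtchn_upcall_pending and delivers the configured vector.)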
457 */ 458 qemu_cpu_kick(cs); 459 break; 460 461 case HVM_PARAM_CALLBACK_TYPE_GSI: 462 case HVM_PARAM_CALLBACK_TYPE_PCI_INTX: 463 if (vcpu_id == 0) { 464 xen_evtchn_set_callback_level(1); 465 } 466 break; 467 } 468 } 469 470 /* Must always be called with xen_timers_lock held */ 471 static int kvm_xen_set_vcpu_timer(CPUState *cs) 472 { 473 X86CPU *cpu = X86_CPU(cs); 474 CPUX86State *env = &cpu->env; 475 476 struct kvm_xen_vcpu_attr va = { 477 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, 478 .u.timer.port = env->xen_virq[VIRQ_TIMER], 479 .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, 480 .u.timer.expires_ns = env->xen_singleshot_timer_ns, 481 }; 482 483 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va); 484 } 485 486 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data) 487 { 488 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 489 kvm_xen_set_vcpu_timer(cs); 490 } 491 492 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port) 493 { 494 CPUState *cs = qemu_get_cpu(vcpu_id); 495 496 if (!cs) { 497 return -ENOENT; 498 } 499 500 /* cpu.h doesn't include the actual Xen header. */ 501 qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS); 502 503 if (virq >= NR_VIRQS) { 504 return -EINVAL; 505 } 506 507 if (port && X86_CPU(cs)->env.xen_virq[virq]) { 508 return -EEXIST; 509 } 510 511 X86_CPU(cs)->env.xen_virq[virq] = port; 512 if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) { 513 async_run_on_cpu(cs, do_set_vcpu_timer_virq, 514 RUN_ON_CPU_HOST_INT(port)); 515 } 516 return 0; 517 } 518 519 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data) 520 { 521 X86CPU *cpu = X86_CPU(cs); 522 CPUX86State *env = &cpu->env; 523 524 env->xen_vcpu_time_info_gpa = data.host_ulong; 525 526 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 527 env->xen_vcpu_time_info_gpa); 528 } 529 530 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data) 531 { 532 X86CPU *cpu = X86_CPU(cs); 533 CPUX86State *env = &cpu->env; 534 535 env->xen_vcpu_runstate_gpa = data.host_ulong; 536 537 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 538 env->xen_vcpu_runstate_gpa); 539 } 540 541 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data) 542 { 543 X86CPU *cpu = X86_CPU(cs); 544 CPUX86State *env = &cpu->env; 545 546 env->xen_vcpu_info_gpa = INVALID_GPA; 547 env->xen_vcpu_info_default_gpa = INVALID_GPA; 548 env->xen_vcpu_time_info_gpa = INVALID_GPA; 549 env->xen_vcpu_runstate_gpa = INVALID_GPA; 550 env->xen_vcpu_callback_vector = 0; 551 memset(env->xen_virq, 0, sizeof(env->xen_virq)); 552 553 set_vcpu_info(cs, INVALID_GPA); 554 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 555 INVALID_GPA); 556 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 557 INVALID_GPA); 558 if (kvm_xen_has_cap(EVTCHN_SEND)) { 559 kvm_xen_set_vcpu_callback_vector(cs); 560 561 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 562 env->xen_singleshot_timer_ns = 0; 563 kvm_xen_set_vcpu_timer(cs); 564 } else { 565 vcpuop_stop_singleshot_timer(cs); 566 }; 567 568 } 569 570 static int xen_set_shared_info(uint64_t gfn) 571 { 572 uint64_t gpa = gfn << TARGET_PAGE_BITS; 573 int i, err; 574 575 QEMU_IOTHREAD_LOCK_GUARD(); 576 577 /* 578 * The xen_overlay device tells KVM about it too, since it had to 579 * do that on migration load anyway (unless we're going to jump 580 * through lots of hoops to maintain the fiction that this isn't 581 * KVM-specific. 
582 */ 583 err = xen_overlay_map_shinfo_page(gpa); 584 if (err) { 585 return err; 586 } 587 588 trace_kvm_xen_set_shared_info(gfn); 589 590 for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) { 591 CPUState *cpu = qemu_get_cpu(i); 592 if (cpu) { 593 async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa, 594 RUN_ON_CPU_HOST_ULONG(gpa)); 595 } 596 gpa += sizeof(vcpu_info_t); 597 } 598 599 return err; 600 } 601 602 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn) 603 { 604 switch (space) { 605 case XENMAPSPACE_shared_info: 606 if (idx > 0) { 607 return -EINVAL; 608 } 609 return xen_set_shared_info(gfn); 610 611 case XENMAPSPACE_grant_table: 612 return xen_gnttab_map_page(idx, gfn); 613 614 case XENMAPSPACE_gmfn: 615 case XENMAPSPACE_gmfn_range: 616 return -ENOTSUP; 617 618 case XENMAPSPACE_gmfn_foreign: 619 case XENMAPSPACE_dev_mmio: 620 return -EPERM; 621 622 default: 623 return -EINVAL; 624 } 625 } 626 627 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu, 628 uint64_t arg) 629 { 630 struct xen_add_to_physmap xatp; 631 CPUState *cs = CPU(cpu); 632 633 if (hypercall_compat32(exit->u.hcall.longmode)) { 634 struct compat_xen_add_to_physmap xatp32; 635 636 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16); 637 if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) { 638 return -EFAULT; 639 } 640 xatp.domid = xatp32.domid; 641 xatp.size = xatp32.size; 642 xatp.space = xatp32.space; 643 xatp.idx = xatp32.idx; 644 xatp.gpfn = xatp32.gpfn; 645 } else { 646 if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) { 647 return -EFAULT; 648 } 649 } 650 651 if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) { 652 return -ESRCH; 653 } 654 655 return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn); 656 } 657 658 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu, 659 uint64_t arg) 660 { 661 struct xen_add_to_physmap_batch xatpb; 662 unsigned long idxs_gva, gpfns_gva, errs_gva; 663 CPUState *cs = CPU(cpu); 664 size_t op_sz; 665 666 if (hypercall_compat32(exit->u.hcall.longmode)) { 667 struct compat_xen_add_to_physmap_batch xatpb32; 668 669 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20); 670 if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) { 671 return -EFAULT; 672 } 673 xatpb.domid = xatpb32.domid; 674 xatpb.space = xatpb32.space; 675 xatpb.size = xatpb32.size; 676 677 idxs_gva = xatpb32.idxs.c; 678 gpfns_gva = xatpb32.gpfns.c; 679 errs_gva = xatpb32.errs.c; 680 op_sz = sizeof(uint32_t); 681 } else { 682 if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) { 683 return -EFAULT; 684 } 685 op_sz = sizeof(unsigned long); 686 idxs_gva = (unsigned long)xatpb.idxs.p; 687 gpfns_gva = (unsigned long)xatpb.gpfns.p; 688 errs_gva = (unsigned long)xatpb.errs.p; 689 } 690 691 if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) { 692 return -ESRCH; 693 } 694 695 /* Explicitly invalid for the batch op. Not that we implement it anyway. 
*/ 696 if (xatpb.space == XENMAPSPACE_gmfn_range) { 697 return -EINVAL; 698 } 699 700 while (xatpb.size--) { 701 unsigned long idx = 0; 702 unsigned long gpfn = 0; 703 int err; 704 705 /* For 32-bit compat this only copies the low 32 bits of each */ 706 if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) || 707 kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) { 708 return -EFAULT; 709 } 710 idxs_gva += op_sz; 711 gpfns_gva += op_sz; 712 713 err = add_to_physmap_one(xatpb.space, idx, gpfn); 714 715 if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) { 716 return -EFAULT; 717 } 718 errs_gva += sizeof(err); 719 } 720 return 0; 721 } 722 723 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu, 724 int cmd, uint64_t arg) 725 { 726 int err; 727 728 switch (cmd) { 729 case XENMEM_add_to_physmap: 730 err = do_add_to_physmap(exit, cpu, arg); 731 break; 732 733 case XENMEM_add_to_physmap_batch: 734 err = do_add_to_physmap_batch(exit, cpu, arg); 735 break; 736 737 default: 738 return false; 739 } 740 741 exit->u.hcall.result = err; 742 return true; 743 } 744 745 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu, 746 uint64_t arg) 747 { 748 CPUState *cs = CPU(cpu); 749 struct xen_hvm_param hp; 750 int err = 0; 751 752 /* No need for 32/64 compat handling */ 753 qemu_build_assert(sizeof(hp) == 16); 754 755 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 756 err = -EFAULT; 757 goto out; 758 } 759 760 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 761 err = -ESRCH; 762 goto out; 763 } 764 765 switch (hp.index) { 766 case HVM_PARAM_CALLBACK_IRQ: 767 qemu_mutex_lock_iothread(); 768 err = xen_evtchn_set_callback_param(hp.value); 769 qemu_mutex_unlock_iothread(); 770 xen_set_long_mode(exit->u.hcall.longmode); 771 break; 772 default: 773 return false; 774 } 775 776 out: 777 exit->u.hcall.result = err; 778 return true; 779 } 780 781 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu, 782 uint64_t arg) 783 { 784 CPUState *cs = CPU(cpu); 785 struct xen_hvm_param hp; 786 int err = 0; 787 788 /* No need for 32/64 compat handling */ 789 qemu_build_assert(sizeof(hp) == 16); 790 791 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 792 err = -EFAULT; 793 goto out; 794 } 795 796 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 797 err = -ESRCH; 798 goto out; 799 } 800 801 switch (hp.index) { 802 case HVM_PARAM_STORE_PFN: 803 hp.value = XEN_SPECIAL_PFN(XENSTORE); 804 break; 805 case HVM_PARAM_STORE_EVTCHN: 806 hp.value = xen_xenstore_get_port(); 807 break; 808 default: 809 return false; 810 } 811 812 if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) { 813 err = -EFAULT; 814 } 815 out: 816 exit->u.hcall.result = err; 817 return true; 818 } 819 820 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit, 821 X86CPU *cpu, uint64_t arg) 822 { 823 struct xen_hvm_evtchn_upcall_vector up; 824 CPUState *target_cs; 825 826 /* No need for 32/64 compat handling */ 827 qemu_build_assert(sizeof(up) == 8); 828 829 if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) { 830 return -EFAULT; 831 } 832 833 if (up.vector < 0x10) { 834 return -EINVAL; 835 } 836 837 target_cs = qemu_get_cpu(up.vcpu); 838 if (!target_cs) { 839 return -EINVAL; 840 } 841 842 async_run_on_cpu(target_cs, do_set_vcpu_callback_vector, 843 RUN_ON_CPU_HOST_INT(up.vector)); 844 return 0; 845 } 846 847 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu, 848 int cmd, uint64_t arg) 849 { 850 int ret = -ENOSYS; 851 switch (cmd) { 852 case 
HVMOP_set_evtchn_upcall_vector: 853 ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, 854 exit->u.hcall.params[0]); 855 break; 856 857 case HVMOP_pagetable_dying: 858 ret = -ENOSYS; 859 break; 860 861 case HVMOP_set_param: 862 return handle_set_param(exit, cpu, arg); 863 864 case HVMOP_get_param: 865 return handle_get_param(exit, cpu, arg); 866 867 default: 868 return false; 869 } 870 871 exit->u.hcall.result = ret; 872 return true; 873 } 874 875 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target, 876 uint64_t arg) 877 { 878 struct vcpu_register_vcpu_info rvi; 879 uint64_t gpa; 880 881 /* No need for 32/64 compat handling */ 882 qemu_build_assert(sizeof(rvi) == 16); 883 qemu_build_assert(sizeof(struct vcpu_info) == 64); 884 885 if (!target) { 886 return -ENOENT; 887 } 888 889 if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) { 890 return -EFAULT; 891 } 892 893 if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) { 894 return -EINVAL; 895 } 896 897 gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset); 898 async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa)); 899 return 0; 900 } 901 902 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target, 903 uint64_t arg) 904 { 905 struct vcpu_register_time_memory_area tma; 906 uint64_t gpa; 907 size_t len; 908 909 /* No need for 32/64 compat handling */ 910 qemu_build_assert(sizeof(tma) == 8); 911 qemu_build_assert(sizeof(struct vcpu_time_info) == 32); 912 913 if (!target) { 914 return -ENOENT; 915 } 916 917 if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) { 918 return -EFAULT; 919 } 920 921 /* 922 * Xen actually uses the GVA and does the translation through the guest 923 * page tables each time. But Linux/KVM uses the GPA, on the assumption 924 * that guests only ever use *global* addresses (kernel virtual addresses) 925 * for it. If Linux is changed to redo the GVA→GPA translation each time, 926 * it will offer a new vCPU attribute for that, and we'll use it instead. 927 */ 928 if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) || 929 len < sizeof(struct vcpu_time_info)) { 930 return -EFAULT; 931 } 932 933 async_run_on_cpu(target, do_set_vcpu_time_info_gpa, 934 RUN_ON_CPU_HOST_ULONG(gpa)); 935 return 0; 936 } 937 938 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target, 939 uint64_t arg) 940 { 941 struct vcpu_register_runstate_memory_area rma; 942 uint64_t gpa; 943 size_t len; 944 945 /* No need for 32/64 compat handling */ 946 qemu_build_assert(sizeof(rma) == 8); 947 /* The runstate area actually does change size, but Linux copes. */ 948 949 if (!target) { 950 return -ENOENT; 951 } 952 953 if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) { 954 return -EFAULT; 955 } 956 957 /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. 
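     * As above, we translate the address once at registration time and rely
     * on the guest having used a global (kernel) virtual address whose
     * mapping isn't going to change underneath us.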
*/ 958 if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) { 959 return -EFAULT; 960 } 961 962 async_run_on_cpu(target, do_set_vcpu_runstate_gpa, 963 RUN_ON_CPU_HOST_ULONG(gpa)); 964 return 0; 965 } 966 967 static uint64_t kvm_get_current_ns(void) 968 { 969 struct kvm_clock_data data; 970 int ret; 971 972 ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); 973 if (ret < 0) { 974 fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret)); 975 abort(); 976 } 977 978 return data.clock; 979 } 980 981 static void xen_vcpu_singleshot_timer_event(void *opaque) 982 { 983 CPUState *cpu = opaque; 984 CPUX86State *env = &X86_CPU(cpu)->env; 985 uint16_t port = env->xen_virq[VIRQ_TIMER]; 986 987 if (likely(port)) { 988 xen_evtchn_set_port(port); 989 } 990 991 qemu_mutex_lock(&env->xen_timers_lock); 992 env->xen_singleshot_timer_ns = 0; 993 qemu_mutex_unlock(&env->xen_timers_lock); 994 } 995 996 static void xen_vcpu_periodic_timer_event(void *opaque) 997 { 998 CPUState *cpu = opaque; 999 CPUX86State *env = &X86_CPU(cpu)->env; 1000 uint16_t port = env->xen_virq[VIRQ_TIMER]; 1001 int64_t qemu_now; 1002 1003 if (likely(port)) { 1004 xen_evtchn_set_port(port); 1005 } 1006 1007 qemu_mutex_lock(&env->xen_timers_lock); 1008 1009 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1010 timer_mod_ns(env->xen_periodic_timer, 1011 qemu_now + env->xen_periodic_timer_period); 1012 1013 qemu_mutex_unlock(&env->xen_timers_lock); 1014 } 1015 1016 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns) 1017 { 1018 CPUX86State *tenv = &X86_CPU(target)->env; 1019 int64_t qemu_now; 1020 1021 timer_del(tenv->xen_periodic_timer); 1022 1023 qemu_mutex_lock(&tenv->xen_timers_lock); 1024 1025 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1026 timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns); 1027 tenv->xen_periodic_timer_period = period_ns; 1028 1029 qemu_mutex_unlock(&tenv->xen_timers_lock); 1030 return 0; 1031 } 1032 1033 #define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL)) 1034 #define MICROSECS(_us) ((int64_t)((_us) * 1000ULL)) 1035 #define STIME_MAX ((time_t)((int64_t)~0ull >> 1)) 1036 /* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */ 1037 #define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2)) 1038 1039 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target, 1040 uint64_t arg) 1041 { 1042 struct vcpu_set_periodic_timer spt; 1043 1044 qemu_build_assert(sizeof(spt) == 8); 1045 if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) { 1046 return -EFAULT; 1047 } 1048 1049 if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) { 1050 return -EINVAL; 1051 } 1052 1053 return do_set_periodic_timer(target, spt.period_ns); 1054 } 1055 1056 static int vcpuop_stop_periodic_timer(CPUState *target) 1057 { 1058 CPUX86State *tenv = &X86_CPU(target)->env; 1059 1060 qemu_mutex_lock(&tenv->xen_timers_lock); 1061 1062 timer_del(tenv->xen_periodic_timer); 1063 tenv->xen_periodic_timer_period = 0; 1064 1065 qemu_mutex_unlock(&tenv->xen_timers_lock); 1066 return 0; 1067 } 1068 1069 /* 1070 * Userspace handling of timer, for older kernels. 1071 * Must always be called with xen_timers_lock held. 
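 *
 * (Kernels which advertise KVM_XEN_HVM_CONFIG_EVTCHN_SEND accelerate the
 * singleshot timer themselves via KVM_XEN_VCPU_ATTR_TYPE_TIMER, so this
 * path is only the fallback for kernels without that; the periodic timer
 * is always emulated here in QEMU.)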
1072 */ 1073 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs, 1074 bool future, bool linux_wa) 1075 { 1076 CPUX86State *env = &X86_CPU(cs)->env; 1077 int64_t now = kvm_get_current_ns(); 1078 int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1079 int64_t delta = timeout_abs - now; 1080 1081 if (future && timeout_abs < now) { 1082 return -ETIME; 1083 } 1084 1085 if (linux_wa && unlikely((int64_t)timeout_abs < 0 || 1086 (delta > 0 && (uint32_t)(delta >> 50) != 0))) { 1087 /* 1088 * Xen has a 'Linux workaround' in do_set_timer_op() which checks 1089 * for negative absolute timeout values (caused by integer 1090 * overflow), and for values about 13 days in the future (2^50ns) 1091 * which would be caused by jiffies overflow. For those cases, it 1092 * sets the timeout 100ms in the future (not *too* soon, since if 1093 * a guest really did set a long timeout on purpose we don't want 1094 * to keep churning CPU time by waking it up). 1095 */ 1096 delta = (100 * SCALE_MS); 1097 timeout_abs = now + delta; 1098 } 1099 1100 timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta); 1101 env->xen_singleshot_timer_ns = now + delta; 1102 return 0; 1103 } 1104 1105 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg) 1106 { 1107 struct vcpu_set_singleshot_timer sst = { 0 }; 1108 1109 /* 1110 * The struct is a uint64_t followed by a uint32_t. On 32-bit that 1111 * makes it 12 bytes. On 64-bit it gets padded to 16. The parts 1112 * that get used are identical, and there's four bytes of padding 1113 * unused at the end. For true Xen compatibility we should attempt 1114 * to copy the full 16 bytes from 64-bit guests, and return -EFAULT 1115 * if we can't get the padding too. But that's daft. Just copy what 1116 * we need. 1117 */ 1118 qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8); 1119 qemu_build_assert(sizeof(sst) >= 12); 1120 1121 if (kvm_copy_from_gva(cs, arg, &sst, 12)) { 1122 return -EFAULT; 1123 } 1124 1125 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 1126 return do_set_singleshot_timer(cs, sst.timeout_abs_ns, 1127 !!(sst.flags & VCPU_SSHOTTMR_future), 1128 false); 1129 } 1130 1131 static int vcpuop_stop_singleshot_timer(CPUState *cs) 1132 { 1133 CPUX86State *env = &X86_CPU(cs)->env; 1134 1135 qemu_mutex_lock(&env->xen_timers_lock); 1136 1137 timer_del(env->xen_singleshot_timer); 1138 env->xen_singleshot_timer_ns = 0; 1139 1140 qemu_mutex_unlock(&env->xen_timers_lock); 1141 return 0; 1142 } 1143 1144 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1145 uint64_t timeout) 1146 { 1147 int err; 1148 1149 if (unlikely(timeout == 0)) { 1150 err = vcpuop_stop_singleshot_timer(CPU(cpu)); 1151 } else { 1152 QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock); 1153 err = do_set_singleshot_timer(CPU(cpu), timeout, false, true); 1154 } 1155 exit->u.hcall.result = err; 1156 return true; 1157 } 1158 1159 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1160 int cmd, int vcpu_id, uint64_t arg) 1161 { 1162 CPUState *cs = CPU(cpu); 1163 CPUState *dest = cs->cpu_index == vcpu_id ? 
cs : qemu_get_cpu(vcpu_id); 1164 int err; 1165 1166 if (!dest) { 1167 err = -ENOENT; 1168 goto out; 1169 } 1170 1171 switch (cmd) { 1172 case VCPUOP_register_runstate_memory_area: 1173 err = vcpuop_register_runstate_info(cs, dest, arg); 1174 break; 1175 case VCPUOP_register_vcpu_time_memory_area: 1176 err = vcpuop_register_vcpu_time_info(cs, dest, arg); 1177 break; 1178 case VCPUOP_register_vcpu_info: 1179 err = vcpuop_register_vcpu_info(cs, dest, arg); 1180 break; 1181 case VCPUOP_set_singleshot_timer: { 1182 if (cs->cpu_index == vcpu_id) { 1183 err = vcpuop_set_singleshot_timer(dest, arg); 1184 } else { 1185 err = -EINVAL; 1186 } 1187 break; 1188 } 1189 case VCPUOP_stop_singleshot_timer: 1190 if (cs->cpu_index == vcpu_id) { 1191 err = vcpuop_stop_singleshot_timer(dest); 1192 } else { 1193 err = -EINVAL; 1194 } 1195 break; 1196 case VCPUOP_set_periodic_timer: { 1197 err = vcpuop_set_periodic_timer(cs, dest, arg); 1198 break; 1199 } 1200 case VCPUOP_stop_periodic_timer: 1201 err = vcpuop_stop_periodic_timer(dest); 1202 break; 1203 1204 default: 1205 return false; 1206 } 1207 1208 out: 1209 exit->u.hcall.result = err; 1210 return true; 1211 } 1212 1213 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1214 int cmd, uint64_t arg) 1215 { 1216 CPUState *cs = CPU(cpu); 1217 int err = -ENOSYS; 1218 1219 switch (cmd) { 1220 case EVTCHNOP_init_control: 1221 case EVTCHNOP_expand_array: 1222 case EVTCHNOP_set_priority: 1223 /* We do not support FIFO channels at this point */ 1224 err = -ENOSYS; 1225 break; 1226 1227 case EVTCHNOP_status: { 1228 struct evtchn_status status; 1229 1230 qemu_build_assert(sizeof(status) == 24); 1231 if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) { 1232 err = -EFAULT; 1233 break; 1234 } 1235 1236 err = xen_evtchn_status_op(&status); 1237 if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) { 1238 err = -EFAULT; 1239 } 1240 break; 1241 } 1242 case EVTCHNOP_close: { 1243 struct evtchn_close close; 1244 1245 qemu_build_assert(sizeof(close) == 4); 1246 if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) { 1247 err = -EFAULT; 1248 break; 1249 } 1250 1251 err = xen_evtchn_close_op(&close); 1252 break; 1253 } 1254 case EVTCHNOP_unmask: { 1255 struct evtchn_unmask unmask; 1256 1257 qemu_build_assert(sizeof(unmask) == 4); 1258 if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) { 1259 err = -EFAULT; 1260 break; 1261 } 1262 1263 err = xen_evtchn_unmask_op(&unmask); 1264 break; 1265 } 1266 case EVTCHNOP_bind_virq: { 1267 struct evtchn_bind_virq virq; 1268 1269 qemu_build_assert(sizeof(virq) == 12); 1270 if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) { 1271 err = -EFAULT; 1272 break; 1273 } 1274 1275 err = xen_evtchn_bind_virq_op(&virq); 1276 if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) { 1277 err = -EFAULT; 1278 } 1279 break; 1280 } 1281 case EVTCHNOP_bind_pirq: { 1282 struct evtchn_bind_pirq pirq; 1283 1284 qemu_build_assert(sizeof(pirq) == 12); 1285 if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) { 1286 err = -EFAULT; 1287 break; 1288 } 1289 1290 err = xen_evtchn_bind_pirq_op(&pirq); 1291 if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) { 1292 err = -EFAULT; 1293 } 1294 break; 1295 } 1296 case EVTCHNOP_bind_ipi: { 1297 struct evtchn_bind_ipi ipi; 1298 1299 qemu_build_assert(sizeof(ipi) == 8); 1300 if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) { 1301 err = -EFAULT; 1302 break; 1303 } 1304 1305 err = xen_evtchn_bind_ipi_op(&ipi); 1306 if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) 
{
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
     * deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
1490 */ 1491 case SCHEDOP_yield: 1492 sched_yield(); 1493 err = 0; 1494 break; 1495 1496 default: 1497 return false; 1498 } 1499 1500 exit->u.hcall.result = err; 1501 return true; 1502 } 1503 1504 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1505 int cmd, uint64_t arg, int count) 1506 { 1507 CPUState *cs = CPU(cpu); 1508 int err; 1509 1510 switch (cmd) { 1511 case GNTTABOP_set_version: { 1512 struct gnttab_set_version set; 1513 1514 qemu_build_assert(sizeof(set) == 4); 1515 if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) { 1516 err = -EFAULT; 1517 break; 1518 } 1519 1520 err = xen_gnttab_set_version_op(&set); 1521 if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) { 1522 err = -EFAULT; 1523 } 1524 break; 1525 } 1526 case GNTTABOP_get_version: { 1527 struct gnttab_get_version get; 1528 1529 qemu_build_assert(sizeof(get) == 8); 1530 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) { 1531 err = -EFAULT; 1532 break; 1533 } 1534 1535 err = xen_gnttab_get_version_op(&get); 1536 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) { 1537 err = -EFAULT; 1538 } 1539 break; 1540 } 1541 case GNTTABOP_query_size: { 1542 struct gnttab_query_size size; 1543 1544 qemu_build_assert(sizeof(size) == 16); 1545 if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) { 1546 err = -EFAULT; 1547 break; 1548 } 1549 1550 err = xen_gnttab_query_size_op(&size); 1551 if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) { 1552 err = -EFAULT; 1553 } 1554 break; 1555 } 1556 case GNTTABOP_setup_table: 1557 case GNTTABOP_copy: 1558 case GNTTABOP_map_grant_ref: 1559 case GNTTABOP_unmap_grant_ref: 1560 case GNTTABOP_swap_grant_ref: 1561 return false; 1562 1563 default: 1564 /* Xen explicitly returns -ENOSYS to HVM guests for all others */ 1565 err = -ENOSYS; 1566 break; 1567 } 1568 1569 exit->u.hcall.result = err; 1570 return true; 1571 } 1572 1573 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1574 int cmd, uint64_t arg) 1575 { 1576 CPUState *cs = CPU(cpu); 1577 int err; 1578 1579 switch (cmd) { 1580 case PHYSDEVOP_map_pirq: { 1581 struct physdev_map_pirq map; 1582 1583 if (hypercall_compat32(exit->u.hcall.longmode)) { 1584 struct compat_physdev_map_pirq *map32 = (void *)↦ 1585 1586 if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) { 1587 return -EFAULT; 1588 } 1589 1590 /* 1591 * The only thing that's different is the alignment of the 1592 * uint64_t table_base at the end, which gets padding to make 1593 * it 64-bit aligned in the 64-bit version. 1594 */ 1595 qemu_build_assert(sizeof(*map32) == 36); 1596 qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) == 1597 offsetof(struct compat_physdev_map_pirq, entry_nr)); 1598 memmove(&map.table_base, &map32->table_base, sizeof(map.table_base)); 1599 } else { 1600 if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) { 1601 err = -EFAULT; 1602 break; 1603 } 1604 } 1605 err = xen_physdev_map_pirq(&map); 1606 /* 1607 * Since table_base is an IN parameter and won't be changed, just 1608 * copy the size of the compat structure back to the guest. 
1609 */ 1610 if (!err && kvm_copy_to_gva(cs, arg, &map, 1611 sizeof(struct compat_physdev_map_pirq))) { 1612 err = -EFAULT; 1613 } 1614 break; 1615 } 1616 case PHYSDEVOP_unmap_pirq: { 1617 struct physdev_unmap_pirq unmap; 1618 1619 qemu_build_assert(sizeof(unmap) == 8); 1620 if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) { 1621 err = -EFAULT; 1622 break; 1623 } 1624 1625 err = xen_physdev_unmap_pirq(&unmap); 1626 if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) { 1627 err = -EFAULT; 1628 } 1629 break; 1630 } 1631 case PHYSDEVOP_eoi: { 1632 struct physdev_eoi eoi; 1633 1634 qemu_build_assert(sizeof(eoi) == 4); 1635 if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) { 1636 err = -EFAULT; 1637 break; 1638 } 1639 1640 err = xen_physdev_eoi_pirq(&eoi); 1641 if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) { 1642 err = -EFAULT; 1643 } 1644 break; 1645 } 1646 case PHYSDEVOP_irq_status_query: { 1647 struct physdev_irq_status_query query; 1648 1649 qemu_build_assert(sizeof(query) == 8); 1650 if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) { 1651 err = -EFAULT; 1652 break; 1653 } 1654 1655 err = xen_physdev_query_pirq(&query); 1656 if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) { 1657 err = -EFAULT; 1658 } 1659 break; 1660 } 1661 case PHYSDEVOP_get_free_pirq: { 1662 struct physdev_get_free_pirq get; 1663 1664 qemu_build_assert(sizeof(get) == 8); 1665 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) { 1666 err = -EFAULT; 1667 break; 1668 } 1669 1670 err = xen_physdev_get_free_pirq(&get); 1671 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) { 1672 err = -EFAULT; 1673 } 1674 break; 1675 } 1676 case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */ 1677 err = -ENOSYS; 1678 break; 1679 1680 default: 1681 return false; 1682 } 1683 1684 exit->u.hcall.result = err; 1685 return true; 1686 } 1687 1688 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit) 1689 { 1690 uint16_t code = exit->u.hcall.input; 1691 1692 if (exit->u.hcall.cpl > 0) { 1693 exit->u.hcall.result = -EPERM; 1694 return true; 1695 } 1696 1697 switch (code) { 1698 case __HYPERVISOR_set_timer_op: 1699 if (exit->u.hcall.longmode) { 1700 return kvm_xen_hcall_set_timer_op(exit, cpu, 1701 exit->u.hcall.params[0]); 1702 } else { 1703 /* In 32-bit mode, the 64-bit timer value is in two args. 
*/ 1704 uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 | 1705 (uint32_t)exit->u.hcall.params[0]; 1706 return kvm_xen_hcall_set_timer_op(exit, cpu, val); 1707 } 1708 case __HYPERVISOR_grant_table_op: 1709 return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0], 1710 exit->u.hcall.params[1], 1711 exit->u.hcall.params[2]); 1712 case __HYPERVISOR_sched_op: 1713 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0], 1714 exit->u.hcall.params[1]); 1715 case __HYPERVISOR_event_channel_op: 1716 return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0], 1717 exit->u.hcall.params[1]); 1718 case __HYPERVISOR_vcpu_op: 1719 return kvm_xen_hcall_vcpu_op(exit, cpu, 1720 exit->u.hcall.params[0], 1721 exit->u.hcall.params[1], 1722 exit->u.hcall.params[2]); 1723 case __HYPERVISOR_hvm_op: 1724 return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0], 1725 exit->u.hcall.params[1]); 1726 case __HYPERVISOR_memory_op: 1727 return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0], 1728 exit->u.hcall.params[1]); 1729 case __HYPERVISOR_physdev_op: 1730 return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0], 1731 exit->u.hcall.params[1]); 1732 case __HYPERVISOR_xen_version: 1733 return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0], 1734 exit->u.hcall.params[1]); 1735 default: 1736 return false; 1737 } 1738 } 1739 1740 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit) 1741 { 1742 if (exit->type != KVM_EXIT_XEN_HCALL) { 1743 return -1; 1744 } 1745 1746 /* 1747 * The kernel latches the guest 32/64 mode when the MSR is used to fill 1748 * the hypercall page. So if we see a hypercall in a mode that doesn't 1749 * match our own idea of the guest mode, fetch the kernel's idea of the 1750 * "long mode" to remain in sync. 1751 */ 1752 if (exit->u.hcall.longmode != xen_is_long_mode()) { 1753 xen_sync_long_mode(); 1754 } 1755 1756 if (!do_kvm_xen_handle_exit(cpu, exit)) { 1757 /* 1758 * Some hypercalls will be deliberately "implemented" by returning 1759 * -ENOSYS. This case is for hypercalls which are unexpected. 
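         *
         * Logging these under LOG_UNIMP (rather than failing silently) makes
         * it easier to spot a guest depending on something we haven't wired
         * up, without cluttering the log for calls we knowingly reject.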
1760 */ 1761 exit->u.hcall.result = -ENOSYS; 1762 qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %" 1763 PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n", 1764 (uint64_t)exit->u.hcall.input, 1765 (uint64_t)exit->u.hcall.params[0], 1766 (uint64_t)exit->u.hcall.params[1], 1767 (uint64_t)exit->u.hcall.params[2]); 1768 } 1769 1770 trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl, 1771 exit->u.hcall.input, exit->u.hcall.params[0], 1772 exit->u.hcall.params[1], exit->u.hcall.params[2], 1773 exit->u.hcall.result); 1774 return 0; 1775 } 1776 1777 uint16_t kvm_xen_get_gnttab_max_frames(void) 1778 { 1779 KVMState *s = KVM_STATE(current_accel()); 1780 return s->xen_gnttab_max_frames; 1781 } 1782 1783 uint16_t kvm_xen_get_evtchn_max_pirq(void) 1784 { 1785 KVMState *s = KVM_STATE(current_accel()); 1786 return s->xen_evtchn_max_pirq; 1787 } 1788 1789 int kvm_put_xen_state(CPUState *cs) 1790 { 1791 X86CPU *cpu = X86_CPU(cs); 1792 CPUX86State *env = &cpu->env; 1793 uint64_t gpa; 1794 int ret; 1795 1796 gpa = env->xen_vcpu_info_gpa; 1797 if (gpa == INVALID_GPA) { 1798 gpa = env->xen_vcpu_info_default_gpa; 1799 } 1800 1801 if (gpa != INVALID_GPA) { 1802 ret = set_vcpu_info(cs, gpa); 1803 if (ret < 0) { 1804 return ret; 1805 } 1806 } 1807 1808 gpa = env->xen_vcpu_time_info_gpa; 1809 if (gpa != INVALID_GPA) { 1810 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 1811 gpa); 1812 if (ret < 0) { 1813 return ret; 1814 } 1815 } 1816 1817 gpa = env->xen_vcpu_runstate_gpa; 1818 if (gpa != INVALID_GPA) { 1819 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 1820 gpa); 1821 if (ret < 0) { 1822 return ret; 1823 } 1824 } 1825 1826 if (env->xen_periodic_timer_period) { 1827 ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period); 1828 if (ret < 0) { 1829 return ret; 1830 } 1831 } 1832 1833 if (!kvm_xen_has_cap(EVTCHN_SEND)) { 1834 /* 1835 * If the kernel has EVTCHN_SEND support then it handles timers too, 1836 * so the timer will be restored by kvm_xen_set_vcpu_timer() below. 1837 */ 1838 QEMU_LOCK_GUARD(&env->xen_timers_lock); 1839 if (env->xen_singleshot_timer_ns) { 1840 ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns, 1841 false, false); 1842 if (ret < 0) { 1843 return ret; 1844 } 1845 } 1846 return 0; 1847 } 1848 1849 if (env->xen_vcpu_callback_vector) { 1850 ret = kvm_xen_set_vcpu_callback_vector(cs); 1851 if (ret < 0) { 1852 return ret; 1853 } 1854 } 1855 1856 if (env->xen_virq[VIRQ_TIMER]) { 1857 do_set_vcpu_timer_virq(cs, 1858 RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER])); 1859 } 1860 return 0; 1861 } 1862 1863 int kvm_get_xen_state(CPUState *cs) 1864 { 1865 X86CPU *cpu = X86_CPU(cs); 1866 CPUX86State *env = &cpu->env; 1867 uint64_t gpa; 1868 int ret; 1869 1870 /* 1871 * The kernel does not mark vcpu_info as dirty when it delivers interrupts 1872 * to it. It's up to userspace to *assume* that any page shared thus is 1873 * always considered dirty. The shared_info page is different since it's 1874 * an overlay and migrated separately anyway. 
1875 */ 1876 gpa = env->xen_vcpu_info_gpa; 1877 if (gpa == INVALID_GPA) { 1878 gpa = env->xen_vcpu_info_default_gpa; 1879 } 1880 if (gpa != INVALID_GPA) { 1881 MemoryRegionSection mrs = memory_region_find(get_system_memory(), 1882 gpa, 1883 sizeof(struct vcpu_info)); 1884 if (mrs.mr && 1885 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) { 1886 memory_region_set_dirty(mrs.mr, mrs.offset_within_region, 1887 sizeof(struct vcpu_info)); 1888 } 1889 } 1890 1891 if (!kvm_xen_has_cap(EVTCHN_SEND)) { 1892 return 0; 1893 } 1894 1895 /* 1896 * If the kernel is accelerating timers, read out the current value of the 1897 * singleshot timer deadline. 1898 */ 1899 if (env->xen_virq[VIRQ_TIMER]) { 1900 struct kvm_xen_vcpu_attr va = { 1901 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, 1902 }; 1903 ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va); 1904 if (ret < 0) { 1905 return ret; 1906 } 1907 1908 /* 1909 * This locking is fairly pointless, and is here to appease Coverity. 1910 * There is an unavoidable race condition if a different vCPU sets a 1911 * timer for this vCPU after the value has been read out. But that's 1912 * OK in practice because *all* the vCPUs need to be stopped before 1913 * we set about migrating their state. 1914 */ 1915 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 1916 env->xen_singleshot_timer_ns = va.u.timer.expires_ns; 1917 } 1918 1919 return 0; 1920 } 1921