1 /* 2 * Xen HVM emulation support in KVM 3 * 4 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. 5 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 6 * 7 * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 * See the COPYING file in the top-level directory. 9 * 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/log.h" 14 #include "qemu/main-loop.h" 15 #include "qemu/error-report.h" 16 #include "hw/xen/xen.h" 17 #include "sysemu/kvm_int.h" 18 #include "sysemu/kvm_xen.h" 19 #include "kvm/kvm_i386.h" 20 #include "exec/address-spaces.h" 21 #include "xen-emu.h" 22 #include "trace.h" 23 #include "sysemu/runstate.h" 24 25 #include "hw/pci/msi.h" 26 #include "hw/i386/apic-msidef.h" 27 #include "hw/i386/e820_memory_layout.h" 28 #include "hw/i386/kvm/xen_overlay.h" 29 #include "hw/i386/kvm/xen_evtchn.h" 30 #include "hw/i386/kvm/xen_gnttab.h" 31 #include "hw/i386/kvm/xen_primary_console.h" 32 #include "hw/i386/kvm/xen_xenstore.h" 33 34 #include "hw/xen/interface/version.h" 35 #include "hw/xen/interface/sched.h" 36 #include "hw/xen/interface/memory.h" 37 #include "hw/xen/interface/hvm/hvm_op.h" 38 #include "hw/xen/interface/hvm/params.h" 39 #include "hw/xen/interface/vcpu.h" 40 #include "hw/xen/interface/event_channel.h" 41 #include "hw/xen/interface/grant_table.h" 42 43 #include "xen-compat.h" 44 45 static void xen_vcpu_singleshot_timer_event(void *opaque); 46 static void xen_vcpu_periodic_timer_event(void *opaque); 47 static int vcpuop_stop_singleshot_timer(CPUState *cs); 48 49 #ifdef TARGET_X86_64 50 #define hypercall_compat32(longmode) (!(longmode)) 51 #else 52 #define hypercall_compat32(longmode) (false) 53 #endif 54 55 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa, 56 size_t *len, bool is_write) 57 { 58 struct kvm_translation tr = { 59 .linear_address = gva, 60 }; 61 62 if (len) { 63 *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK); 64 } 65 66 if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid || 67 (is_write && !tr.writeable)) { 68 return false; 69 } 70 *gpa = tr.physical_address; 71 return true; 72 } 73 74 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz, 75 bool is_write) 76 { 77 uint8_t *buf = (uint8_t *)_buf; 78 uint64_t gpa; 79 size_t len; 80 81 while (sz) { 82 if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) { 83 return -EFAULT; 84 } 85 if (len > sz) { 86 len = sz; 87 } 88 89 cpu_physical_memory_rw(gpa, buf, len, is_write); 90 91 buf += len; 92 sz -= len; 93 gva += len; 94 } 95 96 return 0; 97 } 98 99 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf, 100 size_t sz) 101 { 102 return kvm_gva_rw(cs, gva, buf, sz, false); 103 } 104 105 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf, 106 size_t sz) 107 { 108 return kvm_gva_rw(cs, gva, buf, sz, true); 109 } 110 111 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr) 112 { 113 const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | 114 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO; 115 struct kvm_xen_hvm_config cfg = { 116 .msr = hypercall_msr, 117 .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, 118 }; 119 int xen_caps, ret; 120 121 xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM); 122 if (required_caps & ~xen_caps) { 123 error_report("kvm: Xen HVM guest support not present or insufficient"); 124 return -ENOSYS; 125 } 126 127 if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) { 128 struct kvm_xen_hvm_attr ha = { 129 .type = 
KVM_XEN_ATTR_TYPE_XEN_VERSION, 130 .u.xen_version = s->xen_version, 131 }; 132 (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha); 133 134 cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND; 135 } 136 137 ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg); 138 if (ret < 0) { 139 error_report("kvm: Failed to enable Xen HVM support: %s", 140 strerror(-ret)); 141 return ret; 142 } 143 144 /* If called a second time, don't repeat the rest of the setup. */ 145 if (s->xen_caps) { 146 return 0; 147 } 148 149 /* 150 * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info 151 * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared. 152 * 153 * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows 154 * such things to be polled at precisely the right time. We *could* do 155 * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at 156 * the moment the IRQ is acked, and see if it should be reasserted. 157 * 158 * But the in-kernel irqchip is deprecated, so we're unlikely to add 159 * that support in the kernel. Insist on using the split irqchip mode 160 * instead. 161 * 162 * This leaves us polling for the level going low in QEMU, which lacks 163 * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a 164 * spurious 'ack' to an INTX IRQ every time there's any MMIO access to 165 * the device (for which it has to unmap the device and trap access, for 166 * some period after an IRQ!!). In the Xen case, we do it on exit from 167 * KVM_RUN, if the flag is set to say that the GSI is currently asserted. 168 * Which is kind of icky, but less so than the VFIO one. I may fix them 169 * both later... 170 */ 171 if (!kvm_kernel_irqchip_split()) { 172 error_report("kvm: Xen support requires kernel-irqchip=split"); 173 return -EINVAL; 174 } 175 176 s->xen_caps = xen_caps; 177 178 /* Tell fw_cfg to notify the BIOS to reserve the range. */ 179 e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE, E820_RESERVED); 180 181 /* The pages couldn't be overlaid until KVM was initialized */ 182 xen_primary_console_reset(); 183 xen_xenstore_reset(); 184 185 return 0; 186 } 187 188 int kvm_xen_init_vcpu(CPUState *cs) 189 { 190 X86CPU *cpu = X86_CPU(cs); 191 CPUX86State *env = &cpu->env; 192 int err; 193 194 /* 195 * The kernel needs to know the Xen/ACPI vCPU ID because that's 196 * what the guest uses in hypercalls such as timers. It doesn't 197 * match the APIC ID which is generally used for talking to the 198 * kernel about vCPUs. And if vCPU threads race with creating 199 * their KVM vCPUs out of order, it doesn't necessarily match 200 * with the kernel's internal vCPU indices either. 
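     *
     * So use cs->cpu_index as the canonical Xen vCPU ID here; it is the
     * same value used to look vCPUs up elsewhere in this file, e.g. via
     * qemu_get_cpu(vcpu_id) in kvm_xen_get_vcpu_info_hva() and
     * kvm_xen_set_vcpu_virq().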
201 */ 202 if (kvm_xen_has_cap(EVTCHN_SEND)) { 203 struct kvm_xen_vcpu_attr va = { 204 .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID, 205 .u.vcpu_id = cs->cpu_index, 206 }; 207 err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va); 208 if (err) { 209 error_report("kvm: Failed to set Xen vCPU ID attribute: %s", 210 strerror(-err)); 211 return err; 212 } 213 } 214 215 env->xen_vcpu_info_gpa = INVALID_GPA; 216 env->xen_vcpu_info_default_gpa = INVALID_GPA; 217 env->xen_vcpu_time_info_gpa = INVALID_GPA; 218 env->xen_vcpu_runstate_gpa = INVALID_GPA; 219 220 qemu_mutex_init(&env->xen_timers_lock); 221 env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 222 xen_vcpu_singleshot_timer_event, 223 cpu); 224 if (!env->xen_singleshot_timer) { 225 return -ENOMEM; 226 } 227 env->xen_singleshot_timer->opaque = cs; 228 229 env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 230 xen_vcpu_periodic_timer_event, 231 cpu); 232 if (!env->xen_periodic_timer) { 233 return -ENOMEM; 234 } 235 env->xen_periodic_timer->opaque = cs; 236 237 return 0; 238 } 239 240 uint32_t kvm_xen_get_caps(void) 241 { 242 return kvm_state->xen_caps; 243 } 244 245 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu, 246 int cmd, uint64_t arg) 247 { 248 int err = 0; 249 250 switch (cmd) { 251 case XENVER_get_features: { 252 struct xen_feature_info fi; 253 254 /* No need for 32/64 compat handling */ 255 qemu_build_assert(sizeof(fi) == 8); 256 257 err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi)); 258 if (err) { 259 break; 260 } 261 262 fi.submap = 0; 263 if (fi.submap_idx == 0) { 264 fi.submap |= 1 << XENFEAT_writable_page_tables | 265 1 << XENFEAT_writable_descriptor_tables | 266 1 << XENFEAT_auto_translated_physmap | 267 1 << XENFEAT_hvm_callback_vector | 268 1 << XENFEAT_hvm_safe_pvclock | 269 1 << XENFEAT_hvm_pirqs; 270 } 271 272 err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi)); 273 break; 274 } 275 276 default: 277 return false; 278 } 279 280 exit->u.hcall.result = err; 281 return true; 282 } 283 284 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa) 285 { 286 struct kvm_xen_vcpu_attr xhsi; 287 288 xhsi.type = type; 289 xhsi.u.gpa = gpa; 290 291 trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa); 292 293 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi); 294 } 295 296 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs) 297 { 298 uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector; 299 struct kvm_xen_vcpu_attr xva; 300 301 xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR; 302 xva.u.vector = vector; 303 304 trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector); 305 306 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva); 307 } 308 309 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data) 310 { 311 X86CPU *cpu = X86_CPU(cs); 312 CPUX86State *env = &cpu->env; 313 314 env->xen_vcpu_callback_vector = data.host_int; 315 316 if (kvm_xen_has_cap(EVTCHN_SEND)) { 317 kvm_xen_set_vcpu_callback_vector(cs); 318 } 319 } 320 321 static int set_vcpu_info(CPUState *cs, uint64_t gpa) 322 { 323 X86CPU *cpu = X86_CPU(cs); 324 CPUX86State *env = &cpu->env; 325 MemoryRegionSection mrs = { .mr = NULL }; 326 void *vcpu_info_hva = NULL; 327 int ret; 328 329 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa); 330 if (ret || gpa == INVALID_GPA) { 331 goto out; 332 } 333 334 mrs = memory_region_find(get_system_memory(), gpa, 335 sizeof(struct vcpu_info)); 336 if (mrs.mr && mrs.mr->ram_block && 337 !int128_lt(mrs.size, 
int128_make64(sizeof(struct vcpu_info)))) { 338 vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block, 339 mrs.offset_within_region); 340 } 341 if (!vcpu_info_hva) { 342 if (mrs.mr) { 343 memory_region_unref(mrs.mr); 344 mrs.mr = NULL; 345 } 346 ret = -EINVAL; 347 } 348 349 out: 350 if (env->xen_vcpu_info_mr) { 351 memory_region_unref(env->xen_vcpu_info_mr); 352 } 353 env->xen_vcpu_info_hva = vcpu_info_hva; 354 env->xen_vcpu_info_mr = mrs.mr; 355 return ret; 356 } 357 358 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data) 359 { 360 X86CPU *cpu = X86_CPU(cs); 361 CPUX86State *env = &cpu->env; 362 363 env->xen_vcpu_info_default_gpa = data.host_ulong; 364 365 /* Changing the default does nothing if a vcpu_info was explicitly set. */ 366 if (env->xen_vcpu_info_gpa == INVALID_GPA) { 367 set_vcpu_info(cs, env->xen_vcpu_info_default_gpa); 368 } 369 } 370 371 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data) 372 { 373 X86CPU *cpu = X86_CPU(cs); 374 CPUX86State *env = &cpu->env; 375 376 env->xen_vcpu_info_gpa = data.host_ulong; 377 378 set_vcpu_info(cs, env->xen_vcpu_info_gpa); 379 } 380 381 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id) 382 { 383 CPUState *cs = qemu_get_cpu(vcpu_id); 384 if (!cs) { 385 return NULL; 386 } 387 388 return X86_CPU(cs)->env.xen_vcpu_info_hva; 389 } 390 391 void kvm_xen_maybe_deassert_callback(CPUState *cs) 392 { 393 CPUX86State *env = &X86_CPU(cs)->env; 394 struct vcpu_info *vi = env->xen_vcpu_info_hva; 395 if (!vi) { 396 return; 397 } 398 399 /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */ 400 if (!vi->evtchn_upcall_pending) { 401 bql_lock(); 402 /* 403 * Check again now we have the lock, because it may have been 404 * asserted in the interim. And we don't want to take the lock 405 * every time because this is a fast path. 406 */ 407 if (!vi->evtchn_upcall_pending) { 408 X86_CPU(cs)->env.xen_callback_asserted = false; 409 xen_evtchn_set_callback_level(0); 410 } 411 bql_unlock(); 412 } 413 } 414 415 void kvm_xen_set_callback_asserted(void) 416 { 417 CPUState *cs = qemu_get_cpu(0); 418 419 if (cs) { 420 X86_CPU(cs)->env.xen_callback_asserted = true; 421 } 422 } 423 424 bool kvm_xen_has_vcpu_callback_vector(void) 425 { 426 CPUState *cs = qemu_get_cpu(0); 427 428 return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector; 429 } 430 431 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type) 432 { 433 CPUState *cs = qemu_get_cpu(vcpu_id); 434 uint8_t vector; 435 436 if (!cs) { 437 return; 438 } 439 440 vector = X86_CPU(cs)->env.xen_vcpu_callback_vector; 441 if (vector) { 442 /* 443 * The per-vCPU callback vector injected via lapic. Just 444 * deliver it as an MSI. 445 */ 446 MSIMessage msg = { 447 .address = APIC_DEFAULT_ADDRESS | 448 (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT), 449 .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT), 450 }; 451 kvm_irqchip_send_msi(kvm_state, msg); 452 return; 453 } 454 455 switch (type) { 456 case HVM_PARAM_CALLBACK_TYPE_VECTOR: 457 /* 458 * If the evtchn_upcall_pending field in the vcpu_info is set, then 459 * KVM will automatically deliver the vector on entering the vCPU 460 * so all we have to do is kick it out. 
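         * qemu_cpu_kick() below does exactly that: it forces the vCPU to
         * drop out of KVM_RUN and re-enter, at which point KVM sees the
         * pending upcall and injects the vector itself.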
461 */ 462 qemu_cpu_kick(cs); 463 break; 464 465 case HVM_PARAM_CALLBACK_TYPE_GSI: 466 case HVM_PARAM_CALLBACK_TYPE_PCI_INTX: 467 if (vcpu_id == 0) { 468 xen_evtchn_set_callback_level(1); 469 } 470 break; 471 } 472 } 473 474 /* Must always be called with xen_timers_lock held */ 475 static int kvm_xen_set_vcpu_timer(CPUState *cs) 476 { 477 X86CPU *cpu = X86_CPU(cs); 478 CPUX86State *env = &cpu->env; 479 480 struct kvm_xen_vcpu_attr va = { 481 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, 482 .u.timer.port = env->xen_virq[VIRQ_TIMER], 483 .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL, 484 .u.timer.expires_ns = env->xen_singleshot_timer_ns, 485 }; 486 487 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va); 488 } 489 490 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data) 491 { 492 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 493 kvm_xen_set_vcpu_timer(cs); 494 } 495 496 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port) 497 { 498 CPUState *cs = qemu_get_cpu(vcpu_id); 499 500 if (!cs) { 501 return -ENOENT; 502 } 503 504 /* cpu.h doesn't include the actual Xen header. */ 505 qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS); 506 507 if (virq >= NR_VIRQS) { 508 return -EINVAL; 509 } 510 511 if (port && X86_CPU(cs)->env.xen_virq[virq]) { 512 return -EEXIST; 513 } 514 515 X86_CPU(cs)->env.xen_virq[virq] = port; 516 if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) { 517 async_run_on_cpu(cs, do_set_vcpu_timer_virq, 518 RUN_ON_CPU_HOST_INT(port)); 519 } 520 return 0; 521 } 522 523 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data) 524 { 525 X86CPU *cpu = X86_CPU(cs); 526 CPUX86State *env = &cpu->env; 527 528 env->xen_vcpu_time_info_gpa = data.host_ulong; 529 530 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 531 env->xen_vcpu_time_info_gpa); 532 } 533 534 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data) 535 { 536 X86CPU *cpu = X86_CPU(cs); 537 CPUX86State *env = &cpu->env; 538 539 env->xen_vcpu_runstate_gpa = data.host_ulong; 540 541 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 542 env->xen_vcpu_runstate_gpa); 543 } 544 545 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data) 546 { 547 X86CPU *cpu = X86_CPU(cs); 548 CPUX86State *env = &cpu->env; 549 550 env->xen_vcpu_info_gpa = INVALID_GPA; 551 env->xen_vcpu_info_default_gpa = INVALID_GPA; 552 env->xen_vcpu_time_info_gpa = INVALID_GPA; 553 env->xen_vcpu_runstate_gpa = INVALID_GPA; 554 env->xen_vcpu_callback_vector = 0; 555 memset(env->xen_virq, 0, sizeof(env->xen_virq)); 556 557 set_vcpu_info(cs, INVALID_GPA); 558 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 559 INVALID_GPA); 560 kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 561 INVALID_GPA); 562 if (kvm_xen_has_cap(EVTCHN_SEND)) { 563 kvm_xen_set_vcpu_callback_vector(cs); 564 565 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 566 env->xen_singleshot_timer_ns = 0; 567 kvm_xen_set_vcpu_timer(cs); 568 } else { 569 vcpuop_stop_singleshot_timer(cs); 570 }; 571 572 } 573 574 static int xen_set_shared_info(uint64_t gfn) 575 { 576 uint64_t gpa = gfn << TARGET_PAGE_BITS; 577 int i, err; 578 579 BQL_LOCK_GUARD(); 580 581 /* 582 * The xen_overlay device tells KVM about it too, since it had to 583 * do that on migration load anyway (unless we're going to jump 584 * through lots of hoops to maintain the fiction that this isn't 585 * KVM-specific. 
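     *
     * Each of the first XEN_LEGACY_MAX_VCPUS vCPUs is then given a
     * default vcpu_info within the shared_info page (one vcpu_info_t
     * apart), which is used unless/until the guest registers an
     * explicit vcpu_info with VCPUOP_register_vcpu_info.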
586 */ 587 err = xen_overlay_map_shinfo_page(gpa); 588 if (err) { 589 return err; 590 } 591 592 trace_kvm_xen_set_shared_info(gfn); 593 594 for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) { 595 CPUState *cpu = qemu_get_cpu(i); 596 if (cpu) { 597 async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa, 598 RUN_ON_CPU_HOST_ULONG(gpa)); 599 } 600 gpa += sizeof(vcpu_info_t); 601 } 602 603 return err; 604 } 605 606 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn) 607 { 608 switch (space) { 609 case XENMAPSPACE_shared_info: 610 if (idx > 0) { 611 return -EINVAL; 612 } 613 return xen_set_shared_info(gfn); 614 615 case XENMAPSPACE_grant_table: 616 return xen_gnttab_map_page(idx, gfn); 617 618 case XENMAPSPACE_gmfn: 619 case XENMAPSPACE_gmfn_range: 620 return -ENOTSUP; 621 622 case XENMAPSPACE_gmfn_foreign: 623 case XENMAPSPACE_dev_mmio: 624 return -EPERM; 625 626 default: 627 return -EINVAL; 628 } 629 } 630 631 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu, 632 uint64_t arg) 633 { 634 struct xen_add_to_physmap xatp; 635 CPUState *cs = CPU(cpu); 636 637 if (hypercall_compat32(exit->u.hcall.longmode)) { 638 struct compat_xen_add_to_physmap xatp32; 639 640 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16); 641 if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) { 642 return -EFAULT; 643 } 644 xatp.domid = xatp32.domid; 645 xatp.size = xatp32.size; 646 xatp.space = xatp32.space; 647 xatp.idx = xatp32.idx; 648 xatp.gpfn = xatp32.gpfn; 649 } else { 650 if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) { 651 return -EFAULT; 652 } 653 } 654 655 if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) { 656 return -ESRCH; 657 } 658 659 return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn); 660 } 661 662 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu, 663 uint64_t arg) 664 { 665 struct xen_add_to_physmap_batch xatpb; 666 unsigned long idxs_gva, gpfns_gva, errs_gva; 667 CPUState *cs = CPU(cpu); 668 size_t op_sz; 669 670 if (hypercall_compat32(exit->u.hcall.longmode)) { 671 struct compat_xen_add_to_physmap_batch xatpb32; 672 673 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20); 674 if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) { 675 return -EFAULT; 676 } 677 xatpb.domid = xatpb32.domid; 678 xatpb.space = xatpb32.space; 679 xatpb.size = xatpb32.size; 680 681 idxs_gva = xatpb32.idxs.c; 682 gpfns_gva = xatpb32.gpfns.c; 683 errs_gva = xatpb32.errs.c; 684 op_sz = sizeof(uint32_t); 685 } else { 686 if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) { 687 return -EFAULT; 688 } 689 op_sz = sizeof(unsigned long); 690 idxs_gva = (unsigned long)xatpb.idxs.p; 691 gpfns_gva = (unsigned long)xatpb.gpfns.p; 692 errs_gva = (unsigned long)xatpb.errs.p; 693 } 694 695 if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) { 696 return -ESRCH; 697 } 698 699 /* Explicitly invalid for the batch op. Not that we implement it anyway. 
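     * Each entry is processed individually by add_to_physmap_one() below
     * and its result is written back to the guest's errs array.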
*/ 700 if (xatpb.space == XENMAPSPACE_gmfn_range) { 701 return -EINVAL; 702 } 703 704 while (xatpb.size--) { 705 unsigned long idx = 0; 706 unsigned long gpfn = 0; 707 int err; 708 709 /* For 32-bit compat this only copies the low 32 bits of each */ 710 if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) || 711 kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) { 712 return -EFAULT; 713 } 714 idxs_gva += op_sz; 715 gpfns_gva += op_sz; 716 717 err = add_to_physmap_one(xatpb.space, idx, gpfn); 718 719 if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) { 720 return -EFAULT; 721 } 722 errs_gva += sizeof(err); 723 } 724 return 0; 725 } 726 727 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu, 728 int cmd, uint64_t arg) 729 { 730 int err; 731 732 switch (cmd) { 733 case XENMEM_add_to_physmap: 734 err = do_add_to_physmap(exit, cpu, arg); 735 break; 736 737 case XENMEM_add_to_physmap_batch: 738 err = do_add_to_physmap_batch(exit, cpu, arg); 739 break; 740 741 default: 742 return false; 743 } 744 745 exit->u.hcall.result = err; 746 return true; 747 } 748 749 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu, 750 uint64_t arg) 751 { 752 CPUState *cs = CPU(cpu); 753 struct xen_hvm_param hp; 754 int err = 0; 755 756 /* No need for 32/64 compat handling */ 757 qemu_build_assert(sizeof(hp) == 16); 758 759 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 760 err = -EFAULT; 761 goto out; 762 } 763 764 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 765 err = -ESRCH; 766 goto out; 767 } 768 769 switch (hp.index) { 770 case HVM_PARAM_CALLBACK_IRQ: 771 bql_lock(); 772 err = xen_evtchn_set_callback_param(hp.value); 773 bql_unlock(); 774 xen_set_long_mode(exit->u.hcall.longmode); 775 break; 776 default: 777 return false; 778 } 779 780 out: 781 exit->u.hcall.result = err; 782 return true; 783 } 784 785 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu, 786 uint64_t arg) 787 { 788 CPUState *cs = CPU(cpu); 789 struct xen_hvm_param hp; 790 int err = 0; 791 792 /* No need for 32/64 compat handling */ 793 qemu_build_assert(sizeof(hp) == 16); 794 795 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 796 err = -EFAULT; 797 goto out; 798 } 799 800 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 801 err = -ESRCH; 802 goto out; 803 } 804 805 switch (hp.index) { 806 case HVM_PARAM_STORE_PFN: 807 hp.value = XEN_SPECIAL_PFN(XENSTORE); 808 break; 809 case HVM_PARAM_STORE_EVTCHN: 810 hp.value = xen_xenstore_get_port(); 811 break; 812 case HVM_PARAM_CONSOLE_PFN: 813 hp.value = xen_primary_console_get_pfn(); 814 if (!hp.value) { 815 err = -EINVAL; 816 } 817 break; 818 case HVM_PARAM_CONSOLE_EVTCHN: 819 hp.value = xen_primary_console_get_port(); 820 if (!hp.value) { 821 err = -EINVAL; 822 } 823 break; 824 default: 825 return false; 826 } 827 828 if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) { 829 err = -EFAULT; 830 } 831 out: 832 exit->u.hcall.result = err; 833 return true; 834 } 835 836 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit, 837 X86CPU *cpu, uint64_t arg) 838 { 839 struct xen_hvm_evtchn_upcall_vector up; 840 CPUState *target_cs; 841 842 /* No need for 32/64 compat handling */ 843 qemu_build_assert(sizeof(up) == 8); 844 845 if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) { 846 return -EFAULT; 847 } 848 849 if (up.vector < 0x10) { 850 return -EINVAL; 851 } 852 853 target_cs = qemu_get_cpu(up.vcpu); 854 if (!target_cs) { 855 return -EINVAL; 856 } 857 858 async_run_on_cpu(target_cs, 
do_set_vcpu_callback_vector, 859 RUN_ON_CPU_HOST_INT(up.vector)); 860 return 0; 861 } 862 863 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu, 864 int cmd, uint64_t arg) 865 { 866 int ret = -ENOSYS; 867 switch (cmd) { 868 case HVMOP_set_evtchn_upcall_vector: 869 ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg); 870 break; 871 872 case HVMOP_pagetable_dying: 873 ret = -ENOSYS; 874 break; 875 876 case HVMOP_set_param: 877 return handle_set_param(exit, cpu, arg); 878 879 case HVMOP_get_param: 880 return handle_get_param(exit, cpu, arg); 881 882 default: 883 return false; 884 } 885 886 exit->u.hcall.result = ret; 887 return true; 888 } 889 890 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target, 891 uint64_t arg) 892 { 893 struct vcpu_register_vcpu_info rvi; 894 uint64_t gpa; 895 896 /* No need for 32/64 compat handling */ 897 qemu_build_assert(sizeof(rvi) == 16); 898 qemu_build_assert(sizeof(struct vcpu_info) == 64); 899 900 if (!target) { 901 return -ENOENT; 902 } 903 904 if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) { 905 return -EFAULT; 906 } 907 908 if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) { 909 return -EINVAL; 910 } 911 912 gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset); 913 async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa)); 914 return 0; 915 } 916 917 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target, 918 uint64_t arg) 919 { 920 struct vcpu_register_time_memory_area tma; 921 uint64_t gpa; 922 size_t len; 923 924 /* No need for 32/64 compat handling */ 925 qemu_build_assert(sizeof(tma) == 8); 926 qemu_build_assert(sizeof(struct vcpu_time_info) == 32); 927 928 if (!target) { 929 return -ENOENT; 930 } 931 932 if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) { 933 return -EFAULT; 934 } 935 936 /* 937 * Xen actually uses the GVA and does the translation through the guest 938 * page tables each time. But Linux/KVM uses the GPA, on the assumption 939 * that guests only ever use *global* addresses (kernel virtual addresses) 940 * for it. If Linux is changed to redo the GVA→GPA translation each time, 941 * it will offer a new vCPU attribute for that, and we'll use it instead. 942 */ 943 if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) || 944 len < sizeof(struct vcpu_time_info)) { 945 return -EFAULT; 946 } 947 948 async_run_on_cpu(target, do_set_vcpu_time_info_gpa, 949 RUN_ON_CPU_HOST_ULONG(gpa)); 950 return 0; 951 } 952 953 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target, 954 uint64_t arg) 955 { 956 struct vcpu_register_runstate_memory_area rma; 957 uint64_t gpa; 958 size_t len; 959 960 /* No need for 32/64 compat handling */ 961 qemu_build_assert(sizeof(rma) == 8); 962 /* The runstate area actually does change size, but Linux copes. */ 963 964 if (!target) { 965 return -ENOENT; 966 } 967 968 if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) { 969 return -EFAULT; 970 } 971 972 /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. 
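     * Translate it to a GPA once here, on the same assumption as above:
     * that the guest registered a global (kernel) virtual address.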
*/ 973 if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) { 974 return -EFAULT; 975 } 976 977 async_run_on_cpu(target, do_set_vcpu_runstate_gpa, 978 RUN_ON_CPU_HOST_ULONG(gpa)); 979 return 0; 980 } 981 982 static uint64_t kvm_get_current_ns(void) 983 { 984 struct kvm_clock_data data; 985 int ret; 986 987 ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); 988 if (ret < 0) { 989 fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret)); 990 abort(); 991 } 992 993 return data.clock; 994 } 995 996 static void xen_vcpu_singleshot_timer_event(void *opaque) 997 { 998 CPUState *cpu = opaque; 999 CPUX86State *env = &X86_CPU(cpu)->env; 1000 uint16_t port = env->xen_virq[VIRQ_TIMER]; 1001 1002 if (likely(port)) { 1003 xen_evtchn_set_port(port); 1004 } 1005 1006 qemu_mutex_lock(&env->xen_timers_lock); 1007 env->xen_singleshot_timer_ns = 0; 1008 qemu_mutex_unlock(&env->xen_timers_lock); 1009 } 1010 1011 static void xen_vcpu_periodic_timer_event(void *opaque) 1012 { 1013 CPUState *cpu = opaque; 1014 CPUX86State *env = &X86_CPU(cpu)->env; 1015 uint16_t port = env->xen_virq[VIRQ_TIMER]; 1016 int64_t qemu_now; 1017 1018 if (likely(port)) { 1019 xen_evtchn_set_port(port); 1020 } 1021 1022 qemu_mutex_lock(&env->xen_timers_lock); 1023 1024 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1025 timer_mod_ns(env->xen_periodic_timer, 1026 qemu_now + env->xen_periodic_timer_period); 1027 1028 qemu_mutex_unlock(&env->xen_timers_lock); 1029 } 1030 1031 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns) 1032 { 1033 CPUX86State *tenv = &X86_CPU(target)->env; 1034 int64_t qemu_now; 1035 1036 timer_del(tenv->xen_periodic_timer); 1037 1038 qemu_mutex_lock(&tenv->xen_timers_lock); 1039 1040 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1041 timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns); 1042 tenv->xen_periodic_timer_period = period_ns; 1043 1044 qemu_mutex_unlock(&tenv->xen_timers_lock); 1045 return 0; 1046 } 1047 1048 #define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL)) 1049 #define MICROSECS(_us) ((int64_t)((_us) * 1000ULL)) 1050 #define STIME_MAX ((time_t)((int64_t)~0ull >> 1)) 1051 /* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */ 1052 #define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2)) 1053 1054 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target, 1055 uint64_t arg) 1056 { 1057 struct vcpu_set_periodic_timer spt; 1058 1059 qemu_build_assert(sizeof(spt) == 8); 1060 if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) { 1061 return -EFAULT; 1062 } 1063 1064 if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) { 1065 return -EINVAL; 1066 } 1067 1068 return do_set_periodic_timer(target, spt.period_ns); 1069 } 1070 1071 static int vcpuop_stop_periodic_timer(CPUState *target) 1072 { 1073 CPUX86State *tenv = &X86_CPU(target)->env; 1074 1075 qemu_mutex_lock(&tenv->xen_timers_lock); 1076 1077 timer_del(tenv->xen_periodic_timer); 1078 tenv->xen_periodic_timer_period = 0; 1079 1080 qemu_mutex_unlock(&tenv->xen_timers_lock); 1081 return 0; 1082 } 1083 1084 /* 1085 * Userspace handling of timer, for older kernels. 1086 * Must always be called with xen_timers_lock held. 
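 *
 * (Kernels with the EVTCHN_SEND capability instead accelerate the
 * singleshot timer in-kernel via KVM_XEN_VCPU_ATTR_TYPE_TIMER; see
 * kvm_xen_set_vcpu_timer().)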
1087 */ 1088 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs, 1089 bool linux_wa) 1090 { 1091 CPUX86State *env = &X86_CPU(cs)->env; 1092 int64_t now = kvm_get_current_ns(); 1093 int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1094 int64_t delta = timeout_abs - now; 1095 1096 if (linux_wa && unlikely((int64_t)timeout_abs < 0 || 1097 (delta > 0 && (uint32_t)(delta >> 50) != 0))) { 1098 /* 1099 * Xen has a 'Linux workaround' in do_set_timer_op() which checks 1100 * for negative absolute timeout values (caused by integer 1101 * overflow), and for values about 13 days in the future (2^50ns) 1102 * which would be caused by jiffies overflow. For those cases, it 1103 * sets the timeout 100ms in the future (not *too* soon, since if 1104 * a guest really did set a long timeout on purpose we don't want 1105 * to keep churning CPU time by waking it up). 1106 */ 1107 delta = (100 * SCALE_MS); 1108 timeout_abs = now + delta; 1109 } 1110 1111 timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta); 1112 env->xen_singleshot_timer_ns = now + delta; 1113 return 0; 1114 } 1115 1116 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg) 1117 { 1118 struct vcpu_set_singleshot_timer sst = { 0 }; 1119 1120 /* 1121 * The struct is a uint64_t followed by a uint32_t. On 32-bit that 1122 * makes it 12 bytes. On 64-bit it gets padded to 16. The parts 1123 * that get used are identical, and there's four bytes of padding 1124 * unused at the end. For true Xen compatibility we should attempt 1125 * to copy the full 16 bytes from 64-bit guests, and return -EFAULT 1126 * if we can't get the padding too. But that's daft. Just copy what 1127 * we need. 1128 */ 1129 qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8); 1130 qemu_build_assert(sizeof(sst) >= 12); 1131 1132 if (kvm_copy_from_gva(cs, arg, &sst, 12)) { 1133 return -EFAULT; 1134 } 1135 1136 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 1137 1138 /* 1139 * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does. 1140 * The only guest that ever used it, got it wrong. 1141 * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909 1142 */ 1143 return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false); 1144 } 1145 1146 static int vcpuop_stop_singleshot_timer(CPUState *cs) 1147 { 1148 CPUX86State *env = &X86_CPU(cs)->env; 1149 1150 qemu_mutex_lock(&env->xen_timers_lock); 1151 1152 timer_del(env->xen_singleshot_timer); 1153 env->xen_singleshot_timer_ns = 0; 1154 1155 qemu_mutex_unlock(&env->xen_timers_lock); 1156 return 0; 1157 } 1158 1159 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1160 uint64_t timeout) 1161 { 1162 int err; 1163 1164 if (unlikely(timeout == 0)) { 1165 err = vcpuop_stop_singleshot_timer(CPU(cpu)); 1166 } else { 1167 QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock); 1168 err = do_set_singleshot_timer(CPU(cpu), timeout, true); 1169 } 1170 exit->u.hcall.result = err; 1171 return true; 1172 } 1173 1174 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1175 int cmd, int vcpu_id, uint64_t arg) 1176 { 1177 CPUState *cs = CPU(cpu); 1178 CPUState *dest = cs->cpu_index == vcpu_id ? 
cs : qemu_get_cpu(vcpu_id); 1179 int err; 1180 1181 if (!dest) { 1182 err = -ENOENT; 1183 goto out; 1184 } 1185 1186 switch (cmd) { 1187 case VCPUOP_register_runstate_memory_area: 1188 err = vcpuop_register_runstate_info(cs, dest, arg); 1189 break; 1190 case VCPUOP_register_vcpu_time_memory_area: 1191 err = vcpuop_register_vcpu_time_info(cs, dest, arg); 1192 break; 1193 case VCPUOP_register_vcpu_info: 1194 err = vcpuop_register_vcpu_info(cs, dest, arg); 1195 break; 1196 case VCPUOP_set_singleshot_timer: { 1197 if (cs->cpu_index == vcpu_id) { 1198 err = vcpuop_set_singleshot_timer(dest, arg); 1199 } else { 1200 err = -EINVAL; 1201 } 1202 break; 1203 } 1204 case VCPUOP_stop_singleshot_timer: 1205 if (cs->cpu_index == vcpu_id) { 1206 err = vcpuop_stop_singleshot_timer(dest); 1207 } else { 1208 err = -EINVAL; 1209 } 1210 break; 1211 case VCPUOP_set_periodic_timer: { 1212 err = vcpuop_set_periodic_timer(cs, dest, arg); 1213 break; 1214 } 1215 case VCPUOP_stop_periodic_timer: 1216 err = vcpuop_stop_periodic_timer(dest); 1217 break; 1218 1219 default: 1220 return false; 1221 } 1222 1223 out: 1224 exit->u.hcall.result = err; 1225 return true; 1226 } 1227 1228 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1229 int cmd, uint64_t arg) 1230 { 1231 CPUState *cs = CPU(cpu); 1232 int err = -ENOSYS; 1233 1234 switch (cmd) { 1235 case EVTCHNOP_init_control: 1236 case EVTCHNOP_expand_array: 1237 case EVTCHNOP_set_priority: 1238 /* We do not support FIFO channels at this point */ 1239 err = -ENOSYS; 1240 break; 1241 1242 case EVTCHNOP_status: { 1243 struct evtchn_status status; 1244 1245 qemu_build_assert(sizeof(status) == 24); 1246 if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) { 1247 err = -EFAULT; 1248 break; 1249 } 1250 1251 err = xen_evtchn_status_op(&status); 1252 if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) { 1253 err = -EFAULT; 1254 } 1255 break; 1256 } 1257 case EVTCHNOP_close: { 1258 struct evtchn_close close; 1259 1260 qemu_build_assert(sizeof(close) == 4); 1261 if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) { 1262 err = -EFAULT; 1263 break; 1264 } 1265 1266 err = xen_evtchn_close_op(&close); 1267 break; 1268 } 1269 case EVTCHNOP_unmask: { 1270 struct evtchn_unmask unmask; 1271 1272 qemu_build_assert(sizeof(unmask) == 4); 1273 if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) { 1274 err = -EFAULT; 1275 break; 1276 } 1277 1278 err = xen_evtchn_unmask_op(&unmask); 1279 break; 1280 } 1281 case EVTCHNOP_bind_virq: { 1282 struct evtchn_bind_virq virq; 1283 1284 qemu_build_assert(sizeof(virq) == 12); 1285 if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) { 1286 err = -EFAULT; 1287 break; 1288 } 1289 1290 err = xen_evtchn_bind_virq_op(&virq); 1291 if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) { 1292 err = -EFAULT; 1293 } 1294 break; 1295 } 1296 case EVTCHNOP_bind_pirq: { 1297 struct evtchn_bind_pirq pirq; 1298 1299 qemu_build_assert(sizeof(pirq) == 12); 1300 if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) { 1301 err = -EFAULT; 1302 break; 1303 } 1304 1305 err = xen_evtchn_bind_pirq_op(&pirq); 1306 if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) { 1307 err = -EFAULT; 1308 } 1309 break; 1310 } 1311 case EVTCHNOP_bind_ipi: { 1312 struct evtchn_bind_ipi ipi; 1313 1314 qemu_build_assert(sizeof(ipi) == 8); 1315 if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) { 1316 err = -EFAULT; 1317 break; 1318 } 1319 1320 err = xen_evtchn_bind_ipi_op(&ipi); 1321 if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) 
{ 1322 err = -EFAULT; 1323 } 1324 break; 1325 } 1326 case EVTCHNOP_send: { 1327 struct evtchn_send send; 1328 1329 qemu_build_assert(sizeof(send) == 4); 1330 if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) { 1331 err = -EFAULT; 1332 break; 1333 } 1334 1335 err = xen_evtchn_send_op(&send); 1336 break; 1337 } 1338 case EVTCHNOP_alloc_unbound: { 1339 struct evtchn_alloc_unbound alloc; 1340 1341 qemu_build_assert(sizeof(alloc) == 8); 1342 if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) { 1343 err = -EFAULT; 1344 break; 1345 } 1346 1347 err = xen_evtchn_alloc_unbound_op(&alloc); 1348 if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) { 1349 err = -EFAULT; 1350 } 1351 break; 1352 } 1353 case EVTCHNOP_bind_interdomain: { 1354 struct evtchn_bind_interdomain interdomain; 1355 1356 qemu_build_assert(sizeof(interdomain) == 12); 1357 if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) { 1358 err = -EFAULT; 1359 break; 1360 } 1361 1362 err = xen_evtchn_bind_interdomain_op(&interdomain); 1363 if (!err && 1364 kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) { 1365 err = -EFAULT; 1366 } 1367 break; 1368 } 1369 case EVTCHNOP_bind_vcpu: { 1370 struct evtchn_bind_vcpu vcpu; 1371 1372 qemu_build_assert(sizeof(vcpu) == 8); 1373 if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) { 1374 err = -EFAULT; 1375 break; 1376 } 1377 1378 err = xen_evtchn_bind_vcpu_op(&vcpu); 1379 break; 1380 } 1381 case EVTCHNOP_reset: { 1382 struct evtchn_reset reset; 1383 1384 qemu_build_assert(sizeof(reset) == 2); 1385 if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) { 1386 err = -EFAULT; 1387 break; 1388 } 1389 1390 err = xen_evtchn_reset_op(&reset); 1391 break; 1392 } 1393 default: 1394 return false; 1395 } 1396 1397 exit->u.hcall.result = err; 1398 return true; 1399 } 1400 1401 int kvm_xen_soft_reset(void) 1402 { 1403 CPUState *cpu; 1404 int err; 1405 1406 assert(bql_locked()); 1407 1408 trace_kvm_xen_soft_reset(); 1409 1410 err = xen_evtchn_soft_reset(); 1411 if (err) { 1412 return err; 1413 } 1414 1415 /* 1416 * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly, 1417 * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to 1418 * to deliver to the timer interrupt and treats that as 'disabled'. 
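     *
     * The per-vCPU state (vcpu_info, timers, callback vector, virqs) is
     * cleared separately by do_vcpu_soft_reset() on each vCPU below.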
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_primary_console_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        bql_lock();
        ret = kvm_xen_soft_reset();
        bql_unlock();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
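         * Falling through to the SCHEDOP_yield case below implements
         * exactly that.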
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_copy:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                return -EFAULT;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
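         * Everything the guest reads back (notably the allocated pirq)
         * lies within that compat-sized prefix for 64-bit guests too.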
1629 */ 1630 if (!err && kvm_copy_to_gva(cs, arg, &map, 1631 sizeof(struct compat_physdev_map_pirq))) { 1632 err = -EFAULT; 1633 } 1634 break; 1635 } 1636 case PHYSDEVOP_unmap_pirq: { 1637 struct physdev_unmap_pirq unmap; 1638 1639 qemu_build_assert(sizeof(unmap) == 8); 1640 if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) { 1641 err = -EFAULT; 1642 break; 1643 } 1644 1645 err = xen_physdev_unmap_pirq(&unmap); 1646 if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) { 1647 err = -EFAULT; 1648 } 1649 break; 1650 } 1651 case PHYSDEVOP_eoi: { 1652 struct physdev_eoi eoi; 1653 1654 qemu_build_assert(sizeof(eoi) == 4); 1655 if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) { 1656 err = -EFAULT; 1657 break; 1658 } 1659 1660 err = xen_physdev_eoi_pirq(&eoi); 1661 if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) { 1662 err = -EFAULT; 1663 } 1664 break; 1665 } 1666 case PHYSDEVOP_irq_status_query: { 1667 struct physdev_irq_status_query query; 1668 1669 qemu_build_assert(sizeof(query) == 8); 1670 if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) { 1671 err = -EFAULT; 1672 break; 1673 } 1674 1675 err = xen_physdev_query_pirq(&query); 1676 if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) { 1677 err = -EFAULT; 1678 } 1679 break; 1680 } 1681 case PHYSDEVOP_get_free_pirq: { 1682 struct physdev_get_free_pirq get; 1683 1684 qemu_build_assert(sizeof(get) == 8); 1685 if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) { 1686 err = -EFAULT; 1687 break; 1688 } 1689 1690 err = xen_physdev_get_free_pirq(&get); 1691 if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) { 1692 err = -EFAULT; 1693 } 1694 break; 1695 } 1696 case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */ 1697 err = -ENOSYS; 1698 break; 1699 1700 default: 1701 return false; 1702 } 1703 1704 exit->u.hcall.result = err; 1705 return true; 1706 } 1707 1708 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit) 1709 { 1710 uint16_t code = exit->u.hcall.input; 1711 1712 if (exit->u.hcall.cpl > 0) { 1713 exit->u.hcall.result = -EPERM; 1714 return true; 1715 } 1716 1717 switch (code) { 1718 case __HYPERVISOR_set_timer_op: 1719 if (exit->u.hcall.longmode) { 1720 return kvm_xen_hcall_set_timer_op(exit, cpu, 1721 exit->u.hcall.params[0]); 1722 } else { 1723 /* In 32-bit mode, the 64-bit timer value is in two args. 
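             * params[0] carries the low 32 bits and params[1] the high
             * 32 bits; they are recombined into a single value below.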
*/ 1724 uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 | 1725 (uint32_t)exit->u.hcall.params[0]; 1726 return kvm_xen_hcall_set_timer_op(exit, cpu, val); 1727 } 1728 case __HYPERVISOR_grant_table_op: 1729 return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0], 1730 exit->u.hcall.params[1], 1731 exit->u.hcall.params[2]); 1732 case __HYPERVISOR_sched_op: 1733 return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0], 1734 exit->u.hcall.params[1]); 1735 case __HYPERVISOR_event_channel_op: 1736 return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0], 1737 exit->u.hcall.params[1]); 1738 case __HYPERVISOR_vcpu_op: 1739 return kvm_xen_hcall_vcpu_op(exit, cpu, 1740 exit->u.hcall.params[0], 1741 exit->u.hcall.params[1], 1742 exit->u.hcall.params[2]); 1743 case __HYPERVISOR_hvm_op: 1744 return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0], 1745 exit->u.hcall.params[1]); 1746 case __HYPERVISOR_memory_op: 1747 return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0], 1748 exit->u.hcall.params[1]); 1749 case __HYPERVISOR_physdev_op: 1750 return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0], 1751 exit->u.hcall.params[1]); 1752 case __HYPERVISOR_xen_version: 1753 return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0], 1754 exit->u.hcall.params[1]); 1755 default: 1756 return false; 1757 } 1758 } 1759 1760 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit) 1761 { 1762 if (exit->type != KVM_EXIT_XEN_HCALL) { 1763 return -1; 1764 } 1765 1766 /* 1767 * The kernel latches the guest 32/64 mode when the MSR is used to fill 1768 * the hypercall page. So if we see a hypercall in a mode that doesn't 1769 * match our own idea of the guest mode, fetch the kernel's idea of the 1770 * "long mode" to remain in sync. 1771 */ 1772 if (exit->u.hcall.longmode != xen_is_long_mode()) { 1773 xen_sync_long_mode(); 1774 } 1775 1776 if (!do_kvm_xen_handle_exit(cpu, exit)) { 1777 /* 1778 * Some hypercalls will be deliberately "implemented" by returning 1779 * -ENOSYS. This case is for hypercalls which are unexpected. 
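         * Those are logged via LOG_UNIMP below so that they are visible
         * when running with '-d unimp'.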
1780 */ 1781 exit->u.hcall.result = -ENOSYS; 1782 qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %" 1783 PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n", 1784 (uint64_t)exit->u.hcall.input, 1785 (uint64_t)exit->u.hcall.params[0], 1786 (uint64_t)exit->u.hcall.params[1], 1787 (uint64_t)exit->u.hcall.params[2]); 1788 } 1789 1790 trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl, 1791 exit->u.hcall.input, exit->u.hcall.params[0], 1792 exit->u.hcall.params[1], exit->u.hcall.params[2], 1793 exit->u.hcall.result); 1794 return 0; 1795 } 1796 1797 uint16_t kvm_xen_get_gnttab_max_frames(void) 1798 { 1799 KVMState *s = KVM_STATE(current_accel()); 1800 return s->xen_gnttab_max_frames; 1801 } 1802 1803 uint16_t kvm_xen_get_evtchn_max_pirq(void) 1804 { 1805 KVMState *s = KVM_STATE(current_accel()); 1806 return s->xen_evtchn_max_pirq; 1807 } 1808 1809 int kvm_put_xen_state(CPUState *cs) 1810 { 1811 X86CPU *cpu = X86_CPU(cs); 1812 CPUX86State *env = &cpu->env; 1813 uint64_t gpa; 1814 int ret; 1815 1816 gpa = env->xen_vcpu_info_gpa; 1817 if (gpa == INVALID_GPA) { 1818 gpa = env->xen_vcpu_info_default_gpa; 1819 } 1820 1821 if (gpa != INVALID_GPA) { 1822 ret = set_vcpu_info(cs, gpa); 1823 if (ret < 0) { 1824 return ret; 1825 } 1826 } 1827 1828 gpa = env->xen_vcpu_time_info_gpa; 1829 if (gpa != INVALID_GPA) { 1830 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO, 1831 gpa); 1832 if (ret < 0) { 1833 return ret; 1834 } 1835 } 1836 1837 gpa = env->xen_vcpu_runstate_gpa; 1838 if (gpa != INVALID_GPA) { 1839 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR, 1840 gpa); 1841 if (ret < 0) { 1842 return ret; 1843 } 1844 } 1845 1846 if (env->xen_periodic_timer_period) { 1847 ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period); 1848 if (ret < 0) { 1849 return ret; 1850 } 1851 } 1852 1853 if (!kvm_xen_has_cap(EVTCHN_SEND)) { 1854 /* 1855 * If the kernel has EVTCHN_SEND support then it handles timers too, 1856 * so the timer will be restored by kvm_xen_set_vcpu_timer() below. 1857 */ 1858 QEMU_LOCK_GUARD(&env->xen_timers_lock); 1859 if (env->xen_singleshot_timer_ns) { 1860 ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns, 1861 false); 1862 if (ret < 0) { 1863 return ret; 1864 } 1865 } 1866 return 0; 1867 } 1868 1869 if (env->xen_vcpu_callback_vector) { 1870 ret = kvm_xen_set_vcpu_callback_vector(cs); 1871 if (ret < 0) { 1872 return ret; 1873 } 1874 } 1875 1876 if (env->xen_virq[VIRQ_TIMER]) { 1877 do_set_vcpu_timer_virq(cs, 1878 RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER])); 1879 } 1880 return 0; 1881 } 1882 1883 int kvm_get_xen_state(CPUState *cs) 1884 { 1885 X86CPU *cpu = X86_CPU(cs); 1886 CPUX86State *env = &cpu->env; 1887 uint64_t gpa; 1888 int ret; 1889 1890 /* 1891 * The kernel does not mark vcpu_info as dirty when it delivers interrupts 1892 * to it. It's up to userspace to *assume* that any page shared thus is 1893 * always considered dirty. The shared_info page is different since it's 1894 * an overlay and migrated separately anyway. 
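     * Hence the explicit memory_region_set_dirty() on the vcpu_info
     * region below.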
1895 */ 1896 gpa = env->xen_vcpu_info_gpa; 1897 if (gpa == INVALID_GPA) { 1898 gpa = env->xen_vcpu_info_default_gpa; 1899 } 1900 if (gpa != INVALID_GPA) { 1901 MemoryRegionSection mrs = memory_region_find(get_system_memory(), 1902 gpa, 1903 sizeof(struct vcpu_info)); 1904 if (mrs.mr && 1905 !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) { 1906 memory_region_set_dirty(mrs.mr, mrs.offset_within_region, 1907 sizeof(struct vcpu_info)); 1908 } 1909 } 1910 1911 if (!kvm_xen_has_cap(EVTCHN_SEND)) { 1912 return 0; 1913 } 1914 1915 /* 1916 * If the kernel is accelerating timers, read out the current value of the 1917 * singleshot timer deadline. 1918 */ 1919 if (env->xen_virq[VIRQ_TIMER]) { 1920 struct kvm_xen_vcpu_attr va = { 1921 .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER, 1922 }; 1923 ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va); 1924 if (ret < 0) { 1925 return ret; 1926 } 1927 1928 /* 1929 * This locking is fairly pointless, and is here to appease Coverity. 1930 * There is an unavoidable race condition if a different vCPU sets a 1931 * timer for this vCPU after the value has been read out. But that's 1932 * OK in practice because *all* the vCPUs need to be stopped before 1933 * we set about migrating their state. 1934 */ 1935 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 1936 env->xen_singleshot_timer_ns = va.u.timer.expires_ns; 1937 } 1938 1939 return 0; 1940 } 1941