1 /* 2 * Xen HVM emulation support in KVM 3 * 4 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved. 5 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved. 6 * 7 * This work is licensed under the terms of the GNU GPL, version 2 or later. 8 * See the COPYING file in the top-level directory. 9 * 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/log.h" 14 #include "qemu/main-loop.h" 15 #include "qemu/error-report.h" 16 #include "hw/xen/xen.h" 17 #include "sysemu/kvm_int.h" 18 #include "sysemu/kvm_xen.h" 19 #include "kvm/kvm_i386.h" 20 #include "exec/address-spaces.h" 21 #include "xen-emu.h" 22 #include "trace.h" 23 #include "sysemu/runstate.h" 24 25 #include "hw/pci/msi.h" 26 #include "hw/i386/apic-msidef.h" 27 #include "hw/i386/e820_memory_layout.h" 28 #include "hw/i386/kvm/xen_overlay.h" 29 #include "hw/i386/kvm/xen_evtchn.h" 30 #include "hw/i386/kvm/xen_gnttab.h" 31 #include "hw/i386/kvm/xen_xenstore.h" 32 33 #include "hw/xen/interface/version.h" 34 #include "hw/xen/interface/sched.h" 35 #include "hw/xen/interface/memory.h" 36 #include "hw/xen/interface/hvm/hvm_op.h" 37 #include "hw/xen/interface/hvm/params.h" 38 #include "hw/xen/interface/vcpu.h" 39 #include "hw/xen/interface/event_channel.h" 40 #include "hw/xen/interface/grant_table.h" 41 42 #include "xen-compat.h" 43 44 static void xen_vcpu_singleshot_timer_event(void *opaque); 45 static void xen_vcpu_periodic_timer_event(void *opaque); 46 static int vcpuop_stop_singleshot_timer(CPUState *cs); 47 48 #ifdef TARGET_X86_64 49 #define hypercall_compat32(longmode) (!(longmode)) 50 #else 51 #define hypercall_compat32(longmode) (false) 52 #endif 53 54 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa, 55 size_t *len, bool is_write) 56 { 57 struct kvm_translation tr = { 58 .linear_address = gva, 59 }; 60 61 if (len) { 62 *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK); 63 } 64 65 if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid || 66 (is_write && !tr.writeable)) { 67 return false; 68 } 69 *gpa = tr.physical_address; 70 return true; 71 } 72 73 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz, 74 bool is_write) 75 { 76 uint8_t *buf = (uint8_t *)_buf; 77 uint64_t gpa; 78 size_t len; 79 80 while (sz) { 81 if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) { 82 return -EFAULT; 83 } 84 if (len > sz) { 85 len = sz; 86 } 87 88 cpu_physical_memory_rw(gpa, buf, len, is_write); 89 90 buf += len; 91 sz -= len; 92 gva += len; 93 } 94 95 return 0; 96 } 97 98 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf, 99 size_t sz) 100 { 101 return kvm_gva_rw(cs, gva, buf, sz, false); 102 } 103 104 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf, 105 size_t sz) 106 { 107 return kvm_gva_rw(cs, gva, buf, sz, true); 108 } 109 110 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr) 111 { 112 const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | 113 KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO; 114 struct kvm_xen_hvm_config cfg = { 115 .msr = hypercall_msr, 116 .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL, 117 }; 118 int xen_caps, ret; 119 120 xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM); 121 if (required_caps & ~xen_caps) { 122 error_report("kvm: Xen HVM guest support not present or insufficient"); 123 return -ENOSYS; 124 } 125 126 if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) { 127 struct kvm_xen_hvm_attr ha = { 128 .type = KVM_XEN_ATTR_TYPE_XEN_VERSION, 129 .u.xen_version = s->xen_version, 
130 }; 131 (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha); 132 133 cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND; 134 } 135 136 ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg); 137 if (ret < 0) { 138 error_report("kvm: Failed to enable Xen HVM support: %s", 139 strerror(-ret)); 140 return ret; 141 } 142 143 /* If called a second time, don't repeat the rest of the setup. */ 144 if (s->xen_caps) { 145 return 0; 146 } 147 148 /* 149 * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info 150 * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared. 151 * 152 * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows 153 * such things to be polled at precisely the right time. We *could* do 154 * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at 155 * the moment the IRQ is acked, and see if it should be reasserted. 156 * 157 * But the in-kernel irqchip is deprecated, so we're unlikely to add 158 * that support in the kernel. Insist on using the split irqchip mode 159 * instead. 160 * 161 * This leaves us polling for the level going low in QEMU, which lacks 162 * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a 163 * spurious 'ack' to an INTX IRQ every time there's any MMIO access to 164 * the device (for which it has to unmap the device and trap access, for 165 * some period after an IRQ!!). In the Xen case, we do it on exit from 166 * KVM_RUN, if the flag is set to say that the GSI is currently asserted. 167 * Which is kind of icky, but less so than the VFIO one. I may fix them 168 * both later... 169 */ 170 if (!kvm_kernel_irqchip_split()) { 171 error_report("kvm: Xen support requires kernel-irqchip=split"); 172 return -EINVAL; 173 } 174 175 s->xen_caps = xen_caps; 176 177 /* Tell fw_cfg to notify the BIOS to reserve the range. */ 178 ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE, 179 E820_RESERVED); 180 if (ret < 0) { 181 fprintf(stderr, "e820_add_entry() table is full\n"); 182 return ret; 183 } 184 185 /* The page couldn't be overlaid until KVM was initialized */ 186 xen_xenstore_reset(); 187 188 return 0; 189 } 190 191 int kvm_xen_init_vcpu(CPUState *cs) 192 { 193 X86CPU *cpu = X86_CPU(cs); 194 CPUX86State *env = &cpu->env; 195 int err; 196 197 /* 198 * The kernel needs to know the Xen/ACPI vCPU ID because that's 199 * what the guest uses in hypercalls such as timers. It doesn't 200 * match the APIC ID which is generally used for talking to the 201 * kernel about vCPUs. And if vCPU threads race with creating 202 * their KVM vCPUs out of order, it doesn't necessarily match 203 * with the kernel's internal vCPU indices either. 
204 */ 205 if (kvm_xen_has_cap(EVTCHN_SEND)) { 206 struct kvm_xen_vcpu_attr va = { 207 .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID, 208 .u.vcpu_id = cs->cpu_index, 209 }; 210 err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va); 211 if (err) { 212 error_report("kvm: Failed to set Xen vCPU ID attribute: %s", 213 strerror(-err)); 214 return err; 215 } 216 } 217 218 env->xen_vcpu_info_gpa = INVALID_GPA; 219 env->xen_vcpu_info_default_gpa = INVALID_GPA; 220 env->xen_vcpu_time_info_gpa = INVALID_GPA; 221 env->xen_vcpu_runstate_gpa = INVALID_GPA; 222 223 qemu_mutex_init(&env->xen_timers_lock); 224 env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 225 xen_vcpu_singleshot_timer_event, 226 cpu); 227 if (!env->xen_singleshot_timer) { 228 return -ENOMEM; 229 } 230 env->xen_singleshot_timer->opaque = cs; 231 232 env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, 233 xen_vcpu_periodic_timer_event, 234 cpu); 235 if (!env->xen_periodic_timer) { 236 return -ENOMEM; 237 } 238 env->xen_periodic_timer->opaque = cs; 239 240 return 0; 241 } 242 243 uint32_t kvm_xen_get_caps(void) 244 { 245 return kvm_state->xen_caps; 246 } 247 248 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu, 249 int cmd, uint64_t arg) 250 { 251 int err = 0; 252 253 switch (cmd) { 254 case XENVER_get_features: { 255 struct xen_feature_info fi; 256 257 /* No need for 32/64 compat handling */ 258 qemu_build_assert(sizeof(fi) == 8); 259 260 err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi)); 261 if (err) { 262 break; 263 } 264 265 fi.submap = 0; 266 if (fi.submap_idx == 0) { 267 fi.submap |= 1 << XENFEAT_writable_page_tables | 268 1 << XENFEAT_writable_descriptor_tables | 269 1 << XENFEAT_auto_translated_physmap | 270 1 << XENFEAT_hvm_callback_vector | 271 1 << XENFEAT_hvm_safe_pvclock | 272 1 << XENFEAT_hvm_pirqs; 273 } 274 275 err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi)); 276 break; 277 } 278 279 default: 280 return false; 281 } 282 283 exit->u.hcall.result = err; 284 return true; 285 } 286 287 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa) 288 { 289 struct kvm_xen_vcpu_attr xhsi; 290 291 xhsi.type = type; 292 xhsi.u.gpa = gpa; 293 294 trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa); 295 296 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi); 297 } 298 299 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs) 300 { 301 uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector; 302 struct kvm_xen_vcpu_attr xva; 303 304 xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR; 305 xva.u.vector = vector; 306 307 trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector); 308 309 return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva); 310 } 311 312 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data) 313 { 314 X86CPU *cpu = X86_CPU(cs); 315 CPUX86State *env = &cpu->env; 316 317 env->xen_vcpu_callback_vector = data.host_int; 318 319 if (kvm_xen_has_cap(EVTCHN_SEND)) { 320 kvm_xen_set_vcpu_callback_vector(cs); 321 } 322 } 323 324 static int set_vcpu_info(CPUState *cs, uint64_t gpa) 325 { 326 X86CPU *cpu = X86_CPU(cs); 327 CPUX86State *env = &cpu->env; 328 MemoryRegionSection mrs = { .mr = NULL }; 329 void *vcpu_info_hva = NULL; 330 int ret; 331 332 ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa); 333 if (ret || gpa == INVALID_GPA) { 334 goto out; 335 } 336 337 mrs = memory_region_find(get_system_memory(), gpa, 338 sizeof(struct vcpu_info)); 339 if (mrs.mr && mrs.mr->ram_block && 340 !int128_lt(mrs.size, 
                       int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

 out:
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}

static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_default_gpa = data.host_ulong;

    /* Changing the default does nothing if a vcpu_info was explicitly set. */
    if (env->xen_vcpu_info_gpa == INVALID_GPA) {
        set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
    }
}

static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = data.host_ulong;

    set_vcpu_info(cs, env->xen_vcpu_info_gpa);
}

void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    if (!cs) {
        return NULL;
    }

    return X86_CPU(cs)->env.xen_vcpu_info_hva;
}

void kvm_xen_maybe_deassert_callback(CPUState *cs)
{
    CPUX86State *env = &X86_CPU(cs)->env;
    struct vcpu_info *vi = env->xen_vcpu_info_hva;
    if (!vi) {
        return;
    }

    /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
    if (!vi->evtchn_upcall_pending) {
        qemu_mutex_lock_iothread();
        /*
         * Check again now we have the lock, because it may have been
         * asserted in the interim. And we don't want to take the lock
         * every time because this is a fast path.
         */
        if (!vi->evtchn_upcall_pending) {
            X86_CPU(cs)->env.xen_callback_asserted = false;
            xen_evtchn_set_callback_level(0);
        }
        qemu_mutex_unlock_iothread();
    }
}

void kvm_xen_set_callback_asserted(void)
{
    CPUState *cs = qemu_get_cpu(0);

    if (cs) {
        X86_CPU(cs)->env.xen_callback_asserted = true;
    }
}

bool kvm_xen_has_vcpu_callback_vector(void)
{
    CPUState *cs = qemu_get_cpu(0);

    return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
}

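/*
 * Deliver the event channel upcall to the given vCPU. If the guest has
 * registered a per-vCPU callback vector (HVMOP_set_evtchn_upcall_vector),
 * deliver it directly to that vCPU's local APIC as an MSI. Otherwise fall
 * back to the global method configured via HVM_PARAM_CALLBACK_IRQ: for the
 * vector type just kick the vCPU so KVM injects it on entry, and for the
 * GSI/PCI_INTX types assert the callback level (vCPU0 only).
 */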
void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);
    uint8_t vector;

    if (!cs) {
        return;
    }

    vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
    if (vector) {
        /*
         * The per-vCPU callback vector injected via lapic. Just
         * deliver it as an MSI.
         */
        MSIMessage msg = {
            .address = APIC_DEFAULT_ADDRESS |
                       (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
            .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
        };
        kvm_irqchip_send_msi(kvm_state, msg);
        return;
    }

    switch (type) {
    case HVM_PARAM_CALLBACK_TYPE_VECTOR:
        /*
         * If the evtchn_upcall_pending field in the vcpu_info is set, then
         * KVM will automatically deliver the vector on entering the vCPU
         * so all we have to do is kick it out.
         */
        qemu_cpu_kick(cs);
        break;

    case HVM_PARAM_CALLBACK_TYPE_GSI:
    case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
        if (vcpu_id == 0) {
            xen_evtchn_set_callback_level(1);
        }
        break;
    }
}

/* Must always be called with xen_timers_lock held */
static int kvm_xen_set_vcpu_timer(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    struct kvm_xen_vcpu_attr va = {
        .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        .u.timer.port = env->xen_virq[VIRQ_TIMER],
        .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
        .u.timer.expires_ns = env->xen_singleshot_timer_ns,
    };

    return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
}

static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
{
    QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
    kvm_xen_set_vcpu_timer(cs);
}

int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
{
    CPUState *cs = qemu_get_cpu(vcpu_id);

    if (!cs) {
        return -ENOENT;
    }

    /* cpu.h doesn't include the actual Xen header. */
    qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);

    if (virq >= NR_VIRQS) {
        return -EINVAL;
    }

    if (port && X86_CPU(cs)->env.xen_virq[virq]) {
        return -EEXIST;
    }

    X86_CPU(cs)->env.xen_virq[virq] = port;
    if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
        async_run_on_cpu(cs, do_set_vcpu_timer_virq,
                         RUN_ON_CPU_HOST_INT(port));
    }
    return 0;
}

static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_time_info_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          env->xen_vcpu_time_info_gpa);
}

static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_runstate_gpa = data.host_ulong;

    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          env->xen_vcpu_runstate_gpa);
}

static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;

    env->xen_vcpu_info_gpa = INVALID_GPA;
    env->xen_vcpu_info_default_gpa = INVALID_GPA;
    env->xen_vcpu_time_info_gpa = INVALID_GPA;
    env->xen_vcpu_runstate_gpa = INVALID_GPA;
    env->xen_vcpu_callback_vector = 0;
    memset(env->xen_virq, 0, sizeof(env->xen_virq));

    set_vcpu_info(cs, INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                          INVALID_GPA);
    kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                          INVALID_GPA);
    if (kvm_xen_has_cap(EVTCHN_SEND)) {
        kvm_xen_set_vcpu_callback_vector(cs);

        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = 0;
        kvm_xen_set_vcpu_timer(cs);
    } else {
        vcpuop_stop_singleshot_timer(cs);
    }
}

static int xen_set_shared_info(uint64_t gfn)
{
    uint64_t gpa = gfn << TARGET_PAGE_BITS;
    int i, err;

    QEMU_IOTHREAD_LOCK_GUARD();

    /*
     * The xen_overlay device tells KVM about it too, since it had to
     * do that on migration load anyway (unless we're going to jump
     * through lots of hoops to maintain the fiction that this isn't
     * KVM-specific).
589 */ 590 err = xen_overlay_map_shinfo_page(gpa); 591 if (err) { 592 return err; 593 } 594 595 trace_kvm_xen_set_shared_info(gfn); 596 597 for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) { 598 CPUState *cpu = qemu_get_cpu(i); 599 if (cpu) { 600 async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa, 601 RUN_ON_CPU_HOST_ULONG(gpa)); 602 } 603 gpa += sizeof(vcpu_info_t); 604 } 605 606 return err; 607 } 608 609 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn) 610 { 611 switch (space) { 612 case XENMAPSPACE_shared_info: 613 if (idx > 0) { 614 return -EINVAL; 615 } 616 return xen_set_shared_info(gfn); 617 618 case XENMAPSPACE_grant_table: 619 return xen_gnttab_map_page(idx, gfn); 620 621 case XENMAPSPACE_gmfn: 622 case XENMAPSPACE_gmfn_range: 623 return -ENOTSUP; 624 625 case XENMAPSPACE_gmfn_foreign: 626 case XENMAPSPACE_dev_mmio: 627 return -EPERM; 628 629 default: 630 return -EINVAL; 631 } 632 } 633 634 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu, 635 uint64_t arg) 636 { 637 struct xen_add_to_physmap xatp; 638 CPUState *cs = CPU(cpu); 639 640 if (hypercall_compat32(exit->u.hcall.longmode)) { 641 struct compat_xen_add_to_physmap xatp32; 642 643 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16); 644 if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) { 645 return -EFAULT; 646 } 647 xatp.domid = xatp32.domid; 648 xatp.size = xatp32.size; 649 xatp.space = xatp32.space; 650 xatp.idx = xatp32.idx; 651 xatp.gpfn = xatp32.gpfn; 652 } else { 653 if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) { 654 return -EFAULT; 655 } 656 } 657 658 if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) { 659 return -ESRCH; 660 } 661 662 return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn); 663 } 664 665 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu, 666 uint64_t arg) 667 { 668 struct xen_add_to_physmap_batch xatpb; 669 unsigned long idxs_gva, gpfns_gva, errs_gva; 670 CPUState *cs = CPU(cpu); 671 size_t op_sz; 672 673 if (hypercall_compat32(exit->u.hcall.longmode)) { 674 struct compat_xen_add_to_physmap_batch xatpb32; 675 676 qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20); 677 if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) { 678 return -EFAULT; 679 } 680 xatpb.domid = xatpb32.domid; 681 xatpb.space = xatpb32.space; 682 xatpb.size = xatpb32.size; 683 684 idxs_gva = xatpb32.idxs.c; 685 gpfns_gva = xatpb32.gpfns.c; 686 errs_gva = xatpb32.errs.c; 687 op_sz = sizeof(uint32_t); 688 } else { 689 if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) { 690 return -EFAULT; 691 } 692 op_sz = sizeof(unsigned long); 693 idxs_gva = (unsigned long)xatpb.idxs.p; 694 gpfns_gva = (unsigned long)xatpb.gpfns.p; 695 errs_gva = (unsigned long)xatpb.errs.p; 696 } 697 698 if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) { 699 return -ESRCH; 700 } 701 702 /* Explicitly invalid for the batch op. Not that we implement it anyway. 
*/ 703 if (xatpb.space == XENMAPSPACE_gmfn_range) { 704 return -EINVAL; 705 } 706 707 while (xatpb.size--) { 708 unsigned long idx = 0; 709 unsigned long gpfn = 0; 710 int err; 711 712 /* For 32-bit compat this only copies the low 32 bits of each */ 713 if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) || 714 kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) { 715 return -EFAULT; 716 } 717 idxs_gva += op_sz; 718 gpfns_gva += op_sz; 719 720 err = add_to_physmap_one(xatpb.space, idx, gpfn); 721 722 if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) { 723 return -EFAULT; 724 } 725 errs_gva += sizeof(err); 726 } 727 return 0; 728 } 729 730 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu, 731 int cmd, uint64_t arg) 732 { 733 int err; 734 735 switch (cmd) { 736 case XENMEM_add_to_physmap: 737 err = do_add_to_physmap(exit, cpu, arg); 738 break; 739 740 case XENMEM_add_to_physmap_batch: 741 err = do_add_to_physmap_batch(exit, cpu, arg); 742 break; 743 744 default: 745 return false; 746 } 747 748 exit->u.hcall.result = err; 749 return true; 750 } 751 752 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu, 753 uint64_t arg) 754 { 755 CPUState *cs = CPU(cpu); 756 struct xen_hvm_param hp; 757 int err = 0; 758 759 /* No need for 32/64 compat handling */ 760 qemu_build_assert(sizeof(hp) == 16); 761 762 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 763 err = -EFAULT; 764 goto out; 765 } 766 767 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 768 err = -ESRCH; 769 goto out; 770 } 771 772 switch (hp.index) { 773 case HVM_PARAM_CALLBACK_IRQ: 774 qemu_mutex_lock_iothread(); 775 err = xen_evtchn_set_callback_param(hp.value); 776 qemu_mutex_unlock_iothread(); 777 xen_set_long_mode(exit->u.hcall.longmode); 778 break; 779 default: 780 return false; 781 } 782 783 out: 784 exit->u.hcall.result = err; 785 return true; 786 } 787 788 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu, 789 uint64_t arg) 790 { 791 CPUState *cs = CPU(cpu); 792 struct xen_hvm_param hp; 793 int err = 0; 794 795 /* No need for 32/64 compat handling */ 796 qemu_build_assert(sizeof(hp) == 16); 797 798 if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) { 799 err = -EFAULT; 800 goto out; 801 } 802 803 if (hp.domid != DOMID_SELF && hp.domid != xen_domid) { 804 err = -ESRCH; 805 goto out; 806 } 807 808 switch (hp.index) { 809 case HVM_PARAM_STORE_PFN: 810 hp.value = XEN_SPECIAL_PFN(XENSTORE); 811 break; 812 case HVM_PARAM_STORE_EVTCHN: 813 hp.value = xen_xenstore_get_port(); 814 break; 815 default: 816 return false; 817 } 818 819 if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) { 820 err = -EFAULT; 821 } 822 out: 823 exit->u.hcall.result = err; 824 return true; 825 } 826 827 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit, 828 X86CPU *cpu, uint64_t arg) 829 { 830 struct xen_hvm_evtchn_upcall_vector up; 831 CPUState *target_cs; 832 833 /* No need for 32/64 compat handling */ 834 qemu_build_assert(sizeof(up) == 8); 835 836 if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) { 837 return -EFAULT; 838 } 839 840 if (up.vector < 0x10) { 841 return -EINVAL; 842 } 843 844 target_cs = qemu_get_cpu(up.vcpu); 845 if (!target_cs) { 846 return -EINVAL; 847 } 848 849 async_run_on_cpu(target_cs, do_set_vcpu_callback_vector, 850 RUN_ON_CPU_HOST_INT(up.vector)); 851 return 0; 852 } 853 854 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu, 855 int cmd, uint64_t arg) 856 { 857 int ret = -ENOSYS; 858 switch (cmd) { 859 case 
HVMOP_set_evtchn_upcall_vector: 860 ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg); 861 break; 862 863 case HVMOP_pagetable_dying: 864 ret = -ENOSYS; 865 break; 866 867 case HVMOP_set_param: 868 return handle_set_param(exit, cpu, arg); 869 870 case HVMOP_get_param: 871 return handle_get_param(exit, cpu, arg); 872 873 default: 874 return false; 875 } 876 877 exit->u.hcall.result = ret; 878 return true; 879 } 880 881 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target, 882 uint64_t arg) 883 { 884 struct vcpu_register_vcpu_info rvi; 885 uint64_t gpa; 886 887 /* No need for 32/64 compat handling */ 888 qemu_build_assert(sizeof(rvi) == 16); 889 qemu_build_assert(sizeof(struct vcpu_info) == 64); 890 891 if (!target) { 892 return -ENOENT; 893 } 894 895 if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) { 896 return -EFAULT; 897 } 898 899 if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) { 900 return -EINVAL; 901 } 902 903 gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset); 904 async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa)); 905 return 0; 906 } 907 908 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target, 909 uint64_t arg) 910 { 911 struct vcpu_register_time_memory_area tma; 912 uint64_t gpa; 913 size_t len; 914 915 /* No need for 32/64 compat handling */ 916 qemu_build_assert(sizeof(tma) == 8); 917 qemu_build_assert(sizeof(struct vcpu_time_info) == 32); 918 919 if (!target) { 920 return -ENOENT; 921 } 922 923 if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) { 924 return -EFAULT; 925 } 926 927 /* 928 * Xen actually uses the GVA and does the translation through the guest 929 * page tables each time. But Linux/KVM uses the GPA, on the assumption 930 * that guests only ever use *global* addresses (kernel virtual addresses) 931 * for it. If Linux is changed to redo the GVA→GPA translation each time, 932 * it will offer a new vCPU attribute for that, and we'll use it instead. 933 */ 934 if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) || 935 len < sizeof(struct vcpu_time_info)) { 936 return -EFAULT; 937 } 938 939 async_run_on_cpu(target, do_set_vcpu_time_info_gpa, 940 RUN_ON_CPU_HOST_ULONG(gpa)); 941 return 0; 942 } 943 944 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target, 945 uint64_t arg) 946 { 947 struct vcpu_register_runstate_memory_area rma; 948 uint64_t gpa; 949 size_t len; 950 951 /* No need for 32/64 compat handling */ 952 qemu_build_assert(sizeof(rma) == 8); 953 /* The runstate area actually does change size, but Linux copes. */ 954 955 if (!target) { 956 return -ENOENT; 957 } 958 959 if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) { 960 return -EFAULT; 961 } 962 963 /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. 
*/ 964 if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) { 965 return -EFAULT; 966 } 967 968 async_run_on_cpu(target, do_set_vcpu_runstate_gpa, 969 RUN_ON_CPU_HOST_ULONG(gpa)); 970 return 0; 971 } 972 973 static uint64_t kvm_get_current_ns(void) 974 { 975 struct kvm_clock_data data; 976 int ret; 977 978 ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data); 979 if (ret < 0) { 980 fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret)); 981 abort(); 982 } 983 984 return data.clock; 985 } 986 987 static void xen_vcpu_singleshot_timer_event(void *opaque) 988 { 989 CPUState *cpu = opaque; 990 CPUX86State *env = &X86_CPU(cpu)->env; 991 uint16_t port = env->xen_virq[VIRQ_TIMER]; 992 993 if (likely(port)) { 994 xen_evtchn_set_port(port); 995 } 996 997 qemu_mutex_lock(&env->xen_timers_lock); 998 env->xen_singleshot_timer_ns = 0; 999 qemu_mutex_unlock(&env->xen_timers_lock); 1000 } 1001 1002 static void xen_vcpu_periodic_timer_event(void *opaque) 1003 { 1004 CPUState *cpu = opaque; 1005 CPUX86State *env = &X86_CPU(cpu)->env; 1006 uint16_t port = env->xen_virq[VIRQ_TIMER]; 1007 int64_t qemu_now; 1008 1009 if (likely(port)) { 1010 xen_evtchn_set_port(port); 1011 } 1012 1013 qemu_mutex_lock(&env->xen_timers_lock); 1014 1015 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1016 timer_mod_ns(env->xen_periodic_timer, 1017 qemu_now + env->xen_periodic_timer_period); 1018 1019 qemu_mutex_unlock(&env->xen_timers_lock); 1020 } 1021 1022 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns) 1023 { 1024 CPUX86State *tenv = &X86_CPU(target)->env; 1025 int64_t qemu_now; 1026 1027 timer_del(tenv->xen_periodic_timer); 1028 1029 qemu_mutex_lock(&tenv->xen_timers_lock); 1030 1031 qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1032 timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns); 1033 tenv->xen_periodic_timer_period = period_ns; 1034 1035 qemu_mutex_unlock(&tenv->xen_timers_lock); 1036 return 0; 1037 } 1038 1039 #define MILLISECS(_ms) ((int64_t)((_ms) * 1000000ULL)) 1040 #define MICROSECS(_us) ((int64_t)((_us) * 1000ULL)) 1041 #define STIME_MAX ((time_t)((int64_t)~0ull >> 1)) 1042 /* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */ 1043 #define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2)) 1044 1045 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target, 1046 uint64_t arg) 1047 { 1048 struct vcpu_set_periodic_timer spt; 1049 1050 qemu_build_assert(sizeof(spt) == 8); 1051 if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) { 1052 return -EFAULT; 1053 } 1054 1055 if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) { 1056 return -EINVAL; 1057 } 1058 1059 return do_set_periodic_timer(target, spt.period_ns); 1060 } 1061 1062 static int vcpuop_stop_periodic_timer(CPUState *target) 1063 { 1064 CPUX86State *tenv = &X86_CPU(target)->env; 1065 1066 qemu_mutex_lock(&tenv->xen_timers_lock); 1067 1068 timer_del(tenv->xen_periodic_timer); 1069 tenv->xen_periodic_timer_period = 0; 1070 1071 qemu_mutex_unlock(&tenv->xen_timers_lock); 1072 return 0; 1073 } 1074 1075 /* 1076 * Userspace handling of timer, for older kernels. 1077 * Must always be called with xen_timers_lock held. 
1078 */ 1079 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs, 1080 bool future, bool linux_wa) 1081 { 1082 CPUX86State *env = &X86_CPU(cs)->env; 1083 int64_t now = kvm_get_current_ns(); 1084 int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); 1085 int64_t delta = timeout_abs - now; 1086 1087 if (future && timeout_abs < now) { 1088 return -ETIME; 1089 } 1090 1091 if (linux_wa && unlikely((int64_t)timeout_abs < 0 || 1092 (delta > 0 && (uint32_t)(delta >> 50) != 0))) { 1093 /* 1094 * Xen has a 'Linux workaround' in do_set_timer_op() which checks 1095 * for negative absolute timeout values (caused by integer 1096 * overflow), and for values about 13 days in the future (2^50ns) 1097 * which would be caused by jiffies overflow. For those cases, it 1098 * sets the timeout 100ms in the future (not *too* soon, since if 1099 * a guest really did set a long timeout on purpose we don't want 1100 * to keep churning CPU time by waking it up). 1101 */ 1102 delta = (100 * SCALE_MS); 1103 timeout_abs = now + delta; 1104 } 1105 1106 timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta); 1107 env->xen_singleshot_timer_ns = now + delta; 1108 return 0; 1109 } 1110 1111 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg) 1112 { 1113 struct vcpu_set_singleshot_timer sst = { 0 }; 1114 1115 /* 1116 * The struct is a uint64_t followed by a uint32_t. On 32-bit that 1117 * makes it 12 bytes. On 64-bit it gets padded to 16. The parts 1118 * that get used are identical, and there's four bytes of padding 1119 * unused at the end. For true Xen compatibility we should attempt 1120 * to copy the full 16 bytes from 64-bit guests, and return -EFAULT 1121 * if we can't get the padding too. But that's daft. Just copy what 1122 * we need. 1123 */ 1124 qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8); 1125 qemu_build_assert(sizeof(sst) >= 12); 1126 1127 if (kvm_copy_from_gva(cs, arg, &sst, 12)) { 1128 return -EFAULT; 1129 } 1130 1131 QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock); 1132 return do_set_singleshot_timer(cs, sst.timeout_abs_ns, 1133 !!(sst.flags & VCPU_SSHOTTMR_future), 1134 false); 1135 } 1136 1137 static int vcpuop_stop_singleshot_timer(CPUState *cs) 1138 { 1139 CPUX86State *env = &X86_CPU(cs)->env; 1140 1141 qemu_mutex_lock(&env->xen_timers_lock); 1142 1143 timer_del(env->xen_singleshot_timer); 1144 env->xen_singleshot_timer_ns = 0; 1145 1146 qemu_mutex_unlock(&env->xen_timers_lock); 1147 return 0; 1148 } 1149 1150 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1151 uint64_t timeout) 1152 { 1153 int err; 1154 1155 if (unlikely(timeout == 0)) { 1156 err = vcpuop_stop_singleshot_timer(CPU(cpu)); 1157 } else { 1158 QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock); 1159 err = do_set_singleshot_timer(CPU(cpu), timeout, false, true); 1160 } 1161 exit->u.hcall.result = err; 1162 return true; 1163 } 1164 1165 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1166 int cmd, int vcpu_id, uint64_t arg) 1167 { 1168 CPUState *cs = CPU(cpu); 1169 CPUState *dest = cs->cpu_index == vcpu_id ? 
cs : qemu_get_cpu(vcpu_id); 1170 int err; 1171 1172 if (!dest) { 1173 err = -ENOENT; 1174 goto out; 1175 } 1176 1177 switch (cmd) { 1178 case VCPUOP_register_runstate_memory_area: 1179 err = vcpuop_register_runstate_info(cs, dest, arg); 1180 break; 1181 case VCPUOP_register_vcpu_time_memory_area: 1182 err = vcpuop_register_vcpu_time_info(cs, dest, arg); 1183 break; 1184 case VCPUOP_register_vcpu_info: 1185 err = vcpuop_register_vcpu_info(cs, dest, arg); 1186 break; 1187 case VCPUOP_set_singleshot_timer: { 1188 if (cs->cpu_index == vcpu_id) { 1189 err = vcpuop_set_singleshot_timer(dest, arg); 1190 } else { 1191 err = -EINVAL; 1192 } 1193 break; 1194 } 1195 case VCPUOP_stop_singleshot_timer: 1196 if (cs->cpu_index == vcpu_id) { 1197 err = vcpuop_stop_singleshot_timer(dest); 1198 } else { 1199 err = -EINVAL; 1200 } 1201 break; 1202 case VCPUOP_set_periodic_timer: { 1203 err = vcpuop_set_periodic_timer(cs, dest, arg); 1204 break; 1205 } 1206 case VCPUOP_stop_periodic_timer: 1207 err = vcpuop_stop_periodic_timer(dest); 1208 break; 1209 1210 default: 1211 return false; 1212 } 1213 1214 out: 1215 exit->u.hcall.result = err; 1216 return true; 1217 } 1218 1219 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu, 1220 int cmd, uint64_t arg) 1221 { 1222 CPUState *cs = CPU(cpu); 1223 int err = -ENOSYS; 1224 1225 switch (cmd) { 1226 case EVTCHNOP_init_control: 1227 case EVTCHNOP_expand_array: 1228 case EVTCHNOP_set_priority: 1229 /* We do not support FIFO channels at this point */ 1230 err = -ENOSYS; 1231 break; 1232 1233 case EVTCHNOP_status: { 1234 struct evtchn_status status; 1235 1236 qemu_build_assert(sizeof(status) == 24); 1237 if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) { 1238 err = -EFAULT; 1239 break; 1240 } 1241 1242 err = xen_evtchn_status_op(&status); 1243 if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) { 1244 err = -EFAULT; 1245 } 1246 break; 1247 } 1248 case EVTCHNOP_close: { 1249 struct evtchn_close close; 1250 1251 qemu_build_assert(sizeof(close) == 4); 1252 if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) { 1253 err = -EFAULT; 1254 break; 1255 } 1256 1257 err = xen_evtchn_close_op(&close); 1258 break; 1259 } 1260 case EVTCHNOP_unmask: { 1261 struct evtchn_unmask unmask; 1262 1263 qemu_build_assert(sizeof(unmask) == 4); 1264 if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) { 1265 err = -EFAULT; 1266 break; 1267 } 1268 1269 err = xen_evtchn_unmask_op(&unmask); 1270 break; 1271 } 1272 case EVTCHNOP_bind_virq: { 1273 struct evtchn_bind_virq virq; 1274 1275 qemu_build_assert(sizeof(virq) == 12); 1276 if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) { 1277 err = -EFAULT; 1278 break; 1279 } 1280 1281 err = xen_evtchn_bind_virq_op(&virq); 1282 if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) { 1283 err = -EFAULT; 1284 } 1285 break; 1286 } 1287 case EVTCHNOP_bind_pirq: { 1288 struct evtchn_bind_pirq pirq; 1289 1290 qemu_build_assert(sizeof(pirq) == 12); 1291 if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) { 1292 err = -EFAULT; 1293 break; 1294 } 1295 1296 err = xen_evtchn_bind_pirq_op(&pirq); 1297 if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) { 1298 err = -EFAULT; 1299 } 1300 break; 1301 } 1302 case EVTCHNOP_bind_ipi: { 1303 struct evtchn_bind_ipi ipi; 1304 1305 qemu_build_assert(sizeof(ipi) == 8); 1306 if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) { 1307 err = -EFAULT; 1308 break; 1309 } 1310 1311 err = xen_evtchn_bind_ipi_op(&ipi); 1312 if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) 
        {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_send: {
        struct evtchn_send send;

        qemu_build_assert(sizeof(send) == 4);
        if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_send_op(&send);
        break;
    }
    case EVTCHNOP_alloc_unbound: {
        struct evtchn_alloc_unbound alloc;

        qemu_build_assert(sizeof(alloc) == 8);
        if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_alloc_unbound_op(&alloc);
        if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_interdomain: {
        struct evtchn_bind_interdomain interdomain;

        qemu_build_assert(sizeof(interdomain) == 12);
        if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_interdomain_op(&interdomain);
        if (!err &&
            kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
            err = -EFAULT;
        }
        break;
    }
    case EVTCHNOP_bind_vcpu: {
        struct evtchn_bind_vcpu vcpu;

        qemu_build_assert(sizeof(vcpu) == 8);
        if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_bind_vcpu_op(&vcpu);
        break;
    }
    case EVTCHNOP_reset: {
        struct evtchn_reset reset;

        qemu_build_assert(sizeof(reset) == 2);
        if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
            err = -EFAULT;
            break;
        }

        err = xen_evtchn_reset_op(&reset);
        break;
    }
    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

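/*
 * Perform a Xen "soft reset": tear down all event channels, the callback
 * IRQ parameter, the per-vCPU state, the shared_info overlay, the grant
 * tables and the xenstore state, leaving the guest free to start over
 * (e.g. on SHUTDOWN_soft_reset). Must be called with the iothread lock
 * held.
 */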
int kvm_xen_soft_reset(void)
{
    CPUState *cpu;
    int err;

    assert(qemu_mutex_iothread_locked());

    trace_kvm_xen_soft_reset();

    err = xen_evtchn_soft_reset();
    if (err) {
        return err;
    }

    /*
     * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
     * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses
     * to deliver to the timer interrupt and treats that as 'disabled'.
     */
    err = xen_evtchn_set_callback_param(0);
    if (err) {
        return err;
    }

    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
    }

    err = xen_overlay_map_shinfo_page(INVALID_GFN);
    if (err) {
        return err;
    }

    err = xen_gnttab_reset();
    if (err) {
        return err;
    }

    err = xen_xenstore_reset();
    if (err) {
        return err;
    }

    return 0;
}

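/*
 * SCHEDOP_shutdown: map the Xen shutdown reason onto the corresponding
 * QEMU action. A crash is reported as a guest panic, reboot and poweroff
 * become the usual system reset/shutdown requests, and soft_reset tears
 * down the Xen state via kvm_xen_soft_reset() without resetting the VM.
 */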
static int schedop_shutdown(CPUState *cs, uint64_t arg)
{
    struct sched_shutdown shutdown;
    int ret = 0;

    /* No need for 32/64 compat handling */
    qemu_build_assert(sizeof(shutdown) == 4);

    if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
        return -EFAULT;
    }

    switch (shutdown.reason) {
    case SHUTDOWN_crash:
        cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
        qemu_system_guest_panicked(NULL);
        break;

    case SHUTDOWN_reboot:
        qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
        break;

    case SHUTDOWN_poweroff:
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
        break;

    case SHUTDOWN_soft_reset:
        qemu_mutex_lock_iothread();
        ret = kvm_xen_soft_reset();
        qemu_mutex_unlock_iothread();
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}

static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                   int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err = -ENOSYS;

    switch (cmd) {
    case SCHEDOP_shutdown:
        err = schedop_shutdown(cs, arg);
        break;

    case SCHEDOP_poll:
        /*
         * Linux will panic if this doesn't work. Just yield; it's not
         * worth overthinking it because with event channel handling
         * in KVM, the kernel will intercept this and it will never
         * reach QEMU anyway. The semantics of the hypercall explicitly
         * permit spurious wakeups.
         */
    case SCHEDOP_yield:
        sched_yield();
        err = 0;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                    int cmd, uint64_t arg, int count)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case GNTTABOP_set_version: {
        struct gnttab_set_version set;

        qemu_build_assert(sizeof(set) == 4);
        if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_set_version_op(&set);
        if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_get_version: {
        struct gnttab_get_version get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_get_version_op(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_query_size: {
        struct gnttab_query_size size;

        qemu_build_assert(sizeof(size) == 16);
        if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
            break;
        }

        err = xen_gnttab_query_size_op(&size);
        if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
            err = -EFAULT;
        }
        break;
    }
    case GNTTABOP_setup_table:
    case GNTTABOP_copy:
    case GNTTABOP_map_grant_ref:
    case GNTTABOP_unmap_grant_ref:
    case GNTTABOP_swap_grant_ref:
        return false;

    default:
        /* Xen explicitly returns -ENOSYS to HVM guests for all others */
        err = -ENOSYS;
        break;
    }

    exit->u.hcall.result = err;
    return true;
}

static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
                                     int cmd, uint64_t arg)
{
    CPUState *cs = CPU(cpu);
    int err;

    switch (cmd) {
    case PHYSDEVOP_map_pirq: {
        struct physdev_map_pirq map;

        if (hypercall_compat32(exit->u.hcall.longmode)) {
            struct compat_physdev_map_pirq *map32 = (void *)&map;

            if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
                err = -EFAULT;
                break;
            }

            /*
             * The only thing that's different is the alignment of the
             * uint64_t table_base at the end, which gets padding to make
             * it 64-bit aligned in the 64-bit version.
             */
            qemu_build_assert(sizeof(*map32) == 36);
            qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
                              offsetof(struct compat_physdev_map_pirq, entry_nr));
            memmove(&map.table_base, &map32->table_base,
                    sizeof(map.table_base));
        } else {
            if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
                err = -EFAULT;
                break;
            }
        }
        err = xen_physdev_map_pirq(&map);
        /*
         * Since table_base is an IN parameter and won't be changed, just
         * copy the size of the compat structure back to the guest.
         */
        if (!err && kvm_copy_to_gva(cs, arg, &map,
                                    sizeof(struct compat_physdev_map_pirq))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_unmap_pirq: {
        struct physdev_unmap_pirq unmap;

        qemu_build_assert(sizeof(unmap) == 8);
        if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_unmap_pirq(&unmap);
        if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_eoi: {
        struct physdev_eoi eoi;

        qemu_build_assert(sizeof(eoi) == 4);
        if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_eoi_pirq(&eoi);
        if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_irq_status_query: {
        struct physdev_irq_status_query query;

        qemu_build_assert(sizeof(query) == 8);
        if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_query_pirq(&query);
        if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_get_free_pirq: {
        struct physdev_get_free_pirq get;

        qemu_build_assert(sizeof(get) == 8);
        if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
            break;
        }

        err = xen_physdev_get_free_pirq(&get);
        if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
            err = -EFAULT;
        }
        break;
    }
    case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
        err = -ENOSYS;
        break;

    default:
        return false;
    }

    exit->u.hcall.result = err;
    return true;
}

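/*
 * Top-level hypercall dispatch. Each handler returns true if it consumed
 * the hypercall (having set exit->u.hcall.result); returning false makes
 * kvm_xen_handle_exit() report the call as unimplemented with -ENOSYS.
 */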
static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    uint16_t code = exit->u.hcall.input;

    if (exit->u.hcall.cpl > 0) {
        exit->u.hcall.result = -EPERM;
        return true;
    }

    switch (code) {
    case __HYPERVISOR_set_timer_op:
        if (exit->u.hcall.longmode) {
            return kvm_xen_hcall_set_timer_op(exit, cpu,
                                              exit->u.hcall.params[0]);
        } else {
            /* In 32-bit mode, the 64-bit timer value is in two args. */
            uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
                           (uint32_t)exit->u.hcall.params[0];
            return kvm_xen_hcall_set_timer_op(exit, cpu, val);
        }
    case __HYPERVISOR_grant_table_op:
        return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1],
                                       exit->u.hcall.params[2]);
    case __HYPERVISOR_sched_op:
        return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
                                      exit->u.hcall.params[1]);
    case __HYPERVISOR_event_channel_op:
        return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_vcpu_op:
        return kvm_xen_hcall_vcpu_op(exit, cpu,
                                     exit->u.hcall.params[0],
                                     exit->u.hcall.params[1],
                                     exit->u.hcall.params[2]);
    case __HYPERVISOR_hvm_op:
        return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
                                    exit->u.hcall.params[1]);
    case __HYPERVISOR_memory_op:
        return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
                                       exit->u.hcall.params[1]);
    case __HYPERVISOR_physdev_op:
        return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
                                        exit->u.hcall.params[1]);
    case __HYPERVISOR_xen_version:
        return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
                                         exit->u.hcall.params[1]);
    default:
        return false;
    }
}

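/*
 * Handle a KVM_EXIT_XEN_HCALL exit: keep QEMU's idea of the guest's long
 * mode in sync with the kernel's, dispatch the hypercall, and log any
 * hypercall that no handler claimed before returning -ENOSYS to the guest.
 */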
int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
{
    if (exit->type != KVM_EXIT_XEN_HCALL) {
        return -1;
    }

    /*
     * The kernel latches the guest 32/64 mode when the MSR is used to fill
     * the hypercall page. So if we see a hypercall in a mode that doesn't
     * match our own idea of the guest mode, fetch the kernel's idea of the
     * "long mode" to remain in sync.
     */
    if (exit->u.hcall.longmode != xen_is_long_mode()) {
        xen_sync_long_mode();
    }

    if (!do_kvm_xen_handle_exit(cpu, exit)) {
        /*
         * Some hypercalls will be deliberately "implemented" by returning
         * -ENOSYS. This case is for hypercalls which are unexpected.
         */
        exit->u.hcall.result = -ENOSYS;
        qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
                      PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
                      (uint64_t)exit->u.hcall.input,
                      (uint64_t)exit->u.hcall.params[0],
                      (uint64_t)exit->u.hcall.params[1],
                      (uint64_t)exit->u.hcall.params[2]);
    }

    trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
                            exit->u.hcall.input, exit->u.hcall.params[0],
                            exit->u.hcall.params[1], exit->u.hcall.params[2],
                            exit->u.hcall.result);
    return 0;
}

uint16_t kvm_xen_get_gnttab_max_frames(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_gnttab_max_frames;
}

uint16_t kvm_xen_get_evtchn_max_pirq(void)
{
    KVMState *s = KVM_STATE(current_accel());
    return s->xen_evtchn_max_pirq;
}

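/*
 * Write the per-vCPU Xen state held in CPUX86State back into KVM: the
 * vcpu_info, time info and runstate addresses, the periodic timer and,
 * when the kernel has EVTCHN_SEND support, the upcall vector and the
 * singleshot timer (otherwise the singleshot timer runs in QEMU).
 */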
int kvm_put_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }

    if (gpa != INVALID_GPA) {
        ret = set_vcpu_info(cs, gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_time_info_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    gpa = env->xen_vcpu_runstate_gpa;
    if (gpa != INVALID_GPA) {
        ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
                                    gpa);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_periodic_timer_period) {
        ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
        if (ret < 0) {
            return ret;
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        /*
         * If the kernel has EVTCHN_SEND support then it handles timers too,
         * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
         */
        QEMU_LOCK_GUARD(&env->xen_timers_lock);
        if (env->xen_singleshot_timer_ns) {
            ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
                                          false, false);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }

    if (env->xen_vcpu_callback_vector) {
        ret = kvm_xen_set_vcpu_callback_vector(cs);
        if (ret < 0) {
            return ret;
        }
    }

    if (env->xen_virq[VIRQ_TIMER]) {
        do_set_vcpu_timer_virq(cs,
                               RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
    }
    return 0;
}

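/*
 * Read back the state that the kernel owns: mark the vcpu_info page dirty
 * (the kernel doesn't do so when delivering events to it) and, if timers
 * are accelerated in KVM, fetch the current singleshot timer deadline so
 * that it can be migrated.
 */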
int kvm_get_xen_state(CPUState *cs)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    uint64_t gpa;
    int ret;

    /*
     * The kernel does not mark vcpu_info as dirty when it delivers interrupts
     * to it. It's up to userspace to *assume* that any page shared thus is
     * always considered dirty. The shared_info page is different since it's
     * an overlay and migrated separately anyway.
     */
    gpa = env->xen_vcpu_info_gpa;
    if (gpa == INVALID_GPA) {
        gpa = env->xen_vcpu_info_default_gpa;
    }
    if (gpa != INVALID_GPA) {
        MemoryRegionSection mrs = memory_region_find(get_system_memory(),
                                                     gpa,
                                                     sizeof(struct vcpu_info));
        if (mrs.mr &&
            !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
            memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
                                    sizeof(struct vcpu_info));
        }
    }

    if (!kvm_xen_has_cap(EVTCHN_SEND)) {
        return 0;
    }

    /*
     * If the kernel is accelerating timers, read out the current value of the
     * singleshot timer deadline.
     */
    if (env->xen_virq[VIRQ_TIMER]) {
        struct kvm_xen_vcpu_attr va = {
            .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
        };
        ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
        if (ret < 0) {
            return ret;
        }

        /*
         * This locking is fairly pointless, and is here to appease Coverity.
         * There is an unavoidable race condition if a different vCPU sets a
         * timer for this vCPU after the value has been read out. But that's
         * OK in practice because *all* the vCPUs need to be stopped before
         * we set about migrating their state.
         */
        QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
        env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
    }

    return 0;
}