xref: /openbmc/qemu/target/i386/kvm/xen-emu.c (revision d2dfe0b5)
1 /*
2  * Xen HVM emulation support in KVM
3  *
4  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
5  * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/log.h"
14 #include "qemu/main-loop.h"
15 #include "qemu/error-report.h"
16 #include "hw/xen/xen.h"
17 #include "sysemu/kvm_int.h"
18 #include "sysemu/kvm_xen.h"
19 #include "kvm/kvm_i386.h"
20 #include "exec/address-spaces.h"
21 #include "xen-emu.h"
22 #include "trace.h"
23 #include "sysemu/runstate.h"
24 
25 #include "hw/pci/msi.h"
26 #include "hw/i386/apic-msidef.h"
27 #include "hw/i386/e820_memory_layout.h"
28 #include "hw/i386/kvm/xen_overlay.h"
29 #include "hw/i386/kvm/xen_evtchn.h"
30 #include "hw/i386/kvm/xen_gnttab.h"
31 #include "hw/i386/kvm/xen_xenstore.h"
32 
33 #include "hw/xen/interface/version.h"
34 #include "hw/xen/interface/sched.h"
35 #include "hw/xen/interface/memory.h"
36 #include "hw/xen/interface/hvm/hvm_op.h"
37 #include "hw/xen/interface/hvm/params.h"
38 #include "hw/xen/interface/vcpu.h"
39 #include "hw/xen/interface/event_channel.h"
40 #include "hw/xen/interface/grant_table.h"
41 
42 #include "xen-compat.h"
43 
44 static void xen_vcpu_singleshot_timer_event(void *opaque);
45 static void xen_vcpu_periodic_timer_event(void *opaque);
46 
47 #ifdef TARGET_X86_64
48 #define hypercall_compat32(longmode) (!(longmode))
49 #else
50 #define hypercall_compat32(longmode) (false)
51 #endif
52 
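/*
 * Translate a guest virtual address to a guest physical address using the
 * vCPU's KVM_TRANSLATE ioctl. If @len is non-NULL it is set to the number
 * of bytes from @gva to the end of its page, so callers can walk a range
 * one page at a time.
 */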
53 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
54                            size_t *len, bool is_write)
55 {
56     struct kvm_translation tr = {
57         .linear_address = gva,
58     };
59 
60     if (len) {
61         *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
62     }
63 
64     if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
65         (is_write && !tr.writeable)) {
66         return false;
67     }
68     *gpa = tr.physical_address;
69     return true;
70 }
71 
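/*
 * Copy between a guest virtual address range and a host buffer, translating
 * and copying at most one page per iteration.
 */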
72 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
73                       bool is_write)
74 {
75     uint8_t *buf = (uint8_t *)_buf;
76     uint64_t gpa;
77     size_t len;
78 
79     while (sz) {
80         if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
81             return -EFAULT;
82         }
83         if (len > sz) {
84             len = sz;
85         }
86 
87         cpu_physical_memory_rw(gpa, buf, len, is_write);
88 
89         buf += len;
90         sz -= len;
91         gva += len;
92     }
93 
94     return 0;
95 }
96 
97 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
98                                     size_t sz)
99 {
100     return kvm_gva_rw(cs, gva, buf, sz, false);
101 }
102 
103 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
104                                   size_t sz)
105 {
106     return kvm_gva_rw(cs, gva, buf, sz, true);
107 }
108 
109 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
110 {
111     const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
112         KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
113     struct kvm_xen_hvm_config cfg = {
114         .msr = hypercall_msr,
115         .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
116     };
117     int xen_caps, ret;
118 
119     xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
120     if (required_caps & ~xen_caps) {
121         error_report("kvm: Xen HVM guest support not present or insufficient");
122         return -ENOSYS;
123     }
124 
125     if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
126         struct kvm_xen_hvm_attr ha = {
127             .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
128             .u.xen_version = s->xen_version,
129         };
130         (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);
131 
132         cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
133     }
134 
135     ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
136     if (ret < 0) {
137         error_report("kvm: Failed to enable Xen HVM support: %s",
138                      strerror(-ret));
139         return ret;
140     }
141 
142     /* If called a second time, don't repeat the rest of the setup. */
143     if (s->xen_caps) {
144         return 0;
145     }
146 
147     /*
148      * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
149      * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
150      *
151      * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
152      * such things to be polled at precisely the right time. We *could* do
153      * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
154      * the moment the IRQ is acked, and see if it should be reasserted.
155      *
156      * But the in-kernel irqchip is deprecated, so we're unlikely to add
157      * that support in the kernel. Insist on using the split irqchip mode
158      * instead.
159      *
160      * This leaves us polling for the level going low in QEMU, which lacks
161      * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
162      * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
163      * the device (for which it has to unmap the device and trap access, for
164      * some period after an IRQ!!). In the Xen case, we do it on exit from
165      * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
166      * Which is kind of icky, but less so than the VFIO one. I may fix them
167      * both later...
168      */
169     if (!kvm_kernel_irqchip_split()) {
170         error_report("kvm: Xen support requires kernel-irqchip=split");
171         return -EINVAL;
172     }
173 
174     s->xen_caps = xen_caps;
175 
176     /* Tell fw_cfg to notify the BIOS to reserve the range. */
177     ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
178                          E820_RESERVED);
179     if (ret < 0) {
180         fprintf(stderr, "e820_add_entry() table is full\n");
181         return ret;
182     }
183 
184     /* The page couldn't be overlaid until KVM was initialized */
185     xen_xenstore_reset();
186 
187     return 0;
188 }
189 
190 int kvm_xen_init_vcpu(CPUState *cs)
191 {
192     X86CPU *cpu = X86_CPU(cs);
193     CPUX86State *env = &cpu->env;
194     int err;
195 
196     /*
197      * The kernel needs to know the Xen/ACPI vCPU ID because that's
198      * what the guest uses in hypercalls such as timers. It doesn't
199      * match the APIC ID which is generally used for talking to the
200      * kernel about vCPUs. And if vCPU threads race with creating
201      * their KVM vCPUs out of order, it doesn't necessarily match
202      * with the kernel's internal vCPU indices either.
203      */
204     if (kvm_xen_has_cap(EVTCHN_SEND)) {
205         struct kvm_xen_vcpu_attr va = {
206             .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
207             .u.vcpu_id = cs->cpu_index,
208         };
209         err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
210         if (err) {
211             error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
212                          strerror(-err));
213             return err;
214         }
215     }
216 
217     env->xen_vcpu_info_gpa = INVALID_GPA;
218     env->xen_vcpu_info_default_gpa = INVALID_GPA;
219     env->xen_vcpu_time_info_gpa = INVALID_GPA;
220     env->xen_vcpu_runstate_gpa = INVALID_GPA;
221 
222     qemu_mutex_init(&env->xen_timers_lock);
223     env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
224                                              xen_vcpu_singleshot_timer_event,
225                                              cpu);
226     if (!env->xen_singleshot_timer) {
227         return -ENOMEM;
228     }
229     env->xen_singleshot_timer->opaque = cs;
230 
231     env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
232                                            xen_vcpu_periodic_timer_event,
233                                            cpu);
234     if (!env->xen_periodic_timer) {
235         return -ENOMEM;
236     }
237     env->xen_periodic_timer->opaque = cs;
238 
239     return 0;
240 }
241 
242 uint32_t kvm_xen_get_caps(void)
243 {
244     return kvm_state->xen_caps;
245 }
246 
247 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
248                                      int cmd, uint64_t arg)
249 {
250     int err = 0;
251 
252     switch (cmd) {
253     case XENVER_get_features: {
254         struct xen_feature_info fi;
255 
256         /* No need for 32/64 compat handling */
257         qemu_build_assert(sizeof(fi) == 8);
258 
259         err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
260         if (err) {
261             break;
262         }
263 
264         fi.submap = 0;
265         if (fi.submap_idx == 0) {
266             fi.submap |= 1 << XENFEAT_writable_page_tables |
267                          1 << XENFEAT_writable_descriptor_tables |
268                          1 << XENFEAT_auto_translated_physmap |
269                          1 << XENFEAT_supervisor_mode_kernel |
270                          1 << XENFEAT_hvm_callback_vector |
271                          1 << XENFEAT_hvm_safe_pvclock |
272                          1 << XENFEAT_hvm_pirqs;
273         }
274 
275         err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
276         break;
277     }
278 
279     default:
280         return false;
281     }
282 
283     exit->u.hcall.result = err;
284     return true;
285 }
286 
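/* Set a vCPU attribute whose value is a guest physical address. */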
287 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
288 {
289     struct kvm_xen_vcpu_attr xhsi;
290 
291     xhsi.type = type;
292     xhsi.u.gpa = gpa;
293 
294     trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
295 
296     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
297 }
298 
299 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
300 {
301     uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
302     struct kvm_xen_vcpu_attr xva;
303 
304     xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
305     xva.u.vector = vector;
306 
307     trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
308 
309     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
310 }
311 
312 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
313 {
314     X86CPU *cpu = X86_CPU(cs);
315     CPUX86State *env = &cpu->env;
316 
317     env->xen_vcpu_callback_vector = data.host_int;
318 
319     if (kvm_xen_has_cap(EVTCHN_SEND)) {
320         kvm_xen_set_vcpu_callback_vector(cs);
321     }
322 }
323 
324 static int set_vcpu_info(CPUState *cs, uint64_t gpa)
325 {
326     X86CPU *cpu = X86_CPU(cs);
327     CPUX86State *env = &cpu->env;
328     MemoryRegionSection mrs = { .mr = NULL };
329     void *vcpu_info_hva = NULL;
330     int ret;
331 
332     ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
333     if (ret || gpa == INVALID_GPA) {
334         goto out;
335     }
336 
337     mrs = memory_region_find(get_system_memory(), gpa,
338                              sizeof(struct vcpu_info));
339     if (mrs.mr && mrs.mr->ram_block &&
340         !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
341         vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
342                                          mrs.offset_within_region);
343     }
344     if (!vcpu_info_hva) {
345         if (mrs.mr) {
346             memory_region_unref(mrs.mr);
347             mrs.mr = NULL;
348         }
349         ret = -EINVAL;
350     }
351 
352  out:
353     if (env->xen_vcpu_info_mr) {
354         memory_region_unref(env->xen_vcpu_info_mr);
355     }
356     env->xen_vcpu_info_hva = vcpu_info_hva;
357     env->xen_vcpu_info_mr = mrs.mr;
358     return ret;
359 }
360 
361 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
362 {
363     X86CPU *cpu = X86_CPU(cs);
364     CPUX86State *env = &cpu->env;
365 
366     env->xen_vcpu_info_default_gpa = data.host_ulong;
367 
368     /* Changing the default does nothing if a vcpu_info was explicitly set. */
369     if (env->xen_vcpu_info_gpa == INVALID_GPA) {
370         set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
371     }
372 }
373 
374 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
375 {
376     X86CPU *cpu = X86_CPU(cs);
377     CPUX86State *env = &cpu->env;
378 
379     env->xen_vcpu_info_gpa = data.host_ulong;
380 
381     set_vcpu_info(cs, env->xen_vcpu_info_gpa);
382 }
383 
384 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
385 {
386     CPUState *cs = qemu_get_cpu(vcpu_id);
387     if (!cs) {
388         return NULL;
389     }
390 
391     return X86_CPU(cs)->env.xen_vcpu_info_hva;
392 }
393 
394 void kvm_xen_maybe_deassert_callback(CPUState *cs)
395 {
396     CPUX86State *env = &X86_CPU(cs)->env;
397     struct vcpu_info *vi = env->xen_vcpu_info_hva;
398     if (!vi) {
399         return;
400     }
401 
402     /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
403     if (!vi->evtchn_upcall_pending) {
404         qemu_mutex_lock_iothread();
405         /*
406          * Check again now we have the lock, because it may have been
407          * asserted in the interim. And we don't want to take the lock
408          * every time because this is a fast path.
409          */
410         if (!vi->evtchn_upcall_pending) {
411             X86_CPU(cs)->env.xen_callback_asserted = false;
412             xen_evtchn_set_callback_level(0);
413         }
414         qemu_mutex_unlock_iothread();
415     }
416 }
417 
418 void kvm_xen_set_callback_asserted(void)
419 {
420     CPUState *cs = qemu_get_cpu(0);
421 
422     if (cs) {
423         X86_CPU(cs)->env.xen_callback_asserted = true;
424     }
425 }
426 
427 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
428 {
429     CPUState *cs = qemu_get_cpu(vcpu_id);
430     uint8_t vector;
431 
432     if (!cs) {
433         return;
434     }
435 
436     vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
437     if (vector) {
438         /*
439          * The per-vCPU callback vector is injected via the local APIC. Just
440          * deliver it as an MSI.
441          */
442         MSIMessage msg = {
443             .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
444             .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
445         };
446         kvm_irqchip_send_msi(kvm_state, msg);
447         return;
448     }
449 
450     switch (type) {
451     case HVM_PARAM_CALLBACK_TYPE_VECTOR:
452         /*
453          * If the evtchn_upcall_pending field in the vcpu_info is set, then
454          * KVM will automatically deliver the vector on entering the vCPU
455          * so all we have to do is kick it out.
456          */
457         qemu_cpu_kick(cs);
458         break;
459 
460     case HVM_PARAM_CALLBACK_TYPE_GSI:
461     case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
462         if (vcpu_id == 0) {
463             xen_evtchn_set_callback_level(1);
464         }
465         break;
466     }
467 }
468 
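/*
 * Push the current timer VIRQ port and any pending singleshot expiry into
 * KVM, which handles the Xen timer itself when it has EVTCHN_SEND support.
 */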
469 static int kvm_xen_set_vcpu_timer(CPUState *cs)
470 {
471     X86CPU *cpu = X86_CPU(cs);
472     CPUX86State *env = &cpu->env;
473 
474     struct kvm_xen_vcpu_attr va = {
475         .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
476         .u.timer.port = env->xen_virq[VIRQ_TIMER],
477         .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
478         .u.timer.expires_ns = env->xen_singleshot_timer_ns,
479     };
480 
481     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
482 }
483 
484 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
485 {
486     kvm_xen_set_vcpu_timer(cs);
487 }
488 
489 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
490 {
491     CPUState *cs = qemu_get_cpu(vcpu_id);
492 
493     if (!cs) {
494         return -ENOENT;
495     }
496 
497     /* cpu.h doesn't include the actual Xen header. */
498     qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);
499 
500     if (virq >= NR_VIRQS) {
501         return -EINVAL;
502     }
503 
504     if (port && X86_CPU(cs)->env.xen_virq[virq]) {
505         return -EEXIST;
506     }
507 
508     X86_CPU(cs)->env.xen_virq[virq] = port;
509     if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
510         async_run_on_cpu(cs, do_set_vcpu_timer_virq,
511                          RUN_ON_CPU_HOST_INT(port));
512     }
513     return 0;
514 }
515 
516 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
517 {
518     X86CPU *cpu = X86_CPU(cs);
519     CPUX86State *env = &cpu->env;
520 
521     env->xen_vcpu_time_info_gpa = data.host_ulong;
522 
523     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
524                           env->xen_vcpu_time_info_gpa);
525 }
526 
527 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
528 {
529     X86CPU *cpu = X86_CPU(cs);
530     CPUX86State *env = &cpu->env;
531 
532     env->xen_vcpu_runstate_gpa = data.host_ulong;
533 
534     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
535                           env->xen_vcpu_runstate_gpa);
536 }
537 
538 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
539 {
540     X86CPU *cpu = X86_CPU(cs);
541     CPUX86State *env = &cpu->env;
542 
543     env->xen_vcpu_info_gpa = INVALID_GPA;
544     env->xen_vcpu_info_default_gpa = INVALID_GPA;
545     env->xen_vcpu_time_info_gpa = INVALID_GPA;
546     env->xen_vcpu_runstate_gpa = INVALID_GPA;
547     env->xen_vcpu_callback_vector = 0;
548     env->xen_singleshot_timer_ns = 0;
549     memset(env->xen_virq, 0, sizeof(env->xen_virq));
550 
551     set_vcpu_info(cs, INVALID_GPA);
552     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
553                           INVALID_GPA);
554     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
555                           INVALID_GPA);
556     if (kvm_xen_has_cap(EVTCHN_SEND)) {
557         kvm_xen_set_vcpu_callback_vector(cs);
558         kvm_xen_set_vcpu_timer(cs);
559     }
560 
561 }
562 
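/*
 * Map the shared_info page at the given guest frame and point each of the
 * first XEN_LEGACY_MAX_VCPUS vCPUs at the default vcpu_info slot embedded
 * within it.
 */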
563 static int xen_set_shared_info(uint64_t gfn)
564 {
565     uint64_t gpa = gfn << TARGET_PAGE_BITS;
566     int i, err;
567 
568     QEMU_IOTHREAD_LOCK_GUARD();
569 
570     /*
571      * The xen_overlay device tells KVM about it too, since it had to
572      * do that on migration load anyway (unless we're going to jump
573      * through lots of hoops to maintain the fiction that this isn't
574      * KVM-specific).
575      */
576     err = xen_overlay_map_shinfo_page(gpa);
577     if (err) {
578             return err;
579         return err;
580 
581     trace_kvm_xen_set_shared_info(gfn);
582 
583     for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
584         CPUState *cpu = qemu_get_cpu(i);
585         if (cpu) {
586             async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
587                              RUN_ON_CPU_HOST_ULONG(gpa));
588         }
589         gpa += sizeof(vcpu_info_t);
590     }
591 
592     return err;
593 }
594 
595 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
596 {
597     switch (space) {
598     case XENMAPSPACE_shared_info:
599         if (idx > 0) {
600             return -EINVAL;
601         }
602         return xen_set_shared_info(gfn);
603 
604     case XENMAPSPACE_grant_table:
605         return xen_gnttab_map_page(idx, gfn);
606 
607     case XENMAPSPACE_gmfn:
608     case XENMAPSPACE_gmfn_range:
609         return -ENOTSUP;
610 
611     case XENMAPSPACE_gmfn_foreign:
612     case XENMAPSPACE_dev_mmio:
613         return -EPERM;
614 
615     default:
616         return -EINVAL;
617     }
618 }
619 
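/*
 * XENMEM_add_to_physmap: copy in the (possibly 32-bit compat) argument
 * structure and add a single page to the requested mapping space.
 */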
620 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
621                              uint64_t arg)
622 {
623     struct xen_add_to_physmap xatp;
624     CPUState *cs = CPU(cpu);
625 
626     if (hypercall_compat32(exit->u.hcall.longmode)) {
627         struct compat_xen_add_to_physmap xatp32;
628 
629         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
630         if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
631             return -EFAULT;
632         }
633         xatp.domid = xatp32.domid;
634         xatp.size = xatp32.size;
635         xatp.space = xatp32.space;
636         xatp.idx = xatp32.idx;
637         xatp.gpfn = xatp32.gpfn;
638     } else {
639         if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
640             return -EFAULT;
641         }
642     }
643 
644     if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
645         return -ESRCH;
646     }
647 
648     return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
649 }
650 
651 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
652                                    uint64_t arg)
653 {
654     struct xen_add_to_physmap_batch xatpb;
655     unsigned long idxs_gva, gpfns_gva, errs_gva;
656     CPUState *cs = CPU(cpu);
657     size_t op_sz;
658 
659     if (hypercall_compat32(exit->u.hcall.longmode)) {
660         struct compat_xen_add_to_physmap_batch xatpb32;
661 
662         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
663         if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
664             return -EFAULT;
665         }
666         xatpb.domid = xatpb32.domid;
667         xatpb.space = xatpb32.space;
668         xatpb.size = xatpb32.size;
669 
670         idxs_gva = xatpb32.idxs.c;
671         gpfns_gva = xatpb32.gpfns.c;
672         errs_gva = xatpb32.errs.c;
673         op_sz = sizeof(uint32_t);
674     } else {
675         if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
676             return -EFAULT;
677         }
678         op_sz = sizeof(unsigned long);
679         idxs_gva = (unsigned long)xatpb.idxs.p;
680         gpfns_gva = (unsigned long)xatpb.gpfns.p;
681         errs_gva = (unsigned long)xatpb.errs.p;
682     }
683 
684     if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
685         return -ESRCH;
686     }
687 
688     /* Explicitly invalid for the batch op. Not that we implement it anyway. */
689     if (xatpb.space == XENMAPSPACE_gmfn_range) {
690         return -EINVAL;
691     }
692 
693     while (xatpb.size--) {
694         unsigned long idx = 0;
695         unsigned long gpfn = 0;
696         int err;
697 
698         /* For 32-bit compat this only copies the low 32 bits of each */
699         if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
700             kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
701             return -EFAULT;
702         }
703         idxs_gva += op_sz;
704         gpfns_gva += op_sz;
705 
706         err = add_to_physmap_one(xatpb.space, idx, gpfn);
707 
708         if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
709             return -EFAULT;
710         }
711         errs_gva += sizeof(err);
712     }
713     return 0;
714 }
715 
716 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
717                                    int cmd, uint64_t arg)
718 {
719     int err;
720 
721     switch (cmd) {
722     case XENMEM_add_to_physmap:
723         err = do_add_to_physmap(exit, cpu, arg);
724         break;
725 
726     case XENMEM_add_to_physmap_batch:
727         err = do_add_to_physmap_batch(exit, cpu, arg);
728         break;
729 
730     default:
731         return false;
732     }
733 
734     exit->u.hcall.result = err;
735     return true;
736 }
737 
738 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
739                              uint64_t arg)
740 {
741     CPUState *cs = CPU(cpu);
742     struct xen_hvm_param hp;
743     int err = 0;
744 
745     /* No need for 32/64 compat handling */
746     qemu_build_assert(sizeof(hp) == 16);
747 
748     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
749         err = -EFAULT;
750         goto out;
751     }
752 
753     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
754         err = -ESRCH;
755         goto out;
756     }
757 
758     switch (hp.index) {
759     case HVM_PARAM_CALLBACK_IRQ:
760         qemu_mutex_lock_iothread();
761         err = xen_evtchn_set_callback_param(hp.value);
762         qemu_mutex_unlock_iothread();
763         xen_set_long_mode(exit->u.hcall.longmode);
764         break;
765     default:
766         return false;
767     }
768 
769 out:
770     exit->u.hcall.result = err;
771     return true;
772 }
773 
774 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
775                              uint64_t arg)
776 {
777     CPUState *cs = CPU(cpu);
778     struct xen_hvm_param hp;
779     int err = 0;
780 
781     /* No need for 32/64 compat handling */
782     qemu_build_assert(sizeof(hp) == 16);
783 
784     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
785         err = -EFAULT;
786         goto out;
787     }
788 
789     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
790         err = -ESRCH;
791         goto out;
792     }
793 
794     switch (hp.index) {
795     case HVM_PARAM_STORE_PFN:
796         hp.value = XEN_SPECIAL_PFN(XENSTORE);
797         break;
798     case HVM_PARAM_STORE_EVTCHN:
799         hp.value = xen_xenstore_get_port();
800         break;
801     default:
802         return false;
803     }
804 
805     if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
806         err = -EFAULT;
807     }
808 out:
809     exit->u.hcall.result = err;
810     return true;
811 }
812 
813 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
814                                               X86CPU *cpu, uint64_t arg)
815 {
816     struct xen_hvm_evtchn_upcall_vector up;
817     CPUState *target_cs;
818 
819     /* No need for 32/64 compat handling */
820     qemu_build_assert(sizeof(up) == 8);
821 
822     if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
823         return -EFAULT;
824     }
825 
826     if (up.vector < 0x10) {
827         return -EINVAL;
828     }
829 
830     target_cs = qemu_get_cpu(up.vcpu);
831     if (!target_cs) {
832         return -EINVAL;
833     }
834 
835     async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
836                      RUN_ON_CPU_HOST_INT(up.vector));
837     return 0;
838 }
839 
840 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
841                                  int cmd, uint64_t arg)
842 {
843     int ret = -ENOSYS;
844     switch (cmd) {
845     case HVMOP_set_evtchn_upcall_vector:
846         ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
847                                                  exit->u.hcall.params[0]);
848         break;
849 
850     case HVMOP_pagetable_dying:
851         ret = -ENOSYS;
852         break;
853 
854     case HVMOP_set_param:
855         return handle_set_param(exit, cpu, arg);
856 
857     case HVMOP_get_param:
858         return handle_get_param(exit, cpu, arg);
859 
860     default:
861         return false;
862     }
863 
864     exit->u.hcall.result = ret;
865     return true;
866 }
867 
868 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
869                                      uint64_t arg)
870 {
871     struct vcpu_register_vcpu_info rvi;
872     uint64_t gpa;
873 
874     /* No need for 32/64 compat handling */
875     qemu_build_assert(sizeof(rvi) == 16);
876     qemu_build_assert(sizeof(struct vcpu_info) == 64);
877 
878     if (!target) {
879         return -ENOENT;
880     }
881 
882     if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
883         return -EFAULT;
884     }
885 
886     if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
887         return -EINVAL;
888     }
889 
890     gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
891     async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
892     return 0;
893 }
894 
895 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
896                                           uint64_t arg)
897 {
898     struct vcpu_register_time_memory_area tma;
899     uint64_t gpa;
900     size_t len;
901 
902     /* No need for 32/64 compat handling */
903     qemu_build_assert(sizeof(tma) == 8);
904     qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
905 
906     if (!target) {
907         return -ENOENT;
908     }
909 
910     if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
911         return -EFAULT;
912     }
913 
914     /*
915      * Xen actually uses the GVA and does the translation through the guest
916      * page tables each time. But Linux/KVM uses the GPA, on the assumption
917      * that guests only ever use *global* addresses (kernel virtual addresses)
918      * for it. If Linux is changed to redo the GVA→GPA translation each time,
919      * it will offer a new vCPU attribute for that, and we'll use it instead.
920      */
921     if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
922         len < sizeof(struct vcpu_time_info)) {
923         return -EFAULT;
924     }
925 
926     async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
927                      RUN_ON_CPU_HOST_ULONG(gpa));
928     return 0;
929 }
930 
931 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
932                                          uint64_t arg)
933 {
934     struct vcpu_register_runstate_memory_area rma;
935     uint64_t gpa;
936     size_t len;
937 
938     /* No need for 32/64 compat handling */
939     qemu_build_assert(sizeof(rma) == 8);
940     /* The runstate area actually does change size, but Linux copes. */
941 
942     if (!target) {
943         return -ENOENT;
944     }
945 
946     if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
947         return -EFAULT;
948     }
949 
950     /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
951     if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
952         return -EFAULT;
953     }
954 
955     async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
956                      RUN_ON_CPU_HOST_ULONG(gpa));
957     return 0;
958 }
959 
960 static uint64_t kvm_get_current_ns(void)
961 {
962     struct kvm_clock_data data;
963     int ret;
964 
965     ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
966     if (ret < 0) {
967         fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(-ret));
968         abort();
969     }
970 
971     return data.clock;
972 }
973 
974 static void xen_vcpu_singleshot_timer_event(void *opaque)
975 {
976     CPUState *cpu = opaque;
977     CPUX86State *env = &X86_CPU(cpu)->env;
978     uint16_t port = env->xen_virq[VIRQ_TIMER];
979 
980     if (likely(port)) {
981         xen_evtchn_set_port(port);
982     }
983 
984     qemu_mutex_lock(&env->xen_timers_lock);
985     env->xen_singleshot_timer_ns = 0;
986     qemu_mutex_unlock(&env->xen_timers_lock);
987 }
988 
989 static void xen_vcpu_periodic_timer_event(void *opaque)
990 {
991     CPUState *cpu = opaque;
992     CPUX86State *env = &X86_CPU(cpu)->env;
993     uint16_t port = env->xen_virq[VIRQ_TIMER];
994     int64_t qemu_now;
995 
996     if (likely(port)) {
997         xen_evtchn_set_port(port);
998     }
999 
1000     qemu_mutex_lock(&env->xen_timers_lock);
1001 
1002     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1003     timer_mod_ns(env->xen_periodic_timer,
1004                  qemu_now + env->xen_periodic_timer_period);
1005 
1006     qemu_mutex_unlock(&env->xen_timers_lock);
1007 }
1008 
1009 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
1010 {
1011     CPUX86State *tenv = &X86_CPU(target)->env;
1012     int64_t qemu_now;
1013 
1014     timer_del(tenv->xen_periodic_timer);
1015 
1016     qemu_mutex_lock(&tenv->xen_timers_lock);
1017 
1018     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1019     timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
1020     tenv->xen_periodic_timer_period = period_ns;
1021 
1022     qemu_mutex_unlock(&tenv->xen_timers_lock);
1023     return 0;
1024 }
1025 
1026 #define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
1027 #define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
1028 #define STIME_MAX ((time_t)((int64_t)~0ull >> 1))
1029 /* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
1030 #define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
1031 
1032 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
1033                                      uint64_t arg)
1034 {
1035     struct vcpu_set_periodic_timer spt;
1036 
1037     qemu_build_assert(sizeof(spt) == 8);
1038     if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
1039         return -EFAULT;
1040     }
1041 
1042     if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
1043         return -EINVAL;
1044     }
1045 
1046     return do_set_periodic_timer(target, spt.period_ns);
1047 }
1048 
1049 static int vcpuop_stop_periodic_timer(CPUState *target)
1050 {
1051     CPUX86State *tenv = &X86_CPU(target)->env;
1052 
1053     qemu_mutex_lock(&tenv->xen_timers_lock);
1054 
1055     timer_del(tenv->xen_periodic_timer);
1056     tenv->xen_periodic_timer_period = 0;
1057 
1058     qemu_mutex_unlock(&tenv->xen_timers_lock);
1059     return 0;
1060 }
1061 
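/*
 * Arm the singleshot timer for an absolute deadline on the Xen (KVM) clock,
 * converting it to a QEMU_CLOCK_VIRTUAL expiry. @future rejects deadlines in
 * the past; @linux_wa applies Xen's workaround for overflowed Linux timeouts.
 */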
1062 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
1063                                    bool future, bool linux_wa)
1064 {
1065     CPUX86State *env = &X86_CPU(cs)->env;
1066     int64_t now = kvm_get_current_ns();
1067     int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1068     int64_t delta = timeout_abs - now;
1069 
1070     if (future && timeout_abs < now) {
1071         return -ETIME;
1072     }
1073 
1074     if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
1075                              (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
1076         /*
1077          * Xen has a 'Linux workaround' in do_set_timer_op() which checks
1078          * for negative absolute timeout values (caused by integer
1079          * overflow), and for values about 13 days in the future (2^50ns)
1080          * which would be caused by jiffies overflow. For those cases, it
1081          * sets the timeout 100ms in the future (not *too* soon, since if
1082          * a guest really did set a long timeout on purpose we don't want
1083          * to keep churning CPU time by waking it up).
1084          */
1085         delta = (100 * SCALE_MS);
1086         timeout_abs = now + delta;
1087     }
1088 
1089     qemu_mutex_lock(&env->xen_timers_lock);
1090 
1091     timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
1092     env->xen_singleshot_timer_ns = now + delta;
1093 
1094     qemu_mutex_unlock(&env->xen_timers_lock);
1095     return 0;
1096 }
1097 
1098 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
1099 {
1100     struct vcpu_set_singleshot_timer sst = { 0 };
1101 
1102     /*
1103      * The struct is a uint64_t followed by a uint32_t. On 32-bit that
1104      * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
1105      * that get used are identical, and there's four bytes of padding
1106      * unused at the end. For true Xen compatibility we should attempt
1107      * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
1108      * if we can't get the padding too. But that's daft. Just copy what
1109      * we need.
1110      */
1111     qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
1112     qemu_build_assert(sizeof(sst) >= 12);
1113 
1114     if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
1115         return -EFAULT;
1116     }
1117 
1118     return do_set_singleshot_timer(cs, sst.timeout_abs_ns,
1119                                    !!(sst.flags & VCPU_SSHOTTMR_future),
1120                                    false);
1121 }
1122 
1123 static int vcpuop_stop_singleshot_timer(CPUState *cs)
1124 {
1125     CPUX86State *env = &X86_CPU(cs)->env;
1126 
1127     qemu_mutex_lock(&env->xen_timers_lock);
1128 
1129     timer_del(env->xen_singleshot_timer);
1130     env->xen_singleshot_timer_ns = 0;
1131 
1132     qemu_mutex_unlock(&env->xen_timers_lock);
1133     return 0;
1134 }
1135 
1136 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1137                                        uint64_t timeout)
1138 {
1139     int err;
1140 
1141     if (unlikely(timeout == 0)) {
1142         err = vcpuop_stop_singleshot_timer(CPU(cpu));
1143     } else {
1144         err = do_set_singleshot_timer(CPU(cpu), timeout, false, true);
1145     }
1146     exit->u.hcall.result = err;
1147     return true;
1148 }
1149 
1150 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1151                                   int cmd, int vcpu_id, uint64_t arg)
1152 {
1153     CPUState *cs = CPU(cpu);
1154     CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
1155     int err;
1156 
1157     if (!dest) {
1158         err = -ENOENT;
1159         goto out;
1160     }
1161 
1162     switch (cmd) {
1163     case VCPUOP_register_runstate_memory_area:
1164         err = vcpuop_register_runstate_info(cs, dest, arg);
1165         break;
1166     case VCPUOP_register_vcpu_time_memory_area:
1167         err = vcpuop_register_vcpu_time_info(cs, dest, arg);
1168         break;
1169     case VCPUOP_register_vcpu_info:
1170         err = vcpuop_register_vcpu_info(cs, dest, arg);
1171         break;
1172     case VCPUOP_set_singleshot_timer: {
1173         if (cs->cpu_index == vcpu_id) {
1174             err = vcpuop_set_singleshot_timer(dest, arg);
1175         } else {
1176             err = -EINVAL;
1177         }
1178         break;
1179     }
1180     case VCPUOP_stop_singleshot_timer:
1181         if (cs->cpu_index == vcpu_id) {
1182             err = vcpuop_stop_singleshot_timer(dest);
1183         } else {
1184             err = -EINVAL;
1185         }
1186         break;
1187     case VCPUOP_set_periodic_timer: {
1188         err = vcpuop_set_periodic_timer(cs, dest, arg);
1189         break;
1190     }
1191     case VCPUOP_stop_periodic_timer:
1192         err = vcpuop_stop_periodic_timer(dest);
1193         break;
1194 
1195     default:
1196         return false;
1197     }
1198 
1199  out:
1200     exit->u.hcall.result = err;
1201     return true;
1202 }
1203 
1204 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1205                                     int cmd, uint64_t arg)
1206 {
1207     CPUState *cs = CPU(cpu);
1208     int err = -ENOSYS;
1209 
1210     switch (cmd) {
1211     case EVTCHNOP_init_control:
1212     case EVTCHNOP_expand_array:
1213     case EVTCHNOP_set_priority:
1214         /* We do not support FIFO channels at this point */
1215         err = -ENOSYS;
1216         break;
1217 
1218     case EVTCHNOP_status: {
1219         struct evtchn_status status;
1220 
1221         qemu_build_assert(sizeof(status) == 24);
1222         if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
1223             err = -EFAULT;
1224             break;
1225         }
1226 
1227         err = xen_evtchn_status_op(&status);
1228         if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
1229             err = -EFAULT;
1230         }
1231         break;
1232     }
1233     case EVTCHNOP_close: {
1234         struct evtchn_close close;
1235 
1236         qemu_build_assert(sizeof(close) == 4);
1237         if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
1238             err = -EFAULT;
1239             break;
1240         }
1241 
1242         err = xen_evtchn_close_op(&close);
1243         break;
1244     }
1245     case EVTCHNOP_unmask: {
1246         struct evtchn_unmask unmask;
1247 
1248         qemu_build_assert(sizeof(unmask) == 4);
1249         if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
1250             err = -EFAULT;
1251             break;
1252         }
1253 
1254         err = xen_evtchn_unmask_op(&unmask);
1255         break;
1256     }
1257     case EVTCHNOP_bind_virq: {
1258         struct evtchn_bind_virq virq;
1259 
1260         qemu_build_assert(sizeof(virq) == 12);
1261         if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
1262             err = -EFAULT;
1263             break;
1264         }
1265 
1266         err = xen_evtchn_bind_virq_op(&virq);
1267         if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
1268             err = -EFAULT;
1269         }
1270         break;
1271     }
1272     case EVTCHNOP_bind_pirq: {
1273         struct evtchn_bind_pirq pirq;
1274 
1275         qemu_build_assert(sizeof(pirq) == 12);
1276         if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
1277             err = -EFAULT;
1278             break;
1279         }
1280 
1281         err = xen_evtchn_bind_pirq_op(&pirq);
1282         if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
1283             err = -EFAULT;
1284         }
1285         break;
1286     }
1287     case EVTCHNOP_bind_ipi: {
1288         struct evtchn_bind_ipi ipi;
1289 
1290         qemu_build_assert(sizeof(ipi) == 8);
1291         if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
1292             err = -EFAULT;
1293             break;
1294         }
1295 
1296         err = xen_evtchn_bind_ipi_op(&ipi);
1297         if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
1298             err = -EFAULT;
1299         }
1300         break;
1301     }
1302     case EVTCHNOP_send: {
1303         struct evtchn_send send;
1304 
1305         qemu_build_assert(sizeof(send) == 4);
1306         if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
1307             err = -EFAULT;
1308             break;
1309         }
1310 
1311         err = xen_evtchn_send_op(&send);
1312         break;
1313     }
1314     case EVTCHNOP_alloc_unbound: {
1315         struct evtchn_alloc_unbound alloc;
1316 
1317         qemu_build_assert(sizeof(alloc) == 8);
1318         if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
1319             err = -EFAULT;
1320             break;
1321         }
1322 
1323         err = xen_evtchn_alloc_unbound_op(&alloc);
1324         if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
1325             err = -EFAULT;
1326         }
1327         break;
1328     }
1329     case EVTCHNOP_bind_interdomain: {
1330         struct evtchn_bind_interdomain interdomain;
1331 
1332         qemu_build_assert(sizeof(interdomain) == 12);
1333         if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1334             err = -EFAULT;
1335             break;
1336         }
1337 
1338         err = xen_evtchn_bind_interdomain_op(&interdomain);
1339         if (!err &&
1340             kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1341             err = -EFAULT;
1342         }
1343         break;
1344     }
1345     case EVTCHNOP_bind_vcpu: {
1346         struct evtchn_bind_vcpu vcpu;
1347 
1348         qemu_build_assert(sizeof(vcpu) == 8);
1349         if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
1350             err = -EFAULT;
1351             break;
1352         }
1353 
1354         err = xen_evtchn_bind_vcpu_op(&vcpu);
1355         break;
1356     }
1357     case EVTCHNOP_reset: {
1358         struct evtchn_reset reset;
1359 
1360         qemu_build_assert(sizeof(reset) == 2);
1361         if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
1362             err = -EFAULT;
1363             break;
1364         }
1365 
1366         err = xen_evtchn_reset_op(&reset);
1367         break;
1368     }
1369     default:
1370         return false;
1371     }
1372 
1373     exit->u.hcall.result = err;
1374     return true;
1375 }
1376 
1377 int kvm_xen_soft_reset(void)
1378 {
1379     CPUState *cpu;
1380     int err;
1381 
1382     assert(qemu_mutex_iothread_locked());
1383 
1384     trace_kvm_xen_soft_reset();
1385 
1386     err = xen_evtchn_soft_reset();
1387     if (err) {
1388         return err;
1389     }
1390 
1391     /*
1392      * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
1393      * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
1394      * deliver to the timer interrupt and treats that as 'disabled'.
1395      */
1396     err = xen_evtchn_set_callback_param(0);
1397     if (err) {
1398         return err;
1399     }
1400 
1401     CPU_FOREACH(cpu) {
1402         async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
1403     }
1404 
1405     err = xen_overlay_map_shinfo_page(INVALID_GFN);
1406     if (err) {
1407         return err;
1408     }
1409 
1410     err = xen_gnttab_reset();
1411     if (err) {
1412         return err;
1413     }
1414 
1415     err = xen_xenstore_reset();
1416     if (err) {
1417         return err;
1418     }
1419 
1420     return 0;
1421 }
1422 
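/*
 * SCHEDOP_shutdown: map the guest's shutdown reason onto the corresponding
 * QEMU action (guest panic, reset, shutdown or Xen soft reset).
 */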
1423 static int schedop_shutdown(CPUState *cs, uint64_t arg)
1424 {
1425     struct sched_shutdown shutdown;
1426     int ret = 0;
1427 
1428     /* No need for 32/64 compat handling */
1429     qemu_build_assert(sizeof(shutdown) == 4);
1430 
1431     if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
1432         return -EFAULT;
1433     }
1434 
1435     switch (shutdown.reason) {
1436     case SHUTDOWN_crash:
1437         cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
1438         qemu_system_guest_panicked(NULL);
1439         break;
1440 
1441     case SHUTDOWN_reboot:
1442         qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1443         break;
1444 
1445     case SHUTDOWN_poweroff:
1446         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
1447         break;
1448 
1449     case SHUTDOWN_soft_reset:
1450         qemu_mutex_lock_iothread();
1451         ret = kvm_xen_soft_reset();
1452         qemu_mutex_unlock_iothread();
1453         break;
1454 
1455     default:
1456         ret = -EINVAL;
1457         break;
1458     }
1459 
1460     return ret;
1461 }
1462 
1463 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1464                                    int cmd, uint64_t arg)
1465 {
1466     CPUState *cs = CPU(cpu);
1467     int err = -ENOSYS;
1468 
1469     switch (cmd) {
1470     case SCHEDOP_shutdown:
1471         err = schedop_shutdown(cs, arg);
1472         break;
1473 
1474     case SCHEDOP_poll:
1475         /*
1476          * Linux will panic if this doesn't work. Just yield; it's not
1477          * worth overthinking it because with event channel handling
1478          * in KVM, the kernel will intercept this and it will never
1479          * reach QEMU anyway. The semantics of the hypercall explicitly
1480          * permit spurious wakeups.
1481          */
1482     case SCHEDOP_yield:
1483         sched_yield();
1484         err = 0;
1485         break;
1486 
1487     default:
1488         return false;
1489     }
1490 
1491     exit->u.hcall.result = err;
1492     return true;
1493 }
1494 
1495 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1496                                     int cmd, uint64_t arg, int count)
1497 {
1498     CPUState *cs = CPU(cpu);
1499     int err;
1500 
1501     switch (cmd) {
1502     case GNTTABOP_set_version: {
1503         struct gnttab_set_version set;
1504 
1505         qemu_build_assert(sizeof(set) == 4);
1506         if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
1507             err = -EFAULT;
1508             break;
1509         }
1510 
1511         err = xen_gnttab_set_version_op(&set);
1512         if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
1513             err = -EFAULT;
1514         }
1515         break;
1516     }
1517     case GNTTABOP_get_version: {
1518         struct gnttab_get_version get;
1519 
1520         qemu_build_assert(sizeof(get) == 8);
1521         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1522             err = -EFAULT;
1523             break;
1524         }
1525 
1526         err = xen_gnttab_get_version_op(&get);
1527         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1528             err = -EFAULT;
1529         }
1530         break;
1531     }
1532     case GNTTABOP_query_size: {
1533         struct gnttab_query_size size;
1534 
1535         qemu_build_assert(sizeof(size) == 16);
1536         if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
1537             err = -EFAULT;
1538             break;
1539         }
1540 
1541         err = xen_gnttab_query_size_op(&size);
1542         if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
1543             err = -EFAULT;
1544         }
1545         break;
1546     }
1547     case GNTTABOP_setup_table:
1548     case GNTTABOP_copy:
1549     case GNTTABOP_map_grant_ref:
1550     case GNTTABOP_unmap_grant_ref:
1551     case GNTTABOP_swap_grant_ref:
1552         return false;
1553 
1554     default:
1555         /* Xen explicitly returns -ENOSYS to HVM guests for all others */
1556         err = -ENOSYS;
1557         break;
1558     }
1559 
1560     exit->u.hcall.result = err;
1561     return true;
1562 }
1563 
1564 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1565                                      int cmd, uint64_t arg)
1566 {
1567     CPUState *cs = CPU(cpu);
1568     int err;
1569 
1570     switch (cmd) {
1571     case PHYSDEVOP_map_pirq: {
1572         struct physdev_map_pirq map;
1573 
1574         if (hypercall_compat32(exit->u.hcall.longmode)) {
1575             struct compat_physdev_map_pirq *map32 = (void *)&map;
1576 
1577             if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
1578                 err = -EFAULT;
                     break;
1579             }
1580 
1581             /*
1582              * The only thing that's different is the alignment of the
1583              * uint64_t table_base at the end, which gets padding to make
1584              * it 64-bit aligned in the 64-bit version.
1585              */
1586             qemu_build_assert(sizeof(*map32) == 36);
1587             qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
1588                               offsetof(struct compat_physdev_map_pirq, entry_nr));
1589             memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
1590         } else {
1591             if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
1592                 err = -EFAULT;
1593                 break;
1594             }
1595         }
1596         err = xen_physdev_map_pirq(&map);
1597         /*
1598          * Since table_base is an IN parameter and won't be changed, just
1599          * copy the size of the compat structure back to the guest.
1600          */
1601         if (!err && kvm_copy_to_gva(cs, arg, &map,
1602                                     sizeof(struct compat_physdev_map_pirq))) {
1603             err = -EFAULT;
1604         }
1605         break;
1606     }
1607     case PHYSDEVOP_unmap_pirq: {
1608         struct physdev_unmap_pirq unmap;
1609 
1610         qemu_build_assert(sizeof(unmap) == 8);
1611         if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
1612             err = -EFAULT;
1613             break;
1614         }
1615 
1616         err = xen_physdev_unmap_pirq(&unmap);
1617         if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
1618             err = -EFAULT;
1619         }
1620         break;
1621     }
1622     case PHYSDEVOP_eoi: {
1623         struct physdev_eoi eoi;
1624 
1625         qemu_build_assert(sizeof(eoi) == 4);
1626         if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
1627             err = -EFAULT;
1628             break;
1629         }
1630 
1631         err = xen_physdev_eoi_pirq(&eoi);
1632         if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
1633             err = -EFAULT;
1634         }
1635         break;
1636     }
1637     case PHYSDEVOP_irq_status_query: {
1638         struct physdev_irq_status_query query;
1639 
1640         qemu_build_assert(sizeof(query) == 8);
1641         if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
1642             err = -EFAULT;
1643             break;
1644         }
1645 
1646         err = xen_physdev_query_pirq(&query);
1647         if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
1648             err = -EFAULT;
1649         }
1650         break;
1651     }
1652     case PHYSDEVOP_get_free_pirq: {
1653         struct physdev_get_free_pirq get;
1654 
1655         qemu_build_assert(sizeof(get) == 8);
1656         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1657             err = -EFAULT;
1658             break;
1659         }
1660 
1661         err = xen_physdev_get_free_pirq(&get);
1662         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1663             err = -EFAULT;
1664         }
1665         break;
1666     }
1667     case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
1668         err = -ENOSYS;
1669         break;
1670 
1671     default:
1672         return false;
1673     }
1674 
1675     exit->u.hcall.result = err;
1676     return true;
1677 }
1678 
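/*
 * Dispatch a hypercall to its handler. A 'true' return means the hypercall
 * was recognised and exit->u.hcall.result is valid; 'false' falls through to
 * the generic -ENOSYS path in kvm_xen_handle_exit().
 */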
1679 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1680 {
1681     uint16_t code = exit->u.hcall.input;
1682 
1683     if (exit->u.hcall.cpl > 0) {
1684         exit->u.hcall.result = -EPERM;
1685         return true;
1686     }
1687 
1688     switch (code) {
1689     case __HYPERVISOR_set_timer_op:
1690         if (exit->u.hcall.longmode) {
1691             return kvm_xen_hcall_set_timer_op(exit, cpu,
1692                                               exit->u.hcall.params[0]);
1693         } else {
1694             /* In 32-bit mode, the 64-bit timer value is in two args. */
1695             uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
1696                 (uint32_t)exit->u.hcall.params[0];
1697             return kvm_xen_hcall_set_timer_op(exit, cpu, val);
1698         }
1699     case __HYPERVISOR_grant_table_op:
1700         return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
1701                                        exit->u.hcall.params[1],
1702                                        exit->u.hcall.params[2]);
1703     case __HYPERVISOR_sched_op:
1704         return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
1705                                       exit->u.hcall.params[1]);
1706     case __HYPERVISOR_event_channel_op:
1707         return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
1708                                        exit->u.hcall.params[1]);
1709     case __HYPERVISOR_vcpu_op:
1710         return kvm_xen_hcall_vcpu_op(exit, cpu,
1711                                      exit->u.hcall.params[0],
1712                                      exit->u.hcall.params[1],
1713                                      exit->u.hcall.params[2]);
1714     case __HYPERVISOR_hvm_op:
1715         return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
1716                                     exit->u.hcall.params[1]);
1717     case __HYPERVISOR_memory_op:
1718         return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
1719                                        exit->u.hcall.params[1]);
1720     case __HYPERVISOR_physdev_op:
1721         return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
1722                                         exit->u.hcall.params[1]);
1723     case __HYPERVISOR_xen_version:
1724         return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
1725                                          exit->u.hcall.params[1]);
1726     default:
1727         return false;
1728     }
1729 }
1730 
1731 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1732 {
1733     if (exit->type != KVM_EXIT_XEN_HCALL) {
1734         return -1;
1735     }
1736 
1737     /*
1738      * The kernel latches the guest 32/64 mode when the MSR is used to fill
1739      * the hypercall page. So if we see a hypercall in a mode that doesn't
1740      * match our own idea of the guest mode, fetch the kernel's idea of the
1741      * "long mode" to remain in sync.
1742      */
1743     if (exit->u.hcall.longmode != xen_is_long_mode()) {
1744         xen_sync_long_mode();
1745     }
1746 
1747     if (!do_kvm_xen_handle_exit(cpu, exit)) {
1748         /*
1749          * Some hypercalls will be deliberately "implemented" by returning
1750          * -ENOSYS. This case is for hypercalls which are unexpected.
1751          */
1752         exit->u.hcall.result = -ENOSYS;
1753         qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
1754                       PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
1755                       (uint64_t)exit->u.hcall.input,
1756                       (uint64_t)exit->u.hcall.params[0],
1757                       (uint64_t)exit->u.hcall.params[1],
1758                       (uint64_t)exit->u.hcall.params[2]);
1759     }
1760 
1761     trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
1762                             exit->u.hcall.input, exit->u.hcall.params[0],
1763                             exit->u.hcall.params[1], exit->u.hcall.params[2],
1764                             exit->u.hcall.result);
1765     return 0;
1766 }
1767 
1768 uint16_t kvm_xen_get_gnttab_max_frames(void)
1769 {
1770     KVMState *s = KVM_STATE(current_accel());
1771     return s->xen_gnttab_max_frames;
1772 }
1773 
1774 uint16_t kvm_xen_get_evtchn_max_pirq(void)
1775 {
1776     KVMState *s = KVM_STATE(current_accel());
1777     return s->xen_evtchn_max_pirq;
1778 }
1779 
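/*
 * Write the per-vCPU Xen state (vcpu_info, time info, runstate area,
 * callback vector and timers) back into KVM, e.g. after vmstate load.
 */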
1780 int kvm_put_xen_state(CPUState *cs)
1781 {
1782     X86CPU *cpu = X86_CPU(cs);
1783     CPUX86State *env = &cpu->env;
1784     uint64_t gpa;
1785     int ret;
1786 
1787     gpa = env->xen_vcpu_info_gpa;
1788     if (gpa == INVALID_GPA) {
1789         gpa = env->xen_vcpu_info_default_gpa;
1790     }
1791 
1792     if (gpa != INVALID_GPA) {
1793         ret = set_vcpu_info(cs, gpa);
1794         if (ret < 0) {
1795             return ret;
1796         }
1797     }
1798 
1799     gpa = env->xen_vcpu_time_info_gpa;
1800     if (gpa != INVALID_GPA) {
1801         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
1802                                     gpa);
1803         if (ret < 0) {
1804             return ret;
1805         }
1806     }
1807 
1808     gpa = env->xen_vcpu_runstate_gpa;
1809     if (gpa != INVALID_GPA) {
1810         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1811                                     gpa);
1812         if (ret < 0) {
1813             return ret;
1814         }
1815     }
1816 
1817     if (env->xen_periodic_timer_period) {
1818         ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
1819         if (ret < 0) {
1820             return ret;
1821         }
1822     }
1823 
1824     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1825         /*
1826          * If the kernel has EVTCHN_SEND support then it handles timers too,
1827          * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
1828          */
1829         if (env->xen_singleshot_timer_ns) {
1830             ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
1831                                     false, false);
1832             if (ret < 0) {
1833                 return ret;
1834             }
1835         }
1836         return 0;
1837     }
1838 
1839     if (env->xen_vcpu_callback_vector) {
1840         ret = kvm_xen_set_vcpu_callback_vector(cs);
1841         if (ret < 0) {
1842             return ret;
1843         }
1844     }
1845 
1846     if (env->xen_virq[VIRQ_TIMER]) {
1847         ret = kvm_xen_set_vcpu_timer(cs);
1848         if (ret < 0) {
1849             return ret;
1850         }
1851     }
1852     return 0;
1853 }
1854 
1855 int kvm_get_xen_state(CPUState *cs)
1856 {
1857     X86CPU *cpu = X86_CPU(cs);
1858     CPUX86State *env = &cpu->env;
1859     uint64_t gpa;
1860     int ret;
1861 
1862     /*
1863      * The kernel does not mark vcpu_info as dirty when it delivers interrupts
1864      * to it. It's up to userspace to *assume* that any page shared thus is
1865      * always considered dirty. The shared_info page is different since it's
1866      * an overlay and migrated separately anyway.
1867      */
1868     gpa = env->xen_vcpu_info_gpa;
1869     if (gpa == INVALID_GPA) {
1870         gpa = env->xen_vcpu_info_default_gpa;
1871     }
1872     if (gpa != INVALID_GPA) {
1873         MemoryRegionSection mrs = memory_region_find(get_system_memory(),
1874                                                      gpa,
1875                                                      sizeof(struct vcpu_info));
1876         if (mrs.mr &&
1877             !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
1878             memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
1879                                     sizeof(struct vcpu_info));
1880         }
1881     }
1882 
1883     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1884         return 0;
1885     }
1886 
1887     /*
1888      * If the kernel is accelerating timers, read out the current value of the
1889      * singleshot timer deadline.
1890      */
1891     if (env->xen_virq[VIRQ_TIMER]) {
1892         struct kvm_xen_vcpu_attr va = {
1893             .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
1894         };
1895         ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
1896         if (ret < 0) {
1897             return ret;
1898         }
1899         env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
1900     }
1901 
1902     return 0;
1903 }
1904