xref: /openbmc/qemu/target/i386/kvm/xen-emu.c (revision e8d1e0cd)
1 /*
2  * Xen HVM emulation support in KVM
3  *
4  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
5  * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/log.h"
14 #include "qemu/main-loop.h"
15 #include "hw/xen/xen.h"
16 #include "sysemu/kvm_int.h"
17 #include "sysemu/kvm_xen.h"
18 #include "kvm/kvm_i386.h"
19 #include "exec/address-spaces.h"
20 #include "xen-emu.h"
21 #include "trace.h"
22 #include "sysemu/runstate.h"
23 
24 #include "hw/pci/msi.h"
25 #include "hw/i386/apic-msidef.h"
26 #include "hw/i386/e820_memory_layout.h"
27 #include "hw/i386/kvm/xen_overlay.h"
28 #include "hw/i386/kvm/xen_evtchn.h"
29 #include "hw/i386/kvm/xen_gnttab.h"
30 #include "hw/i386/kvm/xen_xenstore.h"
31 
32 #include "hw/xen/interface/version.h"
33 #include "hw/xen/interface/sched.h"
34 #include "hw/xen/interface/memory.h"
35 #include "hw/xen/interface/hvm/hvm_op.h"
36 #include "hw/xen/interface/hvm/params.h"
37 #include "hw/xen/interface/vcpu.h"
38 #include "hw/xen/interface/event_channel.h"
39 #include "hw/xen/interface/grant_table.h"
40 
41 #include "xen-compat.h"
42 
43 static void xen_vcpu_singleshot_timer_event(void *opaque);
44 static void xen_vcpu_periodic_timer_event(void *opaque);
45 
/*
 * Evaluates to true when a hypercall's arguments use the 32-bit compat
 * layout. On a 64-bit target that is whenever the 'longmode' flag passed
 * in is clear; on other targets the macro is constant false.
 */
#if defined(TARGET_X86_64)
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif
51 
52 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
53                            size_t *len, bool is_write)
54 {
55         struct kvm_translation tr = {
56             .linear_address = gva,
57         };
58 
59         if (len) {
60             *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
61         }
62 
63         if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
64             (is_write && !tr.writeable)) {
65             return false;
66         }
67         *gpa = tr.physical_address;
68         return true;
69 }
70 
71 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
72                       bool is_write)
73 {
74     uint8_t *buf = (uint8_t *)_buf;
75     uint64_t gpa;
76     size_t len;
77 
78     while (sz) {
79         if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
80             return -EFAULT;
81         }
82         if (len > sz) {
83             len = sz;
84         }
85 
86         cpu_physical_memory_rw(gpa, buf, len, is_write);
87 
88         buf += len;
89         sz -= len;
90         gva += len;
91     }
92 
93     return 0;
94 }
95 
96 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
97                                     size_t sz)
98 {
99     return kvm_gva_rw(cs, gva, buf, sz, false);
100 }
101 
102 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
103                                   size_t sz)
104 {
105     return kvm_gva_rw(cs, gva, buf, sz, true);
106 }
107 
108 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
109 {
110     const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
111         KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
112     struct kvm_xen_hvm_config cfg = {
113         .msr = hypercall_msr,
114         .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
115     };
116     int xen_caps, ret;
117 
118     xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
119     if (required_caps & ~xen_caps) {
120         error_report("kvm: Xen HVM guest support not present or insufficient");
121         return -ENOSYS;
122     }
123 
124     if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
125         struct kvm_xen_hvm_attr ha = {
126             .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
127             .u.xen_version = s->xen_version,
128         };
129         (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);
130 
131         cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
132     }
133 
134     ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
135     if (ret < 0) {
136         error_report("kvm: Failed to enable Xen HVM support: %s",
137                      strerror(-ret));
138         return ret;
139     }
140 
141     /* If called a second time, don't repeat the rest of the setup. */
142     if (s->xen_caps) {
143         return 0;
144     }
145 
146     /*
147      * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
148      * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
149      *
150      * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
151      * such things to be polled at precisely the right time. We *could* do
152      * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
153      * the moment the IRQ is acked, and see if it should be reasserted.
154      *
155      * But the in-kernel irqchip is deprecated, so we're unlikely to add
156      * that support in the kernel. Insist on using the split irqchip mode
157      * instead.
158      *
159      * This leaves us polling for the level going low in QEMU, which lacks
160      * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
161      * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
162      * the device (for which it has to unmap the device and trap access, for
163      * some period after an IRQ!!). In the Xen case, we do it on exit from
164      * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
165      * Which is kind of icky, but less so than the VFIO one. I may fix them
166      * both later...
167      */
168     if (!kvm_kernel_irqchip_split()) {
169         error_report("kvm: Xen support requires kernel-irqchip=split");
170         return -EINVAL;
171     }
172 
173     s->xen_caps = xen_caps;
174 
175     /* Tell fw_cfg to notify the BIOS to reserve the range. */
176     ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
177                          E820_RESERVED);
178     if (ret < 0) {
179         fprintf(stderr, "e820_add_entry() table is full\n");
180         return ret;
181     }
182 
183     /* The page couldn't be overlaid until KVM was initialized */
184     xen_xenstore_reset();
185 
186     return 0;
187 }
188 
189 int kvm_xen_init_vcpu(CPUState *cs)
190 {
191     X86CPU *cpu = X86_CPU(cs);
192     CPUX86State *env = &cpu->env;
193     int err;
194 
195     /*
196      * The kernel needs to know the Xen/ACPI vCPU ID because that's
197      * what the guest uses in hypercalls such as timers. It doesn't
198      * match the APIC ID which is generally used for talking to the
199      * kernel about vCPUs. And if vCPU threads race with creating
200      * their KVM vCPUs out of order, it doesn't necessarily match
201      * with the kernel's internal vCPU indices either.
202      */
203     if (kvm_xen_has_cap(EVTCHN_SEND)) {
204         struct kvm_xen_vcpu_attr va = {
205             .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
206             .u.vcpu_id = cs->cpu_index,
207         };
208         err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
209         if (err) {
210             error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
211                          strerror(-err));
212             return err;
213         }
214     }
215 
216     env->xen_vcpu_info_gpa = INVALID_GPA;
217     env->xen_vcpu_info_default_gpa = INVALID_GPA;
218     env->xen_vcpu_time_info_gpa = INVALID_GPA;
219     env->xen_vcpu_runstate_gpa = INVALID_GPA;
220 
221     qemu_mutex_init(&env->xen_timers_lock);
222     env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
223                                              xen_vcpu_singleshot_timer_event,
224                                              cpu);
225     if (!env->xen_singleshot_timer) {
226         return -ENOMEM;
227     }
228     env->xen_singleshot_timer->opaque = cs;
229 
230     env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
231                                            xen_vcpu_periodic_timer_event,
232                                            cpu);
233     if (!env->xen_periodic_timer) {
234         return -ENOMEM;
235     }
236     env->xen_periodic_timer->opaque = cs;
237 
238     return 0;
239 }
240 
241 uint32_t kvm_xen_get_caps(void)
242 {
243     return kvm_state->xen_caps;
244 }
245 
246 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
247                                      int cmd, uint64_t arg)
248 {
249     int err = 0;
250 
251     switch (cmd) {
252     case XENVER_get_features: {
253         struct xen_feature_info fi;
254 
255         /* No need for 32/64 compat handling */
256         qemu_build_assert(sizeof(fi) == 8);
257 
258         err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
259         if (err) {
260             break;
261         }
262 
263         fi.submap = 0;
264         if (fi.submap_idx == 0) {
265             fi.submap |= 1 << XENFEAT_writable_page_tables |
266                          1 << XENFEAT_writable_descriptor_tables |
267                          1 << XENFEAT_auto_translated_physmap |
268                          1 << XENFEAT_supervisor_mode_kernel |
269                          1 << XENFEAT_hvm_callback_vector |
270                          1 << XENFEAT_hvm_safe_pvclock |
271                          1 << XENFEAT_hvm_pirqs;
272         }
273 
274         err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
275         break;
276     }
277 
278     default:
279         return false;
280     }
281 
282     exit->u.hcall.result = err;
283     return true;
284 }
285 
286 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
287 {
288     struct kvm_xen_vcpu_attr xhsi;
289 
290     xhsi.type = type;
291     xhsi.u.gpa = gpa;
292 
293     trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
294 
295     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
296 }
297 
298 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
299 {
300     uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
301     struct kvm_xen_vcpu_attr xva;
302 
303     xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
304     xva.u.vector = vector;
305 
306     trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
307 
308     return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
309 }
310 
311 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
312 {
313     X86CPU *cpu = X86_CPU(cs);
314     CPUX86State *env = &cpu->env;
315 
316     env->xen_vcpu_callback_vector = data.host_int;
317 
318     if (kvm_xen_has_cap(EVTCHN_SEND)) {
319         kvm_xen_set_vcpu_callback_vector(cs);
320     }
321 }
322 
323 static int set_vcpu_info(CPUState *cs, uint64_t gpa)
324 {
325     X86CPU *cpu = X86_CPU(cs);
326     CPUX86State *env = &cpu->env;
327     MemoryRegionSection mrs = { .mr = NULL };
328     void *vcpu_info_hva = NULL;
329     int ret;
330 
331     ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
332     if (ret || gpa == INVALID_GPA) {
333         goto out;
334     }
335 
336     mrs = memory_region_find(get_system_memory(), gpa,
337                              sizeof(struct vcpu_info));
338     if (mrs.mr && mrs.mr->ram_block &&
339         !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
340         vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
341                                          mrs.offset_within_region);
342     }
343     if (!vcpu_info_hva) {
344         if (mrs.mr) {
345             memory_region_unref(mrs.mr);
346             mrs.mr = NULL;
347         }
348         ret = -EINVAL;
349     }
350 
351  out:
352     if (env->xen_vcpu_info_mr) {
353         memory_region_unref(env->xen_vcpu_info_mr);
354     }
355     env->xen_vcpu_info_hva = vcpu_info_hva;
356     env->xen_vcpu_info_mr = mrs.mr;
357     return ret;
358 }
359 
360 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
361 {
362     X86CPU *cpu = X86_CPU(cs);
363     CPUX86State *env = &cpu->env;
364 
365     env->xen_vcpu_info_default_gpa = data.host_ulong;
366 
367     /* Changing the default does nothing if a vcpu_info was explicitly set. */
368     if (env->xen_vcpu_info_gpa == INVALID_GPA) {
369         set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
370     }
371 }
372 
373 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
374 {
375     X86CPU *cpu = X86_CPU(cs);
376     CPUX86State *env = &cpu->env;
377 
378     env->xen_vcpu_info_gpa = data.host_ulong;
379 
380     set_vcpu_info(cs, env->xen_vcpu_info_gpa);
381 }
382 
383 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
384 {
385     CPUState *cs = qemu_get_cpu(vcpu_id);
386     if (!cs) {
387         return NULL;
388     }
389 
390     return X86_CPU(cs)->env.xen_vcpu_info_hva;
391 }
392 
393 void kvm_xen_maybe_deassert_callback(CPUState *cs)
394 {
395     CPUX86State *env = &X86_CPU(cs)->env;
396     struct vcpu_info *vi = env->xen_vcpu_info_hva;
397     if (!vi) {
398         return;
399     }
400 
401     /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
402     if (!vi->evtchn_upcall_pending) {
403         qemu_mutex_lock_iothread();
404         /*
405          * Check again now we have the lock, because it may have been
406          * asserted in the interim. And we don't want to take the lock
407          * every time because this is a fast path.
408          */
409         if (!vi->evtchn_upcall_pending) {
410             X86_CPU(cs)->env.xen_callback_asserted = false;
411             xen_evtchn_set_callback_level(0);
412         }
413         qemu_mutex_unlock_iothread();
414     }
415 }
416 
417 void kvm_xen_set_callback_asserted(void)
418 {
419     CPUState *cs = qemu_get_cpu(0);
420 
421     if (cs) {
422         X86_CPU(cs)->env.xen_callback_asserted = true;
423     }
424 }
425 
426 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
427 {
428     CPUState *cs = qemu_get_cpu(vcpu_id);
429     uint8_t vector;
430 
431     if (!cs) {
432         return;
433     }
434 
435     vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
436     if (vector) {
437         /*
438          * The per-vCPU callback vector injected via lapic. Just
439          * deliver it as an MSI.
440          */
441         MSIMessage msg = {
442             .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
443             .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
444         };
445         kvm_irqchip_send_msi(kvm_state, msg);
446         return;
447     }
448 
449     switch (type) {
450     case HVM_PARAM_CALLBACK_TYPE_VECTOR:
451         /*
452          * If the evtchn_upcall_pending field in the vcpu_info is set, then
453          * KVM will automatically deliver the vector on entering the vCPU
454          * so all we have to do is kick it out.
455          */
456         qemu_cpu_kick(cs);
457         break;
458 
459     case HVM_PARAM_CALLBACK_TYPE_GSI:
460     case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
461         if (vcpu_id == 0) {
462             xen_evtchn_set_callback_level(1);
463         }
464         break;
465     }
466 }
467 
468 static int kvm_xen_set_vcpu_timer(CPUState *cs)
469 {
470     X86CPU *cpu = X86_CPU(cs);
471     CPUX86State *env = &cpu->env;
472 
473     struct kvm_xen_vcpu_attr va = {
474         .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
475         .u.timer.port = env->xen_virq[VIRQ_TIMER],
476         .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
477         .u.timer.expires_ns = env->xen_singleshot_timer_ns,
478     };
479 
480     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
481 }
482 
483 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
484 {
485     kvm_xen_set_vcpu_timer(cs);
486 }
487 
488 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
489 {
490     CPUState *cs = qemu_get_cpu(vcpu_id);
491 
492     if (!cs) {
493         return -ENOENT;
494     }
495 
496     /* cpu.h doesn't include the actual Xen header. */
497     qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);
498 
499     if (virq >= NR_VIRQS) {
500         return -EINVAL;
501     }
502 
503     if (port && X86_CPU(cs)->env.xen_virq[virq]) {
504         return -EEXIST;
505     }
506 
507     X86_CPU(cs)->env.xen_virq[virq] = port;
508     if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
509         async_run_on_cpu(cs, do_set_vcpu_timer_virq,
510                          RUN_ON_CPU_HOST_INT(port));
511     }
512     return 0;
513 }
514 
515 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
516 {
517     X86CPU *cpu = X86_CPU(cs);
518     CPUX86State *env = &cpu->env;
519 
520     env->xen_vcpu_time_info_gpa = data.host_ulong;
521 
522     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
523                           env->xen_vcpu_time_info_gpa);
524 }
525 
526 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
527 {
528     X86CPU *cpu = X86_CPU(cs);
529     CPUX86State *env = &cpu->env;
530 
531     env->xen_vcpu_runstate_gpa = data.host_ulong;
532 
533     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
534                           env->xen_vcpu_runstate_gpa);
535 }
536 
537 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
538 {
539     X86CPU *cpu = X86_CPU(cs);
540     CPUX86State *env = &cpu->env;
541 
542     env->xen_vcpu_info_gpa = INVALID_GPA;
543     env->xen_vcpu_info_default_gpa = INVALID_GPA;
544     env->xen_vcpu_time_info_gpa = INVALID_GPA;
545     env->xen_vcpu_runstate_gpa = INVALID_GPA;
546     env->xen_vcpu_callback_vector = 0;
547     env->xen_singleshot_timer_ns = 0;
548     memset(env->xen_virq, 0, sizeof(env->xen_virq));
549 
550     set_vcpu_info(cs, INVALID_GPA);
551     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
552                           INVALID_GPA);
553     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
554                           INVALID_GPA);
555     if (kvm_xen_has_cap(EVTCHN_SEND)) {
556         kvm_xen_set_vcpu_callback_vector(cs);
557         kvm_xen_set_vcpu_timer(cs);
558     }
559 
560 }
561 
562 static int xen_set_shared_info(uint64_t gfn)
563 {
564     uint64_t gpa = gfn << TARGET_PAGE_BITS;
565     int i, err;
566 
567     QEMU_IOTHREAD_LOCK_GUARD();
568 
569     /*
570      * The xen_overlay device tells KVM about it too, since it had to
571      * do that on migration load anyway (unless we're going to jump
572      * through lots of hoops to maintain the fiction that this isn't
573      * KVM-specific.
574      */
575     err = xen_overlay_map_shinfo_page(gpa);
576     if (err) {
577             return err;
578     }
579 
580     trace_kvm_xen_set_shared_info(gfn);
581 
582     for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
583         CPUState *cpu = qemu_get_cpu(i);
584         if (cpu) {
585             async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
586                              RUN_ON_CPU_HOST_ULONG(gpa));
587         }
588         gpa += sizeof(vcpu_info_t);
589     }
590 
591     return err;
592 }
593 
594 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
595 {
596     switch (space) {
597     case XENMAPSPACE_shared_info:
598         if (idx > 0) {
599             return -EINVAL;
600         }
601         return xen_set_shared_info(gfn);
602 
603     case XENMAPSPACE_grant_table:
604         return xen_gnttab_map_page(idx, gfn);
605 
606     case XENMAPSPACE_gmfn:
607     case XENMAPSPACE_gmfn_range:
608         return -ENOTSUP;
609 
610     case XENMAPSPACE_gmfn_foreign:
611     case XENMAPSPACE_dev_mmio:
612         return -EPERM;
613 
614     default:
615         return -EINVAL;
616     }
617 }
618 
619 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
620                              uint64_t arg)
621 {
622     struct xen_add_to_physmap xatp;
623     CPUState *cs = CPU(cpu);
624 
625     if (hypercall_compat32(exit->u.hcall.longmode)) {
626         struct compat_xen_add_to_physmap xatp32;
627 
628         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
629         if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
630             return -EFAULT;
631         }
632         xatp.domid = xatp32.domid;
633         xatp.size = xatp32.size;
634         xatp.space = xatp32.space;
635         xatp.idx = xatp32.idx;
636         xatp.gpfn = xatp32.gpfn;
637     } else {
638         if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
639             return -EFAULT;
640         }
641     }
642 
643     if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
644         return -ESRCH;
645     }
646 
647     return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
648 }
649 
650 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
651                                    uint64_t arg)
652 {
653     struct xen_add_to_physmap_batch xatpb;
654     unsigned long idxs_gva, gpfns_gva, errs_gva;
655     CPUState *cs = CPU(cpu);
656     size_t op_sz;
657 
658     if (hypercall_compat32(exit->u.hcall.longmode)) {
659         struct compat_xen_add_to_physmap_batch xatpb32;
660 
661         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
662         if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
663             return -EFAULT;
664         }
665         xatpb.domid = xatpb32.domid;
666         xatpb.space = xatpb32.space;
667         xatpb.size = xatpb32.size;
668 
669         idxs_gva = xatpb32.idxs.c;
670         gpfns_gva = xatpb32.gpfns.c;
671         errs_gva = xatpb32.errs.c;
672         op_sz = sizeof(uint32_t);
673     } else {
674         if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
675             return -EFAULT;
676         }
677         op_sz = sizeof(unsigned long);
678         idxs_gva = (unsigned long)xatpb.idxs.p;
679         gpfns_gva = (unsigned long)xatpb.gpfns.p;
680         errs_gva = (unsigned long)xatpb.errs.p;
681     }
682 
683     if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
684         return -ESRCH;
685     }
686 
687     /* Explicitly invalid for the batch op. Not that we implement it anyway. */
688     if (xatpb.space == XENMAPSPACE_gmfn_range) {
689         return -EINVAL;
690     }
691 
692     while (xatpb.size--) {
693         unsigned long idx = 0;
694         unsigned long gpfn = 0;
695         int err;
696 
697         /* For 32-bit compat this only copies the low 32 bits of each */
698         if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
699             kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
700             return -EFAULT;
701         }
702         idxs_gva += op_sz;
703         gpfns_gva += op_sz;
704 
705         err = add_to_physmap_one(xatpb.space, idx, gpfn);
706 
707         if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
708             return -EFAULT;
709         }
710         errs_gva += sizeof(err);
711     }
712     return 0;
713 }
714 
715 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
716                                    int cmd, uint64_t arg)
717 {
718     int err;
719 
720     switch (cmd) {
721     case XENMEM_add_to_physmap:
722         err = do_add_to_physmap(exit, cpu, arg);
723         break;
724 
725     case XENMEM_add_to_physmap_batch:
726         err = do_add_to_physmap_batch(exit, cpu, arg);
727         break;
728 
729     default:
730         return false;
731     }
732 
733     exit->u.hcall.result = err;
734     return true;
735 }
736 
737 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
738                              uint64_t arg)
739 {
740     CPUState *cs = CPU(cpu);
741     struct xen_hvm_param hp;
742     int err = 0;
743 
744     /* No need for 32/64 compat handling */
745     qemu_build_assert(sizeof(hp) == 16);
746 
747     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
748         err = -EFAULT;
749         goto out;
750     }
751 
752     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
753         err = -ESRCH;
754         goto out;
755     }
756 
757     switch (hp.index) {
758     case HVM_PARAM_CALLBACK_IRQ:
759         qemu_mutex_lock_iothread();
760         err = xen_evtchn_set_callback_param(hp.value);
761         qemu_mutex_unlock_iothread();
762         xen_set_long_mode(exit->u.hcall.longmode);
763         break;
764     default:
765         return false;
766     }
767 
768 out:
769     exit->u.hcall.result = err;
770     return true;
771 }
772 
773 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
774                              uint64_t arg)
775 {
776     CPUState *cs = CPU(cpu);
777     struct xen_hvm_param hp;
778     int err = 0;
779 
780     /* No need for 32/64 compat handling */
781     qemu_build_assert(sizeof(hp) == 16);
782 
783     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
784         err = -EFAULT;
785         goto out;
786     }
787 
788     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
789         err = -ESRCH;
790         goto out;
791     }
792 
793     switch (hp.index) {
794     case HVM_PARAM_STORE_PFN:
795         hp.value = XEN_SPECIAL_PFN(XENSTORE);
796         break;
797     case HVM_PARAM_STORE_EVTCHN:
798         hp.value = xen_xenstore_get_port();
799         break;
800     default:
801         return false;
802     }
803 
804     if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
805         err = -EFAULT;
806     }
807 out:
808     exit->u.hcall.result = err;
809     return true;
810 }
811 
812 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
813                                               X86CPU *cpu, uint64_t arg)
814 {
815     struct xen_hvm_evtchn_upcall_vector up;
816     CPUState *target_cs;
817 
818     /* No need for 32/64 compat handling */
819     qemu_build_assert(sizeof(up) == 8);
820 
821     if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
822         return -EFAULT;
823     }
824 
825     if (up.vector < 0x10) {
826         return -EINVAL;
827     }
828 
829     target_cs = qemu_get_cpu(up.vcpu);
830     if (!target_cs) {
831         return -EINVAL;
832     }
833 
834     async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
835                      RUN_ON_CPU_HOST_INT(up.vector));
836     return 0;
837 }
838 
839 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
840                                  int cmd, uint64_t arg)
841 {
842     int ret = -ENOSYS;
843     switch (cmd) {
844     case HVMOP_set_evtchn_upcall_vector:
845         ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
846                                                  exit->u.hcall.params[0]);
847         break;
848 
849     case HVMOP_pagetable_dying:
850         ret = -ENOSYS;
851         break;
852 
853     case HVMOP_set_param:
854         return handle_set_param(exit, cpu, arg);
855 
856     case HVMOP_get_param:
857         return handle_get_param(exit, cpu, arg);
858 
859     default:
860         return false;
861     }
862 
863     exit->u.hcall.result = ret;
864     return true;
865 }
866 
867 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
868                                      uint64_t arg)
869 {
870     struct vcpu_register_vcpu_info rvi;
871     uint64_t gpa;
872 
873     /* No need for 32/64 compat handling */
874     qemu_build_assert(sizeof(rvi) == 16);
875     qemu_build_assert(sizeof(struct vcpu_info) == 64);
876 
877     if (!target) {
878         return -ENOENT;
879     }
880 
881     if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
882         return -EFAULT;
883     }
884 
885     if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
886         return -EINVAL;
887     }
888 
889     gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
890     async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
891     return 0;
892 }
893 
894 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
895                                           uint64_t arg)
896 {
897     struct vcpu_register_time_memory_area tma;
898     uint64_t gpa;
899     size_t len;
900 
901     /* No need for 32/64 compat handling */
902     qemu_build_assert(sizeof(tma) == 8);
903     qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
904 
905     if (!target) {
906         return -ENOENT;
907     }
908 
909     if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
910         return -EFAULT;
911     }
912 
913     /*
914      * Xen actually uses the GVA and does the translation through the guest
915      * page tables each time. But Linux/KVM uses the GPA, on the assumption
916      * that guests only ever use *global* addresses (kernel virtual addresses)
917      * for it. If Linux is changed to redo the GVA→GPA translation each time,
918      * it will offer a new vCPU attribute for that, and we'll use it instead.
919      */
920     if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
921         len < sizeof(struct vcpu_time_info)) {
922         return -EFAULT;
923     }
924 
925     async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
926                      RUN_ON_CPU_HOST_ULONG(gpa));
927     return 0;
928 }
929 
930 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
931                                          uint64_t arg)
932 {
933     struct vcpu_register_runstate_memory_area rma;
934     uint64_t gpa;
935     size_t len;
936 
937     /* No need for 32/64 compat handling */
938     qemu_build_assert(sizeof(rma) == 8);
939     /* The runstate area actually does change size, but Linux copes. */
940 
941     if (!target) {
942         return -ENOENT;
943     }
944 
945     if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
946         return -EFAULT;
947     }
948 
949     /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
950     if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
951         return -EFAULT;
952     }
953 
954     async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
955                      RUN_ON_CPU_HOST_ULONG(gpa));
956     return 0;
957 }
958 
959 static uint64_t kvm_get_current_ns(void)
960 {
961     struct kvm_clock_data data;
962     int ret;
963 
964     ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
965     if (ret < 0) {
966         fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
967                 abort();
968     }
969 
970     return data.clock;
971 }
972 
973 static void xen_vcpu_singleshot_timer_event(void *opaque)
974 {
975     CPUState *cpu = opaque;
976     CPUX86State *env = &X86_CPU(cpu)->env;
977     uint16_t port = env->xen_virq[VIRQ_TIMER];
978 
979     if (likely(port)) {
980         xen_evtchn_set_port(port);
981     }
982 
983     qemu_mutex_lock(&env->xen_timers_lock);
984     env->xen_singleshot_timer_ns = 0;
985     qemu_mutex_unlock(&env->xen_timers_lock);
986 }
987 
988 static void xen_vcpu_periodic_timer_event(void *opaque)
989 {
990     CPUState *cpu = opaque;
991     CPUX86State *env = &X86_CPU(cpu)->env;
992     uint16_t port = env->xen_virq[VIRQ_TIMER];
993     int64_t qemu_now;
994 
995     if (likely(port)) {
996         xen_evtchn_set_port(port);
997     }
998 
999     qemu_mutex_lock(&env->xen_timers_lock);
1000 
1001     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1002     timer_mod_ns(env->xen_periodic_timer,
1003                  qemu_now + env->xen_periodic_timer_period);
1004 
1005     qemu_mutex_unlock(&env->xen_timers_lock);
1006 }
1007 
1008 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
1009 {
1010     CPUX86State *tenv = &X86_CPU(target)->env;
1011     int64_t qemu_now;
1012 
1013     timer_del(tenv->xen_periodic_timer);
1014 
1015     qemu_mutex_lock(&tenv->xen_timers_lock);
1016 
1017     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1018     timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
1019     tenv->xen_periodic_timer_period = period_ns;
1020 
1021     qemu_mutex_unlock(&tenv->xen_timers_lock);
1022     return 0;
1023 }
1024 
/* Convert milliseconds / microseconds to a signed 64-bit nanosecond count. */
#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
/*
 * Largest signed 64-bit nanosecond value. The shift must be performed on
 * the unsigned value: (int64_t)~0ull is -1, and right-shifting a negative
 * value is implementation-defined (in practice it stays -1), which would
 * not give the intended maximum.
 */
#define STIME_MAX ((int64_t)((uint64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
1030 
1031 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
1032                                      uint64_t arg)
1033 {
1034     struct vcpu_set_periodic_timer spt;
1035 
1036     qemu_build_assert(sizeof(spt) == 8);
1037     if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
1038         return -EFAULT;
1039     }
1040 
1041     if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
1042         return -EINVAL;
1043     }
1044 
1045     return do_set_periodic_timer(target, spt.period_ns);
1046 }
1047 
1048 static int vcpuop_stop_periodic_timer(CPUState *target)
1049 {
1050     CPUX86State *tenv = &X86_CPU(target)->env;
1051 
1052     qemu_mutex_lock(&tenv->xen_timers_lock);
1053 
1054     timer_del(tenv->xen_periodic_timer);
1055     tenv->xen_periodic_timer_period = 0;
1056 
1057     qemu_mutex_unlock(&tenv->xen_timers_lock);
1058     return 0;
1059 }
1060 
1061 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
1062                                    bool future, bool linux_wa)
1063 {
1064     CPUX86State *env = &X86_CPU(cs)->env;
1065     int64_t now = kvm_get_current_ns();
1066     int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1067     int64_t delta = timeout_abs - now;
1068 
1069     if (future && timeout_abs < now) {
1070         return -ETIME;
1071     }
1072 
1073     if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
1074                              (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
1075         /*
1076          * Xen has a 'Linux workaround' in do_set_timer_op() which checks
1077          * for negative absolute timeout values (caused by integer
1078          * overflow), and for values about 13 days in the future (2^50ns)
1079          * which would be caused by jiffies overflow. For those cases, it
1080          * sets the timeout 100ms in the future (not *too* soon, since if
1081          * a guest really did set a long timeout on purpose we don't want
1082          * to keep churning CPU time by waking it up).
1083          */
1084         delta = (100 * SCALE_MS);
1085         timeout_abs = now + delta;
1086     }
1087 
1088     qemu_mutex_lock(&env->xen_timers_lock);
1089 
1090     timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
1091     env->xen_singleshot_timer_ns = now + delta;
1092 
1093     qemu_mutex_unlock(&env->xen_timers_lock);
1094     return 0;
1095 }
1096 
1097 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
1098 {
1099     struct vcpu_set_singleshot_timer sst = { 0 };
1100 
1101     /*
1102      * The struct is a uint64_t followed by a uint32_t. On 32-bit that
1103      * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
1104      * that get used are identical, and there's four bytes of padding
1105      * unused at the end. For true Xen compatibility we should attempt
1106      * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
1107      * if we can't get the padding too. But that's daft. Just copy what
1108      * we need.
1109      */
1110     qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
1111     qemu_build_assert(sizeof(sst) >= 12);
1112 
1113     if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
1114         return -EFAULT;
1115     }
1116 
1117     return do_set_singleshot_timer(cs, sst.timeout_abs_ns,
1118                                    !!(sst.flags & VCPU_SSHOTTMR_future),
1119                                    false);
1120 }
1121 
1122 static int vcpuop_stop_singleshot_timer(CPUState *cs)
1123 {
1124     CPUX86State *env = &X86_CPU(cs)->env;
1125 
1126     qemu_mutex_lock(&env->xen_timers_lock);
1127 
1128     timer_del(env->xen_singleshot_timer);
1129     env->xen_singleshot_timer_ns = 0;
1130 
1131     qemu_mutex_unlock(&env->xen_timers_lock);
1132     return 0;
1133 }
1134 
1135 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1136                                        uint64_t timeout)
1137 {
1138     int err;
1139 
1140     if (unlikely(timeout == 0)) {
1141         err = vcpuop_stop_singleshot_timer(CPU(cpu));
1142     } else {
1143         err = do_set_singleshot_timer(CPU(cpu), timeout, false, true);
1144     }
1145     exit->u.hcall.result = err;
1146     return true;
1147 }
1148 
1149 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1150                                   int cmd, int vcpu_id, uint64_t arg)
1151 {
1152     CPUState *cs = CPU(cpu);
1153     CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
1154     int err;
1155 
1156     if (!dest) {
1157         err = -ENOENT;
1158         goto out;
1159     }
1160 
1161     switch (cmd) {
1162     case VCPUOP_register_runstate_memory_area:
1163         err = vcpuop_register_runstate_info(cs, dest, arg);
1164         break;
1165     case VCPUOP_register_vcpu_time_memory_area:
1166         err = vcpuop_register_vcpu_time_info(cs, dest, arg);
1167         break;
1168     case VCPUOP_register_vcpu_info:
1169         err = vcpuop_register_vcpu_info(cs, dest, arg);
1170         break;
1171     case VCPUOP_set_singleshot_timer: {
1172         if (cs->cpu_index == vcpu_id) {
1173             err = vcpuop_set_singleshot_timer(dest, arg);
1174         } else {
1175             err = -EINVAL;
1176         }
1177         break;
1178     }
1179     case VCPUOP_stop_singleshot_timer:
1180         if (cs->cpu_index == vcpu_id) {
1181             err = vcpuop_stop_singleshot_timer(dest);
1182         } else {
1183             err = -EINVAL;
1184         }
1185         break;
1186     case VCPUOP_set_periodic_timer: {
1187         err = vcpuop_set_periodic_timer(cs, dest, arg);
1188         break;
1189     }
1190     case VCPUOP_stop_periodic_timer:
1191         err = vcpuop_stop_periodic_timer(dest);
1192         break;
1193 
1194     default:
1195         return false;
1196     }
1197 
1198  out:
1199     exit->u.hcall.result = err;
1200     return true;
1201 }
1202 
1203 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1204                                     int cmd, uint64_t arg)
1205 {
1206     CPUState *cs = CPU(cpu);
1207     int err = -ENOSYS;
1208 
1209     switch (cmd) {
1210     case EVTCHNOP_init_control:
1211     case EVTCHNOP_expand_array:
1212     case EVTCHNOP_set_priority:
1213         /* We do not support FIFO channels at this point */
1214         err = -ENOSYS;
1215         break;
1216 
1217     case EVTCHNOP_status: {
1218         struct evtchn_status status;
1219 
1220         qemu_build_assert(sizeof(status) == 24);
1221         if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
1222             err = -EFAULT;
1223             break;
1224         }
1225 
1226         err = xen_evtchn_status_op(&status);
1227         if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
1228             err = -EFAULT;
1229         }
1230         break;
1231     }
1232     case EVTCHNOP_close: {
1233         struct evtchn_close close;
1234 
1235         qemu_build_assert(sizeof(close) == 4);
1236         if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
1237             err = -EFAULT;
1238             break;
1239         }
1240 
1241         err = xen_evtchn_close_op(&close);
1242         break;
1243     }
1244     case EVTCHNOP_unmask: {
1245         struct evtchn_unmask unmask;
1246 
1247         qemu_build_assert(sizeof(unmask) == 4);
1248         if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
1249             err = -EFAULT;
1250             break;
1251         }
1252 
1253         err = xen_evtchn_unmask_op(&unmask);
1254         break;
1255     }
1256     case EVTCHNOP_bind_virq: {
1257         struct evtchn_bind_virq virq;
1258 
1259         qemu_build_assert(sizeof(virq) == 12);
1260         if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
1261             err = -EFAULT;
1262             break;
1263         }
1264 
1265         err = xen_evtchn_bind_virq_op(&virq);
1266         if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
1267             err = -EFAULT;
1268         }
1269         break;
1270     }
1271     case EVTCHNOP_bind_pirq: {
1272         struct evtchn_bind_pirq pirq;
1273 
1274         qemu_build_assert(sizeof(pirq) == 12);
1275         if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
1276             err = -EFAULT;
1277             break;
1278         }
1279 
1280         err = xen_evtchn_bind_pirq_op(&pirq);
1281         if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
1282             err = -EFAULT;
1283         }
1284         break;
1285     }
1286     case EVTCHNOP_bind_ipi: {
1287         struct evtchn_bind_ipi ipi;
1288 
1289         qemu_build_assert(sizeof(ipi) == 8);
1290         if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
1291             err = -EFAULT;
1292             break;
1293         }
1294 
1295         err = xen_evtchn_bind_ipi_op(&ipi);
1296         if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
1297             err = -EFAULT;
1298         }
1299         break;
1300     }
1301     case EVTCHNOP_send: {
1302         struct evtchn_send send;
1303 
1304         qemu_build_assert(sizeof(send) == 4);
1305         if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
1306             err = -EFAULT;
1307             break;
1308         }
1309 
1310         err = xen_evtchn_send_op(&send);
1311         break;
1312     }
1313     case EVTCHNOP_alloc_unbound: {
1314         struct evtchn_alloc_unbound alloc;
1315 
1316         qemu_build_assert(sizeof(alloc) == 8);
1317         if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
1318             err = -EFAULT;
1319             break;
1320         }
1321 
1322         err = xen_evtchn_alloc_unbound_op(&alloc);
1323         if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
1324             err = -EFAULT;
1325         }
1326         break;
1327     }
1328     case EVTCHNOP_bind_interdomain: {
1329         struct evtchn_bind_interdomain interdomain;
1330 
1331         qemu_build_assert(sizeof(interdomain) == 12);
1332         if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1333             err = -EFAULT;
1334             break;
1335         }
1336 
1337         err = xen_evtchn_bind_interdomain_op(&interdomain);
1338         if (!err &&
1339             kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1340             err = -EFAULT;
1341         }
1342         break;
1343     }
1344     case EVTCHNOP_bind_vcpu: {
1345         struct evtchn_bind_vcpu vcpu;
1346 
1347         qemu_build_assert(sizeof(vcpu) == 8);
1348         if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
1349             err = -EFAULT;
1350             break;
1351         }
1352 
1353         err = xen_evtchn_bind_vcpu_op(&vcpu);
1354         break;
1355     }
1356     case EVTCHNOP_reset: {
1357         struct evtchn_reset reset;
1358 
1359         qemu_build_assert(sizeof(reset) == 2);
1360         if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
1361             err = -EFAULT;
1362             break;
1363         }
1364 
1365         err = xen_evtchn_reset_op(&reset);
1366         break;
1367     }
1368     default:
1369         return false;
1370     }
1371 
1372     exit->u.hcall.result = err;
1373     return true;
1374 }
1375 
1376 int kvm_xen_soft_reset(void)
1377 {
1378     CPUState *cpu;
1379     int err;
1380 
1381     assert(qemu_mutex_iothread_locked());
1382 
1383     trace_kvm_xen_soft_reset();
1384 
1385     err = xen_evtchn_soft_reset();
1386     if (err) {
1387         return err;
1388     }
1389 
1390     /*
1391      * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
1392      * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
1393      * to deliver to the timer interrupt and treats that as 'disabled'.
1394      */
1395     err = xen_evtchn_set_callback_param(0);
1396     if (err) {
1397         return err;
1398     }
1399 
1400     CPU_FOREACH(cpu) {
1401         async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
1402     }
1403 
1404     err = xen_overlay_map_shinfo_page(INVALID_GFN);
1405     if (err) {
1406         return err;
1407     }
1408 
1409     err = xen_gnttab_reset();
1410     if (err) {
1411         return err;
1412     }
1413 
1414     err = xen_xenstore_reset();
1415     if (err) {
1416         return err;
1417     }
1418 
1419     return 0;
1420 }
1421 
1422 static int schedop_shutdown(CPUState *cs, uint64_t arg)
1423 {
1424     struct sched_shutdown shutdown;
1425     int ret = 0;
1426 
1427     /* No need for 32/64 compat handling */
1428     qemu_build_assert(sizeof(shutdown) == 4);
1429 
1430     if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
1431         return -EFAULT;
1432     }
1433 
1434     switch (shutdown.reason) {
1435     case SHUTDOWN_crash:
1436         cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
1437         qemu_system_guest_panicked(NULL);
1438         break;
1439 
1440     case SHUTDOWN_reboot:
1441         qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1442         break;
1443 
1444     case SHUTDOWN_poweroff:
1445         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
1446         break;
1447 
1448     case SHUTDOWN_soft_reset:
1449         qemu_mutex_lock_iothread();
1450         ret = kvm_xen_soft_reset();
1451         qemu_mutex_unlock_iothread();
1452         break;
1453 
1454     default:
1455         ret = -EINVAL;
1456         break;
1457     }
1458 
1459     return ret;
1460 }
1461 
1462 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1463                                    int cmd, uint64_t arg)
1464 {
1465     CPUState *cs = CPU(cpu);
1466     int err = -ENOSYS;
1467 
1468     switch (cmd) {
1469     case SCHEDOP_shutdown:
1470         err = schedop_shutdown(cs, arg);
1471         break;
1472 
1473     case SCHEDOP_poll:
1474         /*
1475          * Linux will panic if this doesn't work. Just yield; it's not
1476          * worth overthinking it because with event channel handling
1477          * in KVM, the kernel will intercept this and it will never
1478          * reach QEMU anyway. The semantics of the hypercall explicltly
1479          * permit spurious wakeups.
1480          */
1481     case SCHEDOP_yield:
1482         sched_yield();
1483         err = 0;
1484         break;
1485 
1486     default:
1487         return false;
1488     }
1489 
1490     exit->u.hcall.result = err;
1491     return true;
1492 }
1493 
1494 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1495                                     int cmd, uint64_t arg, int count)
1496 {
1497     CPUState *cs = CPU(cpu);
1498     int err;
1499 
1500     switch (cmd) {
1501     case GNTTABOP_set_version: {
1502         struct gnttab_set_version set;
1503 
1504         qemu_build_assert(sizeof(set) == 4);
1505         if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
1506             err = -EFAULT;
1507             break;
1508         }
1509 
1510         err = xen_gnttab_set_version_op(&set);
1511         if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
1512             err = -EFAULT;
1513         }
1514         break;
1515     }
1516     case GNTTABOP_get_version: {
1517         struct gnttab_get_version get;
1518 
1519         qemu_build_assert(sizeof(get) == 8);
1520         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1521             err = -EFAULT;
1522             break;
1523         }
1524 
1525         err = xen_gnttab_get_version_op(&get);
1526         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1527             err = -EFAULT;
1528         }
1529         break;
1530     }
1531     case GNTTABOP_query_size: {
1532         struct gnttab_query_size size;
1533 
1534         qemu_build_assert(sizeof(size) == 16);
1535         if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
1536             err = -EFAULT;
1537             break;
1538         }
1539 
1540         err = xen_gnttab_query_size_op(&size);
1541         if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
1542             err = -EFAULT;
1543         }
1544         break;
1545     }
1546     case GNTTABOP_setup_table:
1547     case GNTTABOP_copy:
1548     case GNTTABOP_map_grant_ref:
1549     case GNTTABOP_unmap_grant_ref:
1550     case GNTTABOP_swap_grant_ref:
1551         return false;
1552 
1553     default:
1554         /* Xen explicitly returns -ENOSYS to HVM guests for all others */
1555         err = -ENOSYS;
1556         break;
1557     }
1558 
1559     exit->u.hcall.result = err;
1560     return true;
1561 }
1562 
1563 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1564                                      int cmd, uint64_t arg)
1565 {
1566     CPUState *cs = CPU(cpu);
1567     int err;
1568 
1569     switch (cmd) {
1570     case PHYSDEVOP_map_pirq: {
1571         struct physdev_map_pirq map;
1572 
1573         if (hypercall_compat32(exit->u.hcall.longmode)) {
1574             struct compat_physdev_map_pirq *map32 = (void *)&map;
1575 
1576             if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
1577                 return -EFAULT;
1578             }
1579 
1580             /*
1581              * The only thing that's different is the alignment of the
1582              * uint64_t table_base at the end, which gets padding to make
1583              * it 64-bit aligned in the 64-bit version.
1584              */
1585             qemu_build_assert(sizeof(*map32) == 36);
1586             qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
1587                               offsetof(struct compat_physdev_map_pirq, entry_nr));
1588             memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
1589         } else {
1590             if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
1591                 err = -EFAULT;
1592                 break;
1593             }
1594         }
1595         err = xen_physdev_map_pirq(&map);
1596         /*
1597          * Since table_base is an IN parameter and won't be changed, just
1598          * copy the size of the compat structure back to the guest.
1599          */
1600         if (!err && kvm_copy_to_gva(cs, arg, &map,
1601                                     sizeof(struct compat_physdev_map_pirq))) {
1602             err = -EFAULT;
1603         }
1604         break;
1605     }
1606     case PHYSDEVOP_unmap_pirq: {
1607         struct physdev_unmap_pirq unmap;
1608 
1609         qemu_build_assert(sizeof(unmap) == 8);
1610         if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
1611             err = -EFAULT;
1612             break;
1613         }
1614 
1615         err = xen_physdev_unmap_pirq(&unmap);
1616         if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
1617             err = -EFAULT;
1618         }
1619         break;
1620     }
1621     case PHYSDEVOP_eoi: {
1622         struct physdev_eoi eoi;
1623 
1624         qemu_build_assert(sizeof(eoi) == 4);
1625         if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
1626             err = -EFAULT;
1627             break;
1628         }
1629 
1630         err = xen_physdev_eoi_pirq(&eoi);
1631         if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
1632             err = -EFAULT;
1633         }
1634         break;
1635     }
1636     case PHYSDEVOP_irq_status_query: {
1637         struct physdev_irq_status_query query;
1638 
1639         qemu_build_assert(sizeof(query) == 8);
1640         if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
1641             err = -EFAULT;
1642             break;
1643         }
1644 
1645         err = xen_physdev_query_pirq(&query);
1646         if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
1647             err = -EFAULT;
1648         }
1649         break;
1650     }
1651     case PHYSDEVOP_get_free_pirq: {
1652         struct physdev_get_free_pirq get;
1653 
1654         qemu_build_assert(sizeof(get) == 8);
1655         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1656             err = -EFAULT;
1657             break;
1658         }
1659 
1660         err = xen_physdev_get_free_pirq(&get);
1661         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1662             err = -EFAULT;
1663         }
1664         break;
1665     }
1666     case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
1667         err = -ENOSYS;
1668         break;
1669 
1670     default:
1671         return false;
1672     }
1673 
1674     exit->u.hcall.result = err;
1675     return true;
1676 }
1677 
1678 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1679 {
1680     uint16_t code = exit->u.hcall.input;
1681 
1682     if (exit->u.hcall.cpl > 0) {
1683         exit->u.hcall.result = -EPERM;
1684         return true;
1685     }
1686 
1687     switch (code) {
1688     case __HYPERVISOR_set_timer_op:
1689         if (exit->u.hcall.longmode) {
1690             return kvm_xen_hcall_set_timer_op(exit, cpu,
1691                                               exit->u.hcall.params[0]);
1692         } else {
1693             /* In 32-bit mode, the 64-bit timer value is in two args. */
1694             uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
1695                 (uint32_t)exit->u.hcall.params[0];
1696             return kvm_xen_hcall_set_timer_op(exit, cpu, val);
1697         }
1698     case __HYPERVISOR_grant_table_op:
1699         return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
1700                                        exit->u.hcall.params[1],
1701                                        exit->u.hcall.params[2]);
1702     case __HYPERVISOR_sched_op:
1703         return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
1704                                       exit->u.hcall.params[1]);
1705     case __HYPERVISOR_event_channel_op:
1706         return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
1707                                        exit->u.hcall.params[1]);
1708     case __HYPERVISOR_vcpu_op:
1709         return kvm_xen_hcall_vcpu_op(exit, cpu,
1710                                      exit->u.hcall.params[0],
1711                                      exit->u.hcall.params[1],
1712                                      exit->u.hcall.params[2]);
1713     case __HYPERVISOR_hvm_op:
1714         return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
1715                                     exit->u.hcall.params[1]);
1716     case __HYPERVISOR_memory_op:
1717         return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
1718                                        exit->u.hcall.params[1]);
1719     case __HYPERVISOR_physdev_op:
1720         return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
1721                                         exit->u.hcall.params[1]);
1722     case __HYPERVISOR_xen_version:
1723         return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
1724                                          exit->u.hcall.params[1]);
1725     default:
1726         return false;
1727     }
1728 }
1729 
1730 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1731 {
1732     if (exit->type != KVM_EXIT_XEN_HCALL) {
1733         return -1;
1734     }
1735 
1736     /*
1737      * The kernel latches the guest 32/64 mode when the MSR is used to fill
1738      * the hypercall page. So if we see a hypercall in a mode that doesn't
1739      * match our own idea of the guest mode, fetch the kernel's idea of the
1740      * "long mode" to remain in sync.
1741      */
1742     if (exit->u.hcall.longmode != xen_is_long_mode()) {
1743         xen_sync_long_mode();
1744     }
1745 
1746     if (!do_kvm_xen_handle_exit(cpu, exit)) {
1747         /*
1748          * Some hypercalls will be deliberately "implemented" by returning
1749          * -ENOSYS. This case is for hypercalls which are unexpected.
1750          */
1751         exit->u.hcall.result = -ENOSYS;
1752         qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
1753                       PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
1754                       (uint64_t)exit->u.hcall.input,
1755                       (uint64_t)exit->u.hcall.params[0],
1756                       (uint64_t)exit->u.hcall.params[1],
1757                       (uint64_t)exit->u.hcall.params[2]);
1758     }
1759 
1760     trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
1761                             exit->u.hcall.input, exit->u.hcall.params[0],
1762                             exit->u.hcall.params[1], exit->u.hcall.params[2],
1763                             exit->u.hcall.result);
1764     return 0;
1765 }
1766 
1767 uint16_t kvm_xen_get_gnttab_max_frames(void)
1768 {
1769     KVMState *s = KVM_STATE(current_accel());
1770     return s->xen_gnttab_max_frames;
1771 }
1772 
1773 uint16_t kvm_xen_get_evtchn_max_pirq(void)
1774 {
1775     KVMState *s = KVM_STATE(current_accel());
1776     return s->xen_evtchn_max_pirq;
1777 }
1778 
1779 int kvm_put_xen_state(CPUState *cs)
1780 {
1781     X86CPU *cpu = X86_CPU(cs);
1782     CPUX86State *env = &cpu->env;
1783     uint64_t gpa;
1784     int ret;
1785 
1786     gpa = env->xen_vcpu_info_gpa;
1787     if (gpa == INVALID_GPA) {
1788         gpa = env->xen_vcpu_info_default_gpa;
1789     }
1790 
1791     if (gpa != INVALID_GPA) {
1792         ret = set_vcpu_info(cs, gpa);
1793         if (ret < 0) {
1794             return ret;
1795         }
1796     }
1797 
1798     gpa = env->xen_vcpu_time_info_gpa;
1799     if (gpa != INVALID_GPA) {
1800         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
1801                                     gpa);
1802         if (ret < 0) {
1803             return ret;
1804         }
1805     }
1806 
1807     gpa = env->xen_vcpu_runstate_gpa;
1808     if (gpa != INVALID_GPA) {
1809         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1810                                     gpa);
1811         if (ret < 0) {
1812             return ret;
1813         }
1814     }
1815 
1816     if (env->xen_periodic_timer_period) {
1817         ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
1818         if (ret < 0) {
1819             return ret;
1820         }
1821     }
1822 
1823     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1824         /*
1825          * If the kernel has EVTCHN_SEND support then it handles timers too,
1826          * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
1827          */
1828         if (env->xen_singleshot_timer_ns) {
1829             ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
1830                                     false, false);
1831             if (ret < 0) {
1832                 return ret;
1833             }
1834         }
1835         return 0;
1836     }
1837 
1838     if (env->xen_vcpu_callback_vector) {
1839         ret = kvm_xen_set_vcpu_callback_vector(cs);
1840         if (ret < 0) {
1841             return ret;
1842         }
1843     }
1844 
1845     if (env->xen_virq[VIRQ_TIMER]) {
1846         ret = kvm_xen_set_vcpu_timer(cs);
1847         if (ret < 0) {
1848             return ret;
1849         }
1850     }
1851     return 0;
1852 }
1853 
1854 int kvm_get_xen_state(CPUState *cs)
1855 {
1856     X86CPU *cpu = X86_CPU(cs);
1857     CPUX86State *env = &cpu->env;
1858     uint64_t gpa;
1859     int ret;
1860 
1861     /*
1862      * The kernel does not mark vcpu_info as dirty when it delivers interrupts
1863      * to it. It's up to userspace to *assume* that any page shared thus is
1864      * always considered dirty. The shared_info page is different since it's
1865      * an overlay and migrated separately anyway.
1866      */
1867     gpa = env->xen_vcpu_info_gpa;
1868     if (gpa == INVALID_GPA) {
1869         gpa = env->xen_vcpu_info_default_gpa;
1870     }
1871     if (gpa != INVALID_GPA) {
1872         MemoryRegionSection mrs = memory_region_find(get_system_memory(),
1873                                                      gpa,
1874                                                      sizeof(struct vcpu_info));
1875         if (mrs.mr &&
1876             !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
1877             memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
1878                                     sizeof(struct vcpu_info));
1879         }
1880     }
1881 
1882     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1883         return 0;
1884     }
1885 
1886     /*
1887      * If the kernel is accelerating timers, read out the current value of the
1888      * singleshot timer deadline.
1889      */
1890     if (env->xen_virq[VIRQ_TIMER]) {
1891         struct kvm_xen_vcpu_attr va = {
1892             .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
1893         };
1894         ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
1895         if (ret < 0) {
1896             return ret;
1897         }
1898         env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
1899     }
1900 
1901     return 0;
1902 }
1903