xref: /openbmc/qemu/target/i386/kvm/xen-emu.c (revision 095859e5d97284dd3ea666c337845dc63f6ba5e7)
1 /*
2  * Xen HVM emulation support in KVM
3  *
4  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
5  * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/log.h"
14 #include "qemu/main-loop.h"
15 #include "qemu/error-report.h"
16 #include "hw/xen/xen.h"
17 #include "sysemu/kvm_int.h"
18 #include "sysemu/kvm_xen.h"
19 #include "kvm/kvm_i386.h"
20 #include "exec/address-spaces.h"
21 #include "xen-emu.h"
22 #include "trace.h"
23 #include "sysemu/runstate.h"
24 
25 #include "hw/pci/msi.h"
26 #include "hw/i386/apic-msidef.h"
27 #include "hw/i386/e820_memory_layout.h"
28 #include "hw/i386/kvm/xen_overlay.h"
29 #include "hw/i386/kvm/xen_evtchn.h"
30 #include "hw/i386/kvm/xen_gnttab.h"
31 #include "hw/i386/kvm/xen_xenstore.h"
32 
33 #include "hw/xen/interface/version.h"
34 #include "hw/xen/interface/sched.h"
35 #include "hw/xen/interface/memory.h"
36 #include "hw/xen/interface/hvm/hvm_op.h"
37 #include "hw/xen/interface/hvm/params.h"
38 #include "hw/xen/interface/vcpu.h"
39 #include "hw/xen/interface/event_channel.h"
40 #include "hw/xen/interface/grant_table.h"
41 
42 #include "xen-compat.h"
43 
/*
 * Forward declarations. The two timer callbacks are registered by
 * kvm_xen_init_vcpu() and vcpuop_stop_singleshot_timer() is called by
 * do_vcpu_soft_reset(), both ahead of the corresponding definitions.
 */
static void xen_vcpu_singleshot_timer_event(void *opaque);
static void xen_vcpu_periodic_timer_event(void *opaque);
static int vcpuop_stop_singleshot_timer(CPUState *cs);
47 
/*
 * Evaluates to true when a hypercall has to be decoded using the 32-bit
 * compat structure layouts. On a 64-bit target that is whenever the
 * 'longmode' flag of the hypercall exit is clear; a 32-bit target never
 * needs the compat path.
 */
#ifdef TARGET_X86_64
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif
53 
54 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
55                            size_t *len, bool is_write)
56 {
57         struct kvm_translation tr = {
58             .linear_address = gva,
59         };
60 
61         if (len) {
62             *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
63         }
64 
65         if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
66             (is_write && !tr.writeable)) {
67             return false;
68         }
69         *gpa = tr.physical_address;
70         return true;
71 }
72 
73 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
74                       bool is_write)
75 {
76     uint8_t *buf = (uint8_t *)_buf;
77     uint64_t gpa;
78     size_t len;
79 
80     while (sz) {
81         if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
82             return -EFAULT;
83         }
84         if (len > sz) {
85             len = sz;
86         }
87 
88         cpu_physical_memory_rw(gpa, buf, len, is_write);
89 
90         buf += len;
91         sz -= len;
92         gva += len;
93     }
94 
95     return 0;
96 }
97 
98 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
99                                     size_t sz)
100 {
101     return kvm_gva_rw(cs, gva, buf, sz, false);
102 }
103 
104 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
105                                   size_t sz)
106 {
107     return kvm_gva_rw(cs, gva, buf, sz, true);
108 }
109 
110 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
111 {
112     const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
113         KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
114     struct kvm_xen_hvm_config cfg = {
115         .msr = hypercall_msr,
116         .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
117     };
118     int xen_caps, ret;
119 
120     xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
121     if (required_caps & ~xen_caps) {
122         error_report("kvm: Xen HVM guest support not present or insufficient");
123         return -ENOSYS;
124     }
125 
126     if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
127         struct kvm_xen_hvm_attr ha = {
128             .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
129             .u.xen_version = s->xen_version,
130         };
131         (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);
132 
133         cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
134     }
135 
136     ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
137     if (ret < 0) {
138         error_report("kvm: Failed to enable Xen HVM support: %s",
139                      strerror(-ret));
140         return ret;
141     }
142 
143     /* If called a second time, don't repeat the rest of the setup. */
144     if (s->xen_caps) {
145         return 0;
146     }
147 
148     /*
149      * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
150      * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
151      *
152      * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
153      * such things to be polled at precisely the right time. We *could* do
154      * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
155      * the moment the IRQ is acked, and see if it should be reasserted.
156      *
157      * But the in-kernel irqchip is deprecated, so we're unlikely to add
158      * that support in the kernel. Insist on using the split irqchip mode
159      * instead.
160      *
161      * This leaves us polling for the level going low in QEMU, which lacks
162      * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
163      * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
164      * the device (for which it has to unmap the device and trap access, for
165      * some period after an IRQ!!). In the Xen case, we do it on exit from
166      * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
167      * Which is kind of icky, but less so than the VFIO one. I may fix them
168      * both later...
169      */
170     if (!kvm_kernel_irqchip_split()) {
171         error_report("kvm: Xen support requires kernel-irqchip=split");
172         return -EINVAL;
173     }
174 
175     s->xen_caps = xen_caps;
176 
177     /* Tell fw_cfg to notify the BIOS to reserve the range. */
178     ret = e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE,
179                          E820_RESERVED);
180     if (ret < 0) {
181         fprintf(stderr, "e820_add_entry() table is full\n");
182         return ret;
183     }
184 
185     /* The page couldn't be overlaid until KVM was initialized */
186     xen_xenstore_reset();
187 
188     return 0;
189 }
190 
191 int kvm_xen_init_vcpu(CPUState *cs)
192 {
193     X86CPU *cpu = X86_CPU(cs);
194     CPUX86State *env = &cpu->env;
195     int err;
196 
197     /*
198      * The kernel needs to know the Xen/ACPI vCPU ID because that's
199      * what the guest uses in hypercalls such as timers. It doesn't
200      * match the APIC ID which is generally used for talking to the
201      * kernel about vCPUs. And if vCPU threads race with creating
202      * their KVM vCPUs out of order, it doesn't necessarily match
203      * with the kernel's internal vCPU indices either.
204      */
205     if (kvm_xen_has_cap(EVTCHN_SEND)) {
206         struct kvm_xen_vcpu_attr va = {
207             .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
208             .u.vcpu_id = cs->cpu_index,
209         };
210         err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
211         if (err) {
212             error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
213                          strerror(-err));
214             return err;
215         }
216     }
217 
218     env->xen_vcpu_info_gpa = INVALID_GPA;
219     env->xen_vcpu_info_default_gpa = INVALID_GPA;
220     env->xen_vcpu_time_info_gpa = INVALID_GPA;
221     env->xen_vcpu_runstate_gpa = INVALID_GPA;
222 
223     qemu_mutex_init(&env->xen_timers_lock);
224     env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
225                                              xen_vcpu_singleshot_timer_event,
226                                              cpu);
227     if (!env->xen_singleshot_timer) {
228         return -ENOMEM;
229     }
230     env->xen_singleshot_timer->opaque = cs;
231 
232     env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
233                                            xen_vcpu_periodic_timer_event,
234                                            cpu);
235     if (!env->xen_periodic_timer) {
236         return -ENOMEM;
237     }
238     env->xen_periodic_timer->opaque = cs;
239 
240     return 0;
241 }
242 
243 uint32_t kvm_xen_get_caps(void)
244 {
245     return kvm_state->xen_caps;
246 }
247 
248 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
249                                      int cmd, uint64_t arg)
250 {
251     int err = 0;
252 
253     switch (cmd) {
254     case XENVER_get_features: {
255         struct xen_feature_info fi;
256 
257         /* No need for 32/64 compat handling */
258         qemu_build_assert(sizeof(fi) == 8);
259 
260         err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
261         if (err) {
262             break;
263         }
264 
265         fi.submap = 0;
266         if (fi.submap_idx == 0) {
267             fi.submap |= 1 << XENFEAT_writable_page_tables |
268                          1 << XENFEAT_writable_descriptor_tables |
269                          1 << XENFEAT_auto_translated_physmap |
270                          1 << XENFEAT_supervisor_mode_kernel |
271                          1 << XENFEAT_hvm_callback_vector |
272                          1 << XENFEAT_hvm_safe_pvclock |
273                          1 << XENFEAT_hvm_pirqs;
274         }
275 
276         err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
277         break;
278     }
279 
280     default:
281         return false;
282     }
283 
284     exit->u.hcall.result = err;
285     return true;
286 }
287 
288 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
289 {
290     struct kvm_xen_vcpu_attr xhsi;
291 
292     xhsi.type = type;
293     xhsi.u.gpa = gpa;
294 
295     trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
296 
297     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
298 }
299 
300 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
301 {
302     uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
303     struct kvm_xen_vcpu_attr xva;
304 
305     xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
306     xva.u.vector = vector;
307 
308     trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
309 
310     return kvm_vcpu_ioctl(cs, KVM_XEN_HVM_SET_ATTR, &xva);
311 }
312 
313 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
314 {
315     X86CPU *cpu = X86_CPU(cs);
316     CPUX86State *env = &cpu->env;
317 
318     env->xen_vcpu_callback_vector = data.host_int;
319 
320     if (kvm_xen_has_cap(EVTCHN_SEND)) {
321         kvm_xen_set_vcpu_callback_vector(cs);
322     }
323 }
324 
/*
 * Point KVM at the vcpu_info structure located at @gpa and refresh the
 * host pointer (env->xen_vcpu_info_hva) and memory region
 * (env->xen_vcpu_info_mr) cached for it.
 *
 * Passing INVALID_GPA clears the cached pointer and region after telling
 * KVM about the new value.
 *
 * Returns the ioctl result if that is non-zero, -EINVAL if @gpa is not
 * backed by a RAM block large enough for a struct vcpu_info, else 0.
 */
static int set_vcpu_info(CPUState *cs, uint64_t gpa)
{
    X86CPU *cpu = X86_CPU(cs);
    CPUX86State *env = &cpu->env;
    MemoryRegionSection mrs = { .mr = NULL };
    void *vcpu_info_hva = NULL;
    int ret;

    ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
    if (ret || gpa == INVALID_GPA) {
        /* Nothing to map: fall through to dropping the old mapping. */
        goto out;
    }

    mrs = memory_region_find(get_system_memory(), gpa,
                             sizeof(struct vcpu_info));
    /* Only accept RAM that covers the whole structure. */
    if (mrs.mr && mrs.mr->ram_block &&
        !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
        vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
                                         mrs.offset_within_region);
    }
    if (!vcpu_info_hva) {
        /* Unusable region: release it so it is not cached below. */
        if (mrs.mr) {
            memory_region_unref(mrs.mr);
            mrs.mr = NULL;
        }
        ret = -EINVAL;
    }

 out:
    /*
     * Release the previously cached region, then store the new pointer and
     * region (both NULL on failure or when clearing).
     */
    if (env->xen_vcpu_info_mr) {
        memory_region_unref(env->xen_vcpu_info_mr);
    }
    env->xen_vcpu_info_hva = vcpu_info_hva;
    env->xen_vcpu_info_mr = mrs.mr;
    return ret;
}
361 
362 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
363 {
364     X86CPU *cpu = X86_CPU(cs);
365     CPUX86State *env = &cpu->env;
366 
367     env->xen_vcpu_info_default_gpa = data.host_ulong;
368 
369     /* Changing the default does nothing if a vcpu_info was explicitly set. */
370     if (env->xen_vcpu_info_gpa == INVALID_GPA) {
371         set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
372     }
373 }
374 
375 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
376 {
377     X86CPU *cpu = X86_CPU(cs);
378     CPUX86State *env = &cpu->env;
379 
380     env->xen_vcpu_info_gpa = data.host_ulong;
381 
382     set_vcpu_info(cs, env->xen_vcpu_info_gpa);
383 }
384 
385 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
386 {
387     CPUState *cs = qemu_get_cpu(vcpu_id);
388     if (!cs) {
389         return NULL;
390     }
391 
392     return X86_CPU(cs)->env.xen_vcpu_info_hva;
393 }
394 
395 void kvm_xen_maybe_deassert_callback(CPUState *cs)
396 {
397     CPUX86State *env = &X86_CPU(cs)->env;
398     struct vcpu_info *vi = env->xen_vcpu_info_hva;
399     if (!vi) {
400         return;
401     }
402 
403     /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
404     if (!vi->evtchn_upcall_pending) {
405         qemu_mutex_lock_iothread();
406         /*
407          * Check again now we have the lock, because it may have been
408          * asserted in the interim. And we don't want to take the lock
409          * every time because this is a fast path.
410          */
411         if (!vi->evtchn_upcall_pending) {
412             X86_CPU(cs)->env.xen_callback_asserted = false;
413             xen_evtchn_set_callback_level(0);
414         }
415         qemu_mutex_unlock_iothread();
416     }
417 }
418 
419 void kvm_xen_set_callback_asserted(void)
420 {
421     CPUState *cs = qemu_get_cpu(0);
422 
423     if (cs) {
424         X86_CPU(cs)->env.xen_callback_asserted = true;
425     }
426 }
427 
428 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
429 {
430     CPUState *cs = qemu_get_cpu(vcpu_id);
431     uint8_t vector;
432 
433     if (!cs) {
434         return;
435     }
436 
437     vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
438     if (vector) {
439         /*
440          * The per-vCPU callback vector injected via lapic. Just
441          * deliver it as an MSI.
442          */
443         MSIMessage msg = {
444             .address = APIC_DEFAULT_ADDRESS | X86_CPU(cs)->apic_id,
445             .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
446         };
447         kvm_irqchip_send_msi(kvm_state, msg);
448         return;
449     }
450 
451     switch (type) {
452     case HVM_PARAM_CALLBACK_TYPE_VECTOR:
453         /*
454          * If the evtchn_upcall_pending field in the vcpu_info is set, then
455          * KVM will automatically deliver the vector on entering the vCPU
456          * so all we have to do is kick it out.
457          */
458         qemu_cpu_kick(cs);
459         break;
460 
461     case HVM_PARAM_CALLBACK_TYPE_GSI:
462     case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
463         if (vcpu_id == 0) {
464             xen_evtchn_set_callback_level(1);
465         }
466         break;
467     }
468 }
469 
470 /* Must always be called with xen_timers_lock held */
471 static int kvm_xen_set_vcpu_timer(CPUState *cs)
472 {
473     X86CPU *cpu = X86_CPU(cs);
474     CPUX86State *env = &cpu->env;
475 
476     struct kvm_xen_vcpu_attr va = {
477         .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
478         .u.timer.port = env->xen_virq[VIRQ_TIMER],
479         .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
480         .u.timer.expires_ns = env->xen_singleshot_timer_ns,
481     };
482 
483     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
484 }
485 
486 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
487 {
488     QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
489     kvm_xen_set_vcpu_timer(cs);
490 }
491 
492 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
493 {
494     CPUState *cs = qemu_get_cpu(vcpu_id);
495 
496     if (!cs) {
497         return -ENOENT;
498     }
499 
500     /* cpu.h doesn't include the actual Xen header. */
501     qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);
502 
503     if (virq >= NR_VIRQS) {
504         return -EINVAL;
505     }
506 
507     if (port && X86_CPU(cs)->env.xen_virq[virq]) {
508         return -EEXIST;
509     }
510 
511     X86_CPU(cs)->env.xen_virq[virq] = port;
512     if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
513         async_run_on_cpu(cs, do_set_vcpu_timer_virq,
514                          RUN_ON_CPU_HOST_INT(port));
515     }
516     return 0;
517 }
518 
519 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
520 {
521     X86CPU *cpu = X86_CPU(cs);
522     CPUX86State *env = &cpu->env;
523 
524     env->xen_vcpu_time_info_gpa = data.host_ulong;
525 
526     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
527                           env->xen_vcpu_time_info_gpa);
528 }
529 
530 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
531 {
532     X86CPU *cpu = X86_CPU(cs);
533     CPUX86State *env = &cpu->env;
534 
535     env->xen_vcpu_runstate_gpa = data.host_ulong;
536 
537     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
538                           env->xen_vcpu_runstate_gpa);
539 }
540 
541 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
542 {
543     X86CPU *cpu = X86_CPU(cs);
544     CPUX86State *env = &cpu->env;
545 
546     env->xen_vcpu_info_gpa = INVALID_GPA;
547     env->xen_vcpu_info_default_gpa = INVALID_GPA;
548     env->xen_vcpu_time_info_gpa = INVALID_GPA;
549     env->xen_vcpu_runstate_gpa = INVALID_GPA;
550     env->xen_vcpu_callback_vector = 0;
551     memset(env->xen_virq, 0, sizeof(env->xen_virq));
552 
553     set_vcpu_info(cs, INVALID_GPA);
554     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
555                           INVALID_GPA);
556     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
557                           INVALID_GPA);
558     if (kvm_xen_has_cap(EVTCHN_SEND)) {
559         kvm_xen_set_vcpu_callback_vector(cs);
560 
561         QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
562         env->xen_singleshot_timer_ns = 0;
563         kvm_xen_set_vcpu_timer(cs);
564     } else {
565         vcpuop_stop_singleshot_timer(cs);
566     };
567 
568 }
569 
570 static int xen_set_shared_info(uint64_t gfn)
571 {
572     uint64_t gpa = gfn << TARGET_PAGE_BITS;
573     int i, err;
574 
575     QEMU_IOTHREAD_LOCK_GUARD();
576 
577     /*
578      * The xen_overlay device tells KVM about it too, since it had to
579      * do that on migration load anyway (unless we're going to jump
580      * through lots of hoops to maintain the fiction that this isn't
581      * KVM-specific.
582      */
583     err = xen_overlay_map_shinfo_page(gpa);
584     if (err) {
585             return err;
586     }
587 
588     trace_kvm_xen_set_shared_info(gfn);
589 
590     for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
591         CPUState *cpu = qemu_get_cpu(i);
592         if (cpu) {
593             async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
594                              RUN_ON_CPU_HOST_ULONG(gpa));
595         }
596         gpa += sizeof(vcpu_info_t);
597     }
598 
599     return err;
600 }
601 
602 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
603 {
604     switch (space) {
605     case XENMAPSPACE_shared_info:
606         if (idx > 0) {
607             return -EINVAL;
608         }
609         return xen_set_shared_info(gfn);
610 
611     case XENMAPSPACE_grant_table:
612         return xen_gnttab_map_page(idx, gfn);
613 
614     case XENMAPSPACE_gmfn:
615     case XENMAPSPACE_gmfn_range:
616         return -ENOTSUP;
617 
618     case XENMAPSPACE_gmfn_foreign:
619     case XENMAPSPACE_dev_mmio:
620         return -EPERM;
621 
622     default:
623         return -EINVAL;
624     }
625 }
626 
627 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
628                              uint64_t arg)
629 {
630     struct xen_add_to_physmap xatp;
631     CPUState *cs = CPU(cpu);
632 
633     if (hypercall_compat32(exit->u.hcall.longmode)) {
634         struct compat_xen_add_to_physmap xatp32;
635 
636         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
637         if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
638             return -EFAULT;
639         }
640         xatp.domid = xatp32.domid;
641         xatp.size = xatp32.size;
642         xatp.space = xatp32.space;
643         xatp.idx = xatp32.idx;
644         xatp.gpfn = xatp32.gpfn;
645     } else {
646         if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
647             return -EFAULT;
648         }
649     }
650 
651     if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
652         return -ESRCH;
653     }
654 
655     return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
656 }
657 
658 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
659                                    uint64_t arg)
660 {
661     struct xen_add_to_physmap_batch xatpb;
662     unsigned long idxs_gva, gpfns_gva, errs_gva;
663     CPUState *cs = CPU(cpu);
664     size_t op_sz;
665 
666     if (hypercall_compat32(exit->u.hcall.longmode)) {
667         struct compat_xen_add_to_physmap_batch xatpb32;
668 
669         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
670         if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
671             return -EFAULT;
672         }
673         xatpb.domid = xatpb32.domid;
674         xatpb.space = xatpb32.space;
675         xatpb.size = xatpb32.size;
676 
677         idxs_gva = xatpb32.idxs.c;
678         gpfns_gva = xatpb32.gpfns.c;
679         errs_gva = xatpb32.errs.c;
680         op_sz = sizeof(uint32_t);
681     } else {
682         if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
683             return -EFAULT;
684         }
685         op_sz = sizeof(unsigned long);
686         idxs_gva = (unsigned long)xatpb.idxs.p;
687         gpfns_gva = (unsigned long)xatpb.gpfns.p;
688         errs_gva = (unsigned long)xatpb.errs.p;
689     }
690 
691     if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
692         return -ESRCH;
693     }
694 
695     /* Explicitly invalid for the batch op. Not that we implement it anyway. */
696     if (xatpb.space == XENMAPSPACE_gmfn_range) {
697         return -EINVAL;
698     }
699 
700     while (xatpb.size--) {
701         unsigned long idx = 0;
702         unsigned long gpfn = 0;
703         int err;
704 
705         /* For 32-bit compat this only copies the low 32 bits of each */
706         if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
707             kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
708             return -EFAULT;
709         }
710         idxs_gva += op_sz;
711         gpfns_gva += op_sz;
712 
713         err = add_to_physmap_one(xatpb.space, idx, gpfn);
714 
715         if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
716             return -EFAULT;
717         }
718         errs_gva += sizeof(err);
719     }
720     return 0;
721 }
722 
723 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
724                                    int cmd, uint64_t arg)
725 {
726     int err;
727 
728     switch (cmd) {
729     case XENMEM_add_to_physmap:
730         err = do_add_to_physmap(exit, cpu, arg);
731         break;
732 
733     case XENMEM_add_to_physmap_batch:
734         err = do_add_to_physmap_batch(exit, cpu, arg);
735         break;
736 
737     default:
738         return false;
739     }
740 
741     exit->u.hcall.result = err;
742     return true;
743 }
744 
745 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
746                              uint64_t arg)
747 {
748     CPUState *cs = CPU(cpu);
749     struct xen_hvm_param hp;
750     int err = 0;
751 
752     /* No need for 32/64 compat handling */
753     qemu_build_assert(sizeof(hp) == 16);
754 
755     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
756         err = -EFAULT;
757         goto out;
758     }
759 
760     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
761         err = -ESRCH;
762         goto out;
763     }
764 
765     switch (hp.index) {
766     case HVM_PARAM_CALLBACK_IRQ:
767         qemu_mutex_lock_iothread();
768         err = xen_evtchn_set_callback_param(hp.value);
769         qemu_mutex_unlock_iothread();
770         xen_set_long_mode(exit->u.hcall.longmode);
771         break;
772     default:
773         return false;
774     }
775 
776 out:
777     exit->u.hcall.result = err;
778     return true;
779 }
780 
781 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
782                              uint64_t arg)
783 {
784     CPUState *cs = CPU(cpu);
785     struct xen_hvm_param hp;
786     int err = 0;
787 
788     /* No need for 32/64 compat handling */
789     qemu_build_assert(sizeof(hp) == 16);
790 
791     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
792         err = -EFAULT;
793         goto out;
794     }
795 
796     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
797         err = -ESRCH;
798         goto out;
799     }
800 
801     switch (hp.index) {
802     case HVM_PARAM_STORE_PFN:
803         hp.value = XEN_SPECIAL_PFN(XENSTORE);
804         break;
805     case HVM_PARAM_STORE_EVTCHN:
806         hp.value = xen_xenstore_get_port();
807         break;
808     default:
809         return false;
810     }
811 
812     if (kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
813         err = -EFAULT;
814     }
815 out:
816     exit->u.hcall.result = err;
817     return true;
818 }
819 
820 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
821                                               X86CPU *cpu, uint64_t arg)
822 {
823     struct xen_hvm_evtchn_upcall_vector up;
824     CPUState *target_cs;
825 
826     /* No need for 32/64 compat handling */
827     qemu_build_assert(sizeof(up) == 8);
828 
829     if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
830         return -EFAULT;
831     }
832 
833     if (up.vector < 0x10) {
834         return -EINVAL;
835     }
836 
837     target_cs = qemu_get_cpu(up.vcpu);
838     if (!target_cs) {
839         return -EINVAL;
840     }
841 
842     async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
843                      RUN_ON_CPU_HOST_INT(up.vector));
844     return 0;
845 }
846 
847 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
848                                  int cmd, uint64_t arg)
849 {
850     int ret = -ENOSYS;
851     switch (cmd) {
852     case HVMOP_set_evtchn_upcall_vector:
853         ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu,
854                                                  exit->u.hcall.params[0]);
855         break;
856 
857     case HVMOP_pagetable_dying:
858         ret = -ENOSYS;
859         break;
860 
861     case HVMOP_set_param:
862         return handle_set_param(exit, cpu, arg);
863 
864     case HVMOP_get_param:
865         return handle_get_param(exit, cpu, arg);
866 
867     default:
868         return false;
869     }
870 
871     exit->u.hcall.result = ret;
872     return true;
873 }
874 
875 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
876                                      uint64_t arg)
877 {
878     struct vcpu_register_vcpu_info rvi;
879     uint64_t gpa;
880 
881     /* No need for 32/64 compat handling */
882     qemu_build_assert(sizeof(rvi) == 16);
883     qemu_build_assert(sizeof(struct vcpu_info) == 64);
884 
885     if (!target) {
886         return -ENOENT;
887     }
888 
889     if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
890         return -EFAULT;
891     }
892 
893     if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
894         return -EINVAL;
895     }
896 
897     gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
898     async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
899     return 0;
900 }
901 
902 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
903                                           uint64_t arg)
904 {
905     struct vcpu_register_time_memory_area tma;
906     uint64_t gpa;
907     size_t len;
908 
909     /* No need for 32/64 compat handling */
910     qemu_build_assert(sizeof(tma) == 8);
911     qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
912 
913     if (!target) {
914         return -ENOENT;
915     }
916 
917     if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
918         return -EFAULT;
919     }
920 
921     /*
922      * Xen actually uses the GVA and does the translation through the guest
923      * page tables each time. But Linux/KVM uses the GPA, on the assumption
924      * that guests only ever use *global* addresses (kernel virtual addresses)
925      * for it. If Linux is changed to redo the GVA→GPA translation each time,
926      * it will offer a new vCPU attribute for that, and we'll use it instead.
927      */
928     if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
929         len < sizeof(struct vcpu_time_info)) {
930         return -EFAULT;
931     }
932 
933     async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
934                      RUN_ON_CPU_HOST_ULONG(gpa));
935     return 0;
936 }
937 
938 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
939                                          uint64_t arg)
940 {
941     struct vcpu_register_runstate_memory_area rma;
942     uint64_t gpa;
943     size_t len;
944 
945     /* No need for 32/64 compat handling */
946     qemu_build_assert(sizeof(rma) == 8);
947     /* The runstate area actually does change size, but Linux copes. */
948 
949     if (!target) {
950         return -ENOENT;
951     }
952 
953     if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
954         return -EFAULT;
955     }
956 
957     /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
958     if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
959         return -EFAULT;
960     }
961 
962     async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
963                      RUN_ON_CPU_HOST_ULONG(gpa));
964     return 0;
965 }
966 
967 static uint64_t kvm_get_current_ns(void)
968 {
969     struct kvm_clock_data data;
970     int ret;
971 
972     ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
973     if (ret < 0) {
974         fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
975                 abort();
976     }
977 
978     return data.clock;
979 }
980 
981 static void xen_vcpu_singleshot_timer_event(void *opaque)
982 {
983     CPUState *cpu = opaque;
984     CPUX86State *env = &X86_CPU(cpu)->env;
985     uint16_t port = env->xen_virq[VIRQ_TIMER];
986 
987     if (likely(port)) {
988         xen_evtchn_set_port(port);
989     }
990 
991     qemu_mutex_lock(&env->xen_timers_lock);
992     env->xen_singleshot_timer_ns = 0;
993     qemu_mutex_unlock(&env->xen_timers_lock);
994 }
995 
996 static void xen_vcpu_periodic_timer_event(void *opaque)
997 {
998     CPUState *cpu = opaque;
999     CPUX86State *env = &X86_CPU(cpu)->env;
1000     uint16_t port = env->xen_virq[VIRQ_TIMER];
1001     int64_t qemu_now;
1002 
1003     if (likely(port)) {
1004         xen_evtchn_set_port(port);
1005     }
1006 
1007     qemu_mutex_lock(&env->xen_timers_lock);
1008 
1009     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1010     timer_mod_ns(env->xen_periodic_timer,
1011                  qemu_now + env->xen_periodic_timer_period);
1012 
1013     qemu_mutex_unlock(&env->xen_timers_lock);
1014 }
1015 
1016 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
1017 {
1018     CPUX86State *tenv = &X86_CPU(target)->env;
1019     int64_t qemu_now;
1020 
1021     timer_del(tenv->xen_periodic_timer);
1022 
1023     qemu_mutex_lock(&tenv->xen_timers_lock);
1024 
1025     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1026     timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
1027     tenv->xen_periodic_timer_period = period_ns;
1028 
1029     qemu_mutex_unlock(&tenv->xen_timers_lock);
1030     return 0;
1031 }
1032 
/* Convert a count of milliseconds / microseconds to nanoseconds. */
#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
/*
 * Largest positive 64-bit signed time value. The shift has to be done on
 * the unsigned value: right-shifting the signed value -1 is
 * implementation-defined and typically leaves it at -1.
 */
#define STIME_MAX ((time_t)((uint64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
1038 
1039 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
1040                                      uint64_t arg)
1041 {
1042     struct vcpu_set_periodic_timer spt;
1043 
1044     qemu_build_assert(sizeof(spt) == 8);
1045     if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
1046         return -EFAULT;
1047     }
1048 
1049     if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
1050         return -EINVAL;
1051     }
1052 
1053     return do_set_periodic_timer(target, spt.period_ns);
1054 }
1055 
1056 static int vcpuop_stop_periodic_timer(CPUState *target)
1057 {
1058     CPUX86State *tenv = &X86_CPU(target)->env;
1059 
1060     qemu_mutex_lock(&tenv->xen_timers_lock);
1061 
1062     timer_del(tenv->xen_periodic_timer);
1063     tenv->xen_periodic_timer_period = 0;
1064 
1065     qemu_mutex_unlock(&tenv->xen_timers_lock);
1066     return 0;
1067 }
1068 
1069 /*
1070  * Userspace handling of timer, for older kernels.
1071  * Must always be called with xen_timers_lock held.
1072  */
1073 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
1074                                    bool future, bool linux_wa)
1075 {
1076     CPUX86State *env = &X86_CPU(cs)->env;
1077     int64_t now = kvm_get_current_ns();
1078     int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1079     int64_t delta = timeout_abs - now;
1080 
1081     if (future && timeout_abs < now) {
1082         return -ETIME;
1083     }
1084 
1085     if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
1086                              (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
1087         /*
1088          * Xen has a 'Linux workaround' in do_set_timer_op() which checks
1089          * for negative absolute timeout values (caused by integer
1090          * overflow), and for values about 13 days in the future (2^50ns)
1091          * which would be caused by jiffies overflow. For those cases, it
1092          * sets the timeout 100ms in the future (not *too* soon, since if
1093          * a guest really did set a long timeout on purpose we don't want
1094          * to keep churning CPU time by waking it up).
1095          */
1096         delta = (100 * SCALE_MS);
1097         timeout_abs = now + delta;
1098     }
1099 
1100     timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
1101     env->xen_singleshot_timer_ns = now + delta;
1102     return 0;
1103 }
1104 
1105 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
1106 {
1107     struct vcpu_set_singleshot_timer sst = { 0 };
1108 
1109     /*
1110      * The struct is a uint64_t followed by a uint32_t. On 32-bit that
1111      * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
1112      * that get used are identical, and there's four bytes of padding
1113      * unused at the end. For true Xen compatibility we should attempt
1114      * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
1115      * if we can't get the padding too. But that's daft. Just copy what
1116      * we need.
1117      */
1118     qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
1119     qemu_build_assert(sizeof(sst) >= 12);
1120 
1121     if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
1122         return -EFAULT;
1123     }
1124 
1125     QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1126     return do_set_singleshot_timer(cs, sst.timeout_abs_ns,
1127                                    !!(sst.flags & VCPU_SSHOTTMR_future),
1128                                    false);
1129 }
1130 
1131 static int vcpuop_stop_singleshot_timer(CPUState *cs)
1132 {
1133     CPUX86State *env = &X86_CPU(cs)->env;
1134 
1135     qemu_mutex_lock(&env->xen_timers_lock);
1136 
1137     timer_del(env->xen_singleshot_timer);
1138     env->xen_singleshot_timer_ns = 0;
1139 
1140     qemu_mutex_unlock(&env->xen_timers_lock);
1141     return 0;
1142 }
1143 
1144 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1145                                        uint64_t timeout)
1146 {
1147     int err;
1148 
1149     if (unlikely(timeout == 0)) {
1150         err = vcpuop_stop_singleshot_timer(CPU(cpu));
1151     } else {
1152         QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
1153         err = do_set_singleshot_timer(CPU(cpu), timeout, false, true);
1154     }
1155     exit->u.hcall.result = err;
1156     return true;
1157 }
1158 
1159 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1160                                   int cmd, int vcpu_id, uint64_t arg)
1161 {
1162     CPUState *cs = CPU(cpu);
1163     CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
1164     int err;
1165 
1166     if (!dest) {
1167         err = -ENOENT;
1168         goto out;
1169     }
1170 
1171     switch (cmd) {
1172     case VCPUOP_register_runstate_memory_area:
1173         err = vcpuop_register_runstate_info(cs, dest, arg);
1174         break;
1175     case VCPUOP_register_vcpu_time_memory_area:
1176         err = vcpuop_register_vcpu_time_info(cs, dest, arg);
1177         break;
1178     case VCPUOP_register_vcpu_info:
1179         err = vcpuop_register_vcpu_info(cs, dest, arg);
1180         break;
1181     case VCPUOP_set_singleshot_timer: {
1182         if (cs->cpu_index == vcpu_id) {
1183             err = vcpuop_set_singleshot_timer(dest, arg);
1184         } else {
1185             err = -EINVAL;
1186         }
1187         break;
1188     }
1189     case VCPUOP_stop_singleshot_timer:
1190         if (cs->cpu_index == vcpu_id) {
1191             err = vcpuop_stop_singleshot_timer(dest);
1192         } else {
1193             err = -EINVAL;
1194         }
1195         break;
1196     case VCPUOP_set_periodic_timer: {
1197         err = vcpuop_set_periodic_timer(cs, dest, arg);
1198         break;
1199     }
1200     case VCPUOP_stop_periodic_timer:
1201         err = vcpuop_stop_periodic_timer(dest);
1202         break;
1203 
1204     default:
1205         return false;
1206     }
1207 
1208  out:
1209     exit->u.hcall.result = err;
1210     return true;
1211 }
1212 
1213 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1214                                     int cmd, uint64_t arg)
1215 {
1216     CPUState *cs = CPU(cpu);
1217     int err = -ENOSYS;
1218 
1219     switch (cmd) {
1220     case EVTCHNOP_init_control:
1221     case EVTCHNOP_expand_array:
1222     case EVTCHNOP_set_priority:
1223         /* We do not support FIFO channels at this point */
1224         err = -ENOSYS;
1225         break;
1226 
1227     case EVTCHNOP_status: {
1228         struct evtchn_status status;
1229 
1230         qemu_build_assert(sizeof(status) == 24);
1231         if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
1232             err = -EFAULT;
1233             break;
1234         }
1235 
1236         err = xen_evtchn_status_op(&status);
1237         if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
1238             err = -EFAULT;
1239         }
1240         break;
1241     }
1242     case EVTCHNOP_close: {
1243         struct evtchn_close close;
1244 
1245         qemu_build_assert(sizeof(close) == 4);
1246         if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
1247             err = -EFAULT;
1248             break;
1249         }
1250 
1251         err = xen_evtchn_close_op(&close);
1252         break;
1253     }
1254     case EVTCHNOP_unmask: {
1255         struct evtchn_unmask unmask;
1256 
1257         qemu_build_assert(sizeof(unmask) == 4);
1258         if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
1259             err = -EFAULT;
1260             break;
1261         }
1262 
1263         err = xen_evtchn_unmask_op(&unmask);
1264         break;
1265     }
1266     case EVTCHNOP_bind_virq: {
1267         struct evtchn_bind_virq virq;
1268 
1269         qemu_build_assert(sizeof(virq) == 12);
1270         if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
1271             err = -EFAULT;
1272             break;
1273         }
1274 
1275         err = xen_evtchn_bind_virq_op(&virq);
1276         if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
1277             err = -EFAULT;
1278         }
1279         break;
1280     }
1281     case EVTCHNOP_bind_pirq: {
1282         struct evtchn_bind_pirq pirq;
1283 
1284         qemu_build_assert(sizeof(pirq) == 12);
1285         if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
1286             err = -EFAULT;
1287             break;
1288         }
1289 
1290         err = xen_evtchn_bind_pirq_op(&pirq);
1291         if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
1292             err = -EFAULT;
1293         }
1294         break;
1295     }
1296     case EVTCHNOP_bind_ipi: {
1297         struct evtchn_bind_ipi ipi;
1298 
1299         qemu_build_assert(sizeof(ipi) == 8);
1300         if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
1301             err = -EFAULT;
1302             break;
1303         }
1304 
1305         err = xen_evtchn_bind_ipi_op(&ipi);
1306         if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
1307             err = -EFAULT;
1308         }
1309         break;
1310     }
1311     case EVTCHNOP_send: {
1312         struct evtchn_send send;
1313 
1314         qemu_build_assert(sizeof(send) == 4);
1315         if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
1316             err = -EFAULT;
1317             break;
1318         }
1319 
1320         err = xen_evtchn_send_op(&send);
1321         break;
1322     }
1323     case EVTCHNOP_alloc_unbound: {
1324         struct evtchn_alloc_unbound alloc;
1325 
1326         qemu_build_assert(sizeof(alloc) == 8);
1327         if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
1328             err = -EFAULT;
1329             break;
1330         }
1331 
1332         err = xen_evtchn_alloc_unbound_op(&alloc);
1333         if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
1334             err = -EFAULT;
1335         }
1336         break;
1337     }
1338     case EVTCHNOP_bind_interdomain: {
1339         struct evtchn_bind_interdomain interdomain;
1340 
1341         qemu_build_assert(sizeof(interdomain) == 12);
1342         if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1343             err = -EFAULT;
1344             break;
1345         }
1346 
1347         err = xen_evtchn_bind_interdomain_op(&interdomain);
1348         if (!err &&
1349             kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1350             err = -EFAULT;
1351         }
1352         break;
1353     }
1354     case EVTCHNOP_bind_vcpu: {
1355         struct evtchn_bind_vcpu vcpu;
1356 
1357         qemu_build_assert(sizeof(vcpu) == 8);
1358         if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
1359             err = -EFAULT;
1360             break;
1361         }
1362 
1363         err = xen_evtchn_bind_vcpu_op(&vcpu);
1364         break;
1365     }
1366     case EVTCHNOP_reset: {
1367         struct evtchn_reset reset;
1368 
1369         qemu_build_assert(sizeof(reset) == 2);
1370         if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
1371             err = -EFAULT;
1372             break;
1373         }
1374 
1375         err = xen_evtchn_reset_op(&reset);
1376         break;
1377     }
1378     default:
1379         return false;
1380     }
1381 
1382     exit->u.hcall.result = err;
1383     return true;
1384 }
1385 
1386 int kvm_xen_soft_reset(void)
1387 {
1388     CPUState *cpu;
1389     int err;
1390 
1391     assert(qemu_mutex_iothread_locked());
1392 
1393     trace_kvm_xen_soft_reset();
1394 
1395     err = xen_evtchn_soft_reset();
1396     if (err) {
1397         return err;
1398     }
1399 
1400     /*
1401      * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
1402      * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
1403      * to deliver to the timer interrupt and treats that as 'disabled'.
1404      */
1405     err = xen_evtchn_set_callback_param(0);
1406     if (err) {
1407         return err;
1408     }
1409 
1410     CPU_FOREACH(cpu) {
1411         async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
1412     }
1413 
1414     err = xen_overlay_map_shinfo_page(INVALID_GFN);
1415     if (err) {
1416         return err;
1417     }
1418 
1419     err = xen_gnttab_reset();
1420     if (err) {
1421         return err;
1422     }
1423 
1424     err = xen_xenstore_reset();
1425     if (err) {
1426         return err;
1427     }
1428 
1429     return 0;
1430 }
1431 
1432 static int schedop_shutdown(CPUState *cs, uint64_t arg)
1433 {
1434     struct sched_shutdown shutdown;
1435     int ret = 0;
1436 
1437     /* No need for 32/64 compat handling */
1438     qemu_build_assert(sizeof(shutdown) == 4);
1439 
1440     if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
1441         return -EFAULT;
1442     }
1443 
1444     switch (shutdown.reason) {
1445     case SHUTDOWN_crash:
1446         cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
1447         qemu_system_guest_panicked(NULL);
1448         break;
1449 
1450     case SHUTDOWN_reboot:
1451         qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1452         break;
1453 
1454     case SHUTDOWN_poweroff:
1455         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
1456         break;
1457 
1458     case SHUTDOWN_soft_reset:
1459         qemu_mutex_lock_iothread();
1460         ret = kvm_xen_soft_reset();
1461         qemu_mutex_unlock_iothread();
1462         break;
1463 
1464     default:
1465         ret = -EINVAL;
1466         break;
1467     }
1468 
1469     return ret;
1470 }
1471 
1472 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1473                                    int cmd, uint64_t arg)
1474 {
1475     CPUState *cs = CPU(cpu);
1476     int err = -ENOSYS;
1477 
1478     switch (cmd) {
1479     case SCHEDOP_shutdown:
1480         err = schedop_shutdown(cs, arg);
1481         break;
1482 
1483     case SCHEDOP_poll:
1484         /*
1485          * Linux will panic if this doesn't work. Just yield; it's not
1486          * worth overthinking it because with event channel handling
1487          * in KVM, the kernel will intercept this and it will never
1488          * reach QEMU anyway. The semantics of the hypercall explicltly
1489          * permit spurious wakeups.
1490          */
1491     case SCHEDOP_yield:
1492         sched_yield();
1493         err = 0;
1494         break;
1495 
1496     default:
1497         return false;
1498     }
1499 
1500     exit->u.hcall.result = err;
1501     return true;
1502 }
1503 
1504 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1505                                     int cmd, uint64_t arg, int count)
1506 {
1507     CPUState *cs = CPU(cpu);
1508     int err;
1509 
1510     switch (cmd) {
1511     case GNTTABOP_set_version: {
1512         struct gnttab_set_version set;
1513 
1514         qemu_build_assert(sizeof(set) == 4);
1515         if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
1516             err = -EFAULT;
1517             break;
1518         }
1519 
1520         err = xen_gnttab_set_version_op(&set);
1521         if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
1522             err = -EFAULT;
1523         }
1524         break;
1525     }
1526     case GNTTABOP_get_version: {
1527         struct gnttab_get_version get;
1528 
1529         qemu_build_assert(sizeof(get) == 8);
1530         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1531             err = -EFAULT;
1532             break;
1533         }
1534 
1535         err = xen_gnttab_get_version_op(&get);
1536         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1537             err = -EFAULT;
1538         }
1539         break;
1540     }
1541     case GNTTABOP_query_size: {
1542         struct gnttab_query_size size;
1543 
1544         qemu_build_assert(sizeof(size) == 16);
1545         if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
1546             err = -EFAULT;
1547             break;
1548         }
1549 
1550         err = xen_gnttab_query_size_op(&size);
1551         if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
1552             err = -EFAULT;
1553         }
1554         break;
1555     }
1556     case GNTTABOP_setup_table:
1557     case GNTTABOP_copy:
1558     case GNTTABOP_map_grant_ref:
1559     case GNTTABOP_unmap_grant_ref:
1560     case GNTTABOP_swap_grant_ref:
1561         return false;
1562 
1563     default:
1564         /* Xen explicitly returns -ENOSYS to HVM guests for all others */
1565         err = -ENOSYS;
1566         break;
1567     }
1568 
1569     exit->u.hcall.result = err;
1570     return true;
1571 }
1572 
1573 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1574                                      int cmd, uint64_t arg)
1575 {
1576     CPUState *cs = CPU(cpu);
1577     int err;
1578 
1579     switch (cmd) {
1580     case PHYSDEVOP_map_pirq: {
1581         struct physdev_map_pirq map;
1582 
1583         if (hypercall_compat32(exit->u.hcall.longmode)) {
1584             struct compat_physdev_map_pirq *map32 = (void *)&map;
1585 
1586             if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
1587                 return -EFAULT;
1588             }
1589 
1590             /*
1591              * The only thing that's different is the alignment of the
1592              * uint64_t table_base at the end, which gets padding to make
1593              * it 64-bit aligned in the 64-bit version.
1594              */
1595             qemu_build_assert(sizeof(*map32) == 36);
1596             qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
1597                               offsetof(struct compat_physdev_map_pirq, entry_nr));
1598             memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
1599         } else {
1600             if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
1601                 err = -EFAULT;
1602                 break;
1603             }
1604         }
1605         err = xen_physdev_map_pirq(&map);
1606         /*
1607          * Since table_base is an IN parameter and won't be changed, just
1608          * copy the size of the compat structure back to the guest.
1609          */
1610         if (!err && kvm_copy_to_gva(cs, arg, &map,
1611                                     sizeof(struct compat_physdev_map_pirq))) {
1612             err = -EFAULT;
1613         }
1614         break;
1615     }
1616     case PHYSDEVOP_unmap_pirq: {
1617         struct physdev_unmap_pirq unmap;
1618 
1619         qemu_build_assert(sizeof(unmap) == 8);
1620         if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
1621             err = -EFAULT;
1622             break;
1623         }
1624 
1625         err = xen_physdev_unmap_pirq(&unmap);
1626         if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
1627             err = -EFAULT;
1628         }
1629         break;
1630     }
1631     case PHYSDEVOP_eoi: {
1632         struct physdev_eoi eoi;
1633 
1634         qemu_build_assert(sizeof(eoi) == 4);
1635         if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
1636             err = -EFAULT;
1637             break;
1638         }
1639 
1640         err = xen_physdev_eoi_pirq(&eoi);
1641         if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
1642             err = -EFAULT;
1643         }
1644         break;
1645     }
1646     case PHYSDEVOP_irq_status_query: {
1647         struct physdev_irq_status_query query;
1648 
1649         qemu_build_assert(sizeof(query) == 8);
1650         if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
1651             err = -EFAULT;
1652             break;
1653         }
1654 
1655         err = xen_physdev_query_pirq(&query);
1656         if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
1657             err = -EFAULT;
1658         }
1659         break;
1660     }
1661     case PHYSDEVOP_get_free_pirq: {
1662         struct physdev_get_free_pirq get;
1663 
1664         qemu_build_assert(sizeof(get) == 8);
1665         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1666             err = -EFAULT;
1667             break;
1668         }
1669 
1670         err = xen_physdev_get_free_pirq(&get);
1671         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1672             err = -EFAULT;
1673         }
1674         break;
1675     }
1676     case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
1677         err = -ENOSYS;
1678         break;
1679 
1680     default:
1681         return false;
1682     }
1683 
1684     exit->u.hcall.result = err;
1685     return true;
1686 }
1687 
1688 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1689 {
1690     uint16_t code = exit->u.hcall.input;
1691 
1692     if (exit->u.hcall.cpl > 0) {
1693         exit->u.hcall.result = -EPERM;
1694         return true;
1695     }
1696 
1697     switch (code) {
1698     case __HYPERVISOR_set_timer_op:
1699         if (exit->u.hcall.longmode) {
1700             return kvm_xen_hcall_set_timer_op(exit, cpu,
1701                                               exit->u.hcall.params[0]);
1702         } else {
1703             /* In 32-bit mode, the 64-bit timer value is in two args. */
1704             uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
1705                 (uint32_t)exit->u.hcall.params[0];
1706             return kvm_xen_hcall_set_timer_op(exit, cpu, val);
1707         }
1708     case __HYPERVISOR_grant_table_op:
1709         return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
1710                                        exit->u.hcall.params[1],
1711                                        exit->u.hcall.params[2]);
1712     case __HYPERVISOR_sched_op:
1713         return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
1714                                       exit->u.hcall.params[1]);
1715     case __HYPERVISOR_event_channel_op:
1716         return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
1717                                        exit->u.hcall.params[1]);
1718     case __HYPERVISOR_vcpu_op:
1719         return kvm_xen_hcall_vcpu_op(exit, cpu,
1720                                      exit->u.hcall.params[0],
1721                                      exit->u.hcall.params[1],
1722                                      exit->u.hcall.params[2]);
1723     case __HYPERVISOR_hvm_op:
1724         return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
1725                                     exit->u.hcall.params[1]);
1726     case __HYPERVISOR_memory_op:
1727         return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
1728                                        exit->u.hcall.params[1]);
1729     case __HYPERVISOR_physdev_op:
1730         return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
1731                                         exit->u.hcall.params[1]);
1732     case __HYPERVISOR_xen_version:
1733         return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
1734                                          exit->u.hcall.params[1]);
1735     default:
1736         return false;
1737     }
1738 }
1739 
1740 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1741 {
1742     if (exit->type != KVM_EXIT_XEN_HCALL) {
1743         return -1;
1744     }
1745 
1746     /*
1747      * The kernel latches the guest 32/64 mode when the MSR is used to fill
1748      * the hypercall page. So if we see a hypercall in a mode that doesn't
1749      * match our own idea of the guest mode, fetch the kernel's idea of the
1750      * "long mode" to remain in sync.
1751      */
1752     if (exit->u.hcall.longmode != xen_is_long_mode()) {
1753         xen_sync_long_mode();
1754     }
1755 
1756     if (!do_kvm_xen_handle_exit(cpu, exit)) {
1757         /*
1758          * Some hypercalls will be deliberately "implemented" by returning
1759          * -ENOSYS. This case is for hypercalls which are unexpected.
1760          */
1761         exit->u.hcall.result = -ENOSYS;
1762         qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
1763                       PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
1764                       (uint64_t)exit->u.hcall.input,
1765                       (uint64_t)exit->u.hcall.params[0],
1766                       (uint64_t)exit->u.hcall.params[1],
1767                       (uint64_t)exit->u.hcall.params[2]);
1768     }
1769 
1770     trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
1771                             exit->u.hcall.input, exit->u.hcall.params[0],
1772                             exit->u.hcall.params[1], exit->u.hcall.params[2],
1773                             exit->u.hcall.result);
1774     return 0;
1775 }
1776 
1777 uint16_t kvm_xen_get_gnttab_max_frames(void)
1778 {
1779     KVMState *s = KVM_STATE(current_accel());
1780     return s->xen_gnttab_max_frames;
1781 }
1782 
1783 uint16_t kvm_xen_get_evtchn_max_pirq(void)
1784 {
1785     KVMState *s = KVM_STATE(current_accel());
1786     return s->xen_evtchn_max_pirq;
1787 }
1788 
1789 int kvm_put_xen_state(CPUState *cs)
1790 {
1791     X86CPU *cpu = X86_CPU(cs);
1792     CPUX86State *env = &cpu->env;
1793     uint64_t gpa;
1794     int ret;
1795 
1796     gpa = env->xen_vcpu_info_gpa;
1797     if (gpa == INVALID_GPA) {
1798         gpa = env->xen_vcpu_info_default_gpa;
1799     }
1800 
1801     if (gpa != INVALID_GPA) {
1802         ret = set_vcpu_info(cs, gpa);
1803         if (ret < 0) {
1804             return ret;
1805         }
1806     }
1807 
1808     gpa = env->xen_vcpu_time_info_gpa;
1809     if (gpa != INVALID_GPA) {
1810         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
1811                                     gpa);
1812         if (ret < 0) {
1813             return ret;
1814         }
1815     }
1816 
1817     gpa = env->xen_vcpu_runstate_gpa;
1818     if (gpa != INVALID_GPA) {
1819         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1820                                     gpa);
1821         if (ret < 0) {
1822             return ret;
1823         }
1824     }
1825 
1826     if (env->xen_periodic_timer_period) {
1827         ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
1828         if (ret < 0) {
1829             return ret;
1830         }
1831     }
1832 
1833     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1834         /*
1835          * If the kernel has EVTCHN_SEND support then it handles timers too,
1836          * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
1837          */
1838         QEMU_LOCK_GUARD(&env->xen_timers_lock);
1839         if (env->xen_singleshot_timer_ns) {
1840             ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
1841                                     false, false);
1842             if (ret < 0) {
1843                 return ret;
1844             }
1845         }
1846         return 0;
1847     }
1848 
1849     if (env->xen_vcpu_callback_vector) {
1850         ret = kvm_xen_set_vcpu_callback_vector(cs);
1851         if (ret < 0) {
1852             return ret;
1853         }
1854     }
1855 
1856     if (env->xen_virq[VIRQ_TIMER]) {
1857         do_set_vcpu_timer_virq(cs,
1858                                RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
1859     }
1860     return 0;
1861 }
1862 
1863 int kvm_get_xen_state(CPUState *cs)
1864 {
1865     X86CPU *cpu = X86_CPU(cs);
1866     CPUX86State *env = &cpu->env;
1867     uint64_t gpa;
1868     int ret;
1869 
1870     /*
1871      * The kernel does not mark vcpu_info as dirty when it delivers interrupts
1872      * to it. It's up to userspace to *assume* that any page shared thus is
1873      * always considered dirty. The shared_info page is different since it's
1874      * an overlay and migrated separately anyway.
1875      */
1876     gpa = env->xen_vcpu_info_gpa;
1877     if (gpa == INVALID_GPA) {
1878         gpa = env->xen_vcpu_info_default_gpa;
1879     }
1880     if (gpa != INVALID_GPA) {
1881         MemoryRegionSection mrs = memory_region_find(get_system_memory(),
1882                                                      gpa,
1883                                                      sizeof(struct vcpu_info));
1884         if (mrs.mr &&
1885             !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
1886             memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
1887                                     sizeof(struct vcpu_info));
1888         }
1889     }
1890 
1891     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1892         return 0;
1893     }
1894 
1895     /*
1896      * If the kernel is accelerating timers, read out the current value of the
1897      * singleshot timer deadline.
1898      */
1899     if (env->xen_virq[VIRQ_TIMER]) {
1900         struct kvm_xen_vcpu_attr va = {
1901             .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
1902         };
1903         ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
1904         if (ret < 0) {
1905             return ret;
1906         }
1907 
1908         /*
1909          * This locking is fairly pointless, and is here to appease Coverity.
1910          * There is an unavoidable race condition if a different vCPU sets a
1911          * timer for this vCPU after the value has been read out. But that's
1912          * OK in practice because *all* the vCPUs need to be stopped before
1913          * we set about migrating their state.
1914          */
1915         QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1916         env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
1917     }
1918 
1919     return 0;
1920 }
1921