xref: /openbmc/qemu/target/i386/kvm/xen-emu.c (revision f7230e09b1ccfb7055b79dfee981e18d444a118a)
1 /*
2  * Xen HVM emulation support in KVM
3  *
4  * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
5  * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
6  *
7  * This work is licensed under the terms of the GNU GPL, version 2 or later.
8  * See the COPYING file in the top-level directory.
9  *
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/log.h"
14 #include "qemu/main-loop.h"
15 #include "qemu/error-report.h"
16 #include "hw/xen/xen.h"
17 #include "sysemu/kvm_int.h"
18 #include "sysemu/kvm_xen.h"
19 #include "kvm/kvm_i386.h"
20 #include "exec/address-spaces.h"
21 #include "xen-emu.h"
22 #include "trace.h"
23 #include "sysemu/runstate.h"
24 
25 #include "hw/pci/msi.h"
26 #include "hw/i386/apic-msidef.h"
27 #include "hw/i386/e820_memory_layout.h"
28 #include "hw/i386/kvm/xen_overlay.h"
29 #include "hw/i386/kvm/xen_evtchn.h"
30 #include "hw/i386/kvm/xen_gnttab.h"
31 #include "hw/i386/kvm/xen_primary_console.h"
32 #include "hw/i386/kvm/xen_xenstore.h"
33 
34 #include "hw/xen/interface/version.h"
35 #include "hw/xen/interface/sched.h"
36 #include "hw/xen/interface/memory.h"
37 #include "hw/xen/interface/hvm/hvm_op.h"
38 #include "hw/xen/interface/hvm/params.h"
39 #include "hw/xen/interface/vcpu.h"
40 #include "hw/xen/interface/event_channel.h"
41 #include "hw/xen/interface/grant_table.h"
42 
43 #include "xen-compat.h"
44 
45 static void xen_vcpu_singleshot_timer_event(void *opaque);
46 static void xen_vcpu_periodic_timer_event(void *opaque);
47 static int vcpuop_stop_singleshot_timer(CPUState *cs);
48 
/*
 * Evaluates to true when a hypercall has to be decoded using the 32-bit
 * compat structure layouts, i.e. when @longmode is zero. When
 * TARGET_X86_64 is not defined it is unconditionally false.
 */
#if defined(TARGET_X86_64)
#define hypercall_compat32(longmode) (!(longmode))
#else
#define hypercall_compat32(longmode) (false)
#endif
54 
55 static bool kvm_gva_to_gpa(CPUState *cs, uint64_t gva, uint64_t *gpa,
56                            size_t *len, bool is_write)
57 {
58         struct kvm_translation tr = {
59             .linear_address = gva,
60         };
61 
62         if (len) {
63             *len = TARGET_PAGE_SIZE - (gva & ~TARGET_PAGE_MASK);
64         }
65 
66         if (kvm_vcpu_ioctl(cs, KVM_TRANSLATE, &tr) || !tr.valid ||
67             (is_write && !tr.writeable)) {
68             return false;
69         }
70         *gpa = tr.physical_address;
71         return true;
72 }
73 
74 static int kvm_gva_rw(CPUState *cs, uint64_t gva, void *_buf, size_t sz,
75                       bool is_write)
76 {
77     uint8_t *buf = (uint8_t *)_buf;
78     uint64_t gpa;
79     size_t len;
80 
81     while (sz) {
82         if (!kvm_gva_to_gpa(cs, gva, &gpa, &len, is_write)) {
83             return -EFAULT;
84         }
85         if (len > sz) {
86             len = sz;
87         }
88 
89         cpu_physical_memory_rw(gpa, buf, len, is_write);
90 
91         buf += len;
92         sz -= len;
93         gva += len;
94     }
95 
96     return 0;
97 }
98 
99 static inline int kvm_copy_from_gva(CPUState *cs, uint64_t gva, void *buf,
100                                     size_t sz)
101 {
102     return kvm_gva_rw(cs, gva, buf, sz, false);
103 }
104 
105 static inline int kvm_copy_to_gva(CPUState *cs, uint64_t gva, void *buf,
106                                   size_t sz)
107 {
108     return kvm_gva_rw(cs, gva, buf, sz, true);
109 }
110 
111 int kvm_xen_init(KVMState *s, uint32_t hypercall_msr)
112 {
113     const int required_caps = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
114         KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | KVM_XEN_HVM_CONFIG_SHARED_INFO;
115     struct kvm_xen_hvm_config cfg = {
116         .msr = hypercall_msr,
117         .flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
118     };
119     int xen_caps, ret;
120 
121     xen_caps = kvm_check_extension(s, KVM_CAP_XEN_HVM);
122     if (required_caps & ~xen_caps) {
123         error_report("kvm: Xen HVM guest support not present or insufficient");
124         return -ENOSYS;
125     }
126 
127     if (xen_caps & KVM_XEN_HVM_CONFIG_EVTCHN_SEND) {
128         struct kvm_xen_hvm_attr ha = {
129             .type = KVM_XEN_ATTR_TYPE_XEN_VERSION,
130             .u.xen_version = s->xen_version,
131         };
132         (void)kvm_vm_ioctl(s, KVM_XEN_HVM_SET_ATTR, &ha);
133 
134         cfg.flags |= KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
135     }
136 
137     ret = kvm_vm_ioctl(s, KVM_XEN_HVM_CONFIG, &cfg);
138     if (ret < 0) {
139         error_report("kvm: Failed to enable Xen HVM support: %s",
140                      strerror(-ret));
141         return ret;
142     }
143 
144     /* If called a second time, don't repeat the rest of the setup. */
145     if (s->xen_caps) {
146         return 0;
147     }
148 
149     /*
150      * Event channel delivery via GSI/PCI_INTX needs to poll the vcpu_info
151      * of vCPU0 to deassert the IRQ when ->evtchn_upcall_pending is cleared.
152      *
153      * In the kernel, there's a notifier hook on the PIC/IOAPIC which allows
154      * such things to be polled at precisely the right time. We *could* do
155      * it nicely in the kernel: check vcpu_info[0]->evtchn_upcall_pending at
156      * the moment the IRQ is acked, and see if it should be reasserted.
157      *
158      * But the in-kernel irqchip is deprecated, so we're unlikely to add
159      * that support in the kernel. Insist on using the split irqchip mode
160      * instead.
161      *
162      * This leaves us polling for the level going low in QEMU, which lacks
163      * the appropriate hooks in its PIC/IOAPIC code. Even VFIO is sending a
164      * spurious 'ack' to an INTX IRQ every time there's any MMIO access to
165      * the device (for which it has to unmap the device and trap access, for
166      * some period after an IRQ!!). In the Xen case, we do it on exit from
167      * KVM_RUN, if the flag is set to say that the GSI is currently asserted.
168      * Which is kind of icky, but less so than the VFIO one. I may fix them
169      * both later...
170      */
171     if (!kvm_kernel_irqchip_split()) {
172         error_report("kvm: Xen support requires kernel-irqchip=split");
173         return -EINVAL;
174     }
175 
176     s->xen_caps = xen_caps;
177 
178     /* Tell fw_cfg to notify the BIOS to reserve the range. */
179     e820_add_entry(XEN_SPECIAL_AREA_ADDR, XEN_SPECIAL_AREA_SIZE, E820_RESERVED);
180 
181     /* The pages couldn't be overlaid until KVM was initialized */
182     xen_primary_console_reset();
183     xen_xenstore_reset();
184 
185     return 0;
186 }
187 
188 int kvm_xen_init_vcpu(CPUState *cs)
189 {
190     X86CPU *cpu = X86_CPU(cs);
191     CPUX86State *env = &cpu->env;
192     int err;
193 
194     /*
195      * The kernel needs to know the Xen/ACPI vCPU ID because that's
196      * what the guest uses in hypercalls such as timers. It doesn't
197      * match the APIC ID which is generally used for talking to the
198      * kernel about vCPUs. And if vCPU threads race with creating
199      * their KVM vCPUs out of order, it doesn't necessarily match
200      * with the kernel's internal vCPU indices either.
201      */
202     if (kvm_xen_has_cap(EVTCHN_SEND)) {
203         struct kvm_xen_vcpu_attr va = {
204             .type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID,
205             .u.vcpu_id = cs->cpu_index,
206         };
207         err = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
208         if (err) {
209             error_report("kvm: Failed to set Xen vCPU ID attribute: %s",
210                          strerror(-err));
211             return err;
212         }
213     }
214 
215     env->xen_vcpu_info_gpa = INVALID_GPA;
216     env->xen_vcpu_info_default_gpa = INVALID_GPA;
217     env->xen_vcpu_time_info_gpa = INVALID_GPA;
218     env->xen_vcpu_runstate_gpa = INVALID_GPA;
219 
220     qemu_mutex_init(&env->xen_timers_lock);
221     env->xen_singleshot_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
222                                              xen_vcpu_singleshot_timer_event,
223                                              cpu);
224     if (!env->xen_singleshot_timer) {
225         return -ENOMEM;
226     }
227     env->xen_singleshot_timer->opaque = cs;
228 
229     env->xen_periodic_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
230                                            xen_vcpu_periodic_timer_event,
231                                            cpu);
232     if (!env->xen_periodic_timer) {
233         return -ENOMEM;
234     }
235     env->xen_periodic_timer->opaque = cs;
236 
237     return 0;
238 }
239 
240 uint32_t kvm_xen_get_caps(void)
241 {
242     return kvm_state->xen_caps;
243 }
244 
245 static bool kvm_xen_hcall_xen_version(struct kvm_xen_exit *exit, X86CPU *cpu,
246                                      int cmd, uint64_t arg)
247 {
248     int err = 0;
249 
250     switch (cmd) {
251     case XENVER_get_features: {
252         struct xen_feature_info fi;
253 
254         /* No need for 32/64 compat handling */
255         qemu_build_assert(sizeof(fi) == 8);
256 
257         err = kvm_copy_from_gva(CPU(cpu), arg, &fi, sizeof(fi));
258         if (err) {
259             break;
260         }
261 
262         fi.submap = 0;
263         if (fi.submap_idx == 0) {
264             fi.submap |= 1 << XENFEAT_writable_page_tables |
265                          1 << XENFEAT_writable_descriptor_tables |
266                          1 << XENFEAT_auto_translated_physmap |
267                          1 << XENFEAT_hvm_callback_vector |
268                          1 << XENFEAT_hvm_safe_pvclock |
269                          1 << XENFEAT_hvm_pirqs;
270         }
271 
272         err = kvm_copy_to_gva(CPU(cpu), arg, &fi, sizeof(fi));
273         break;
274     }
275 
276     default:
277         return false;
278     }
279 
280     exit->u.hcall.result = err;
281     return true;
282 }
283 
284 static int kvm_xen_set_vcpu_attr(CPUState *cs, uint16_t type, uint64_t gpa)
285 {
286     struct kvm_xen_vcpu_attr xhsi;
287 
288     xhsi.type = type;
289     xhsi.u.gpa = gpa;
290 
291     trace_kvm_xen_set_vcpu_attr(cs->cpu_index, type, gpa);
292 
293     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xhsi);
294 }
295 
296 static int kvm_xen_set_vcpu_callback_vector(CPUState *cs)
297 {
298     uint8_t vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
299     struct kvm_xen_vcpu_attr xva;
300 
301     xva.type = KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR;
302     xva.u.vector = vector;
303 
304     trace_kvm_xen_set_vcpu_callback(cs->cpu_index, vector);
305 
306     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &xva);
307 }
308 
309 static void do_set_vcpu_callback_vector(CPUState *cs, run_on_cpu_data data)
310 {
311     X86CPU *cpu = X86_CPU(cs);
312     CPUX86State *env = &cpu->env;
313 
314     env->xen_vcpu_callback_vector = data.host_int;
315 
316     if (kvm_xen_has_cap(EVTCHN_SEND)) {
317         kvm_xen_set_vcpu_callback_vector(cs);
318     }
319 }
320 
321 static int set_vcpu_info(CPUState *cs, uint64_t gpa)
322 {
323     X86CPU *cpu = X86_CPU(cs);
324     CPUX86State *env = &cpu->env;
325     MemoryRegionSection mrs = { .mr = NULL };
326     void *vcpu_info_hva = NULL;
327     int ret;
328 
329     ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO, gpa);
330     if (ret || gpa == INVALID_GPA) {
331         goto out;
332     }
333 
334     mrs = memory_region_find(get_system_memory(), gpa,
335                              sizeof(struct vcpu_info));
336     if (mrs.mr && mrs.mr->ram_block &&
337         !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
338         vcpu_info_hva = qemu_map_ram_ptr(mrs.mr->ram_block,
339                                          mrs.offset_within_region);
340     }
341     if (!vcpu_info_hva) {
342         if (mrs.mr) {
343             memory_region_unref(mrs.mr);
344             mrs.mr = NULL;
345         }
346         ret = -EINVAL;
347     }
348 
349  out:
350     if (env->xen_vcpu_info_mr) {
351         memory_region_unref(env->xen_vcpu_info_mr);
352     }
353     env->xen_vcpu_info_hva = vcpu_info_hva;
354     env->xen_vcpu_info_mr = mrs.mr;
355     return ret;
356 }
357 
358 static void do_set_vcpu_info_default_gpa(CPUState *cs, run_on_cpu_data data)
359 {
360     X86CPU *cpu = X86_CPU(cs);
361     CPUX86State *env = &cpu->env;
362 
363     env->xen_vcpu_info_default_gpa = data.host_ulong;
364 
365     /* Changing the default does nothing if a vcpu_info was explicitly set. */
366     if (env->xen_vcpu_info_gpa == INVALID_GPA) {
367         set_vcpu_info(cs, env->xen_vcpu_info_default_gpa);
368     }
369 }
370 
371 static void do_set_vcpu_info_gpa(CPUState *cs, run_on_cpu_data data)
372 {
373     X86CPU *cpu = X86_CPU(cs);
374     CPUX86State *env = &cpu->env;
375 
376     env->xen_vcpu_info_gpa = data.host_ulong;
377 
378     set_vcpu_info(cs, env->xen_vcpu_info_gpa);
379 }
380 
381 void *kvm_xen_get_vcpu_info_hva(uint32_t vcpu_id)
382 {
383     CPUState *cs = qemu_get_cpu(vcpu_id);
384     if (!cs) {
385         return NULL;
386     }
387 
388     return X86_CPU(cs)->env.xen_vcpu_info_hva;
389 }
390 
391 void kvm_xen_maybe_deassert_callback(CPUState *cs)
392 {
393     CPUX86State *env = &X86_CPU(cs)->env;
394     struct vcpu_info *vi = env->xen_vcpu_info_hva;
395     if (!vi) {
396         return;
397     }
398 
399     /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
400     if (!vi->evtchn_upcall_pending) {
401         bql_lock();
402         /*
403          * Check again now we have the lock, because it may have been
404          * asserted in the interim. And we don't want to take the lock
405          * every time because this is a fast path.
406          */
407         if (!vi->evtchn_upcall_pending) {
408             X86_CPU(cs)->env.xen_callback_asserted = false;
409             xen_evtchn_set_callback_level(0);
410         }
411         bql_unlock();
412     }
413 }
414 
415 void kvm_xen_set_callback_asserted(void)
416 {
417     CPUState *cs = qemu_get_cpu(0);
418 
419     if (cs) {
420         X86_CPU(cs)->env.xen_callback_asserted = true;
421     }
422 }
423 
424 bool kvm_xen_has_vcpu_callback_vector(void)
425 {
426     CPUState *cs = qemu_get_cpu(0);
427 
428     return cs && !!X86_CPU(cs)->env.xen_vcpu_callback_vector;
429 }
430 
431 void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, int type)
432 {
433     CPUState *cs = qemu_get_cpu(vcpu_id);
434     uint8_t vector;
435 
436     if (!cs) {
437         return;
438     }
439 
440     vector = X86_CPU(cs)->env.xen_vcpu_callback_vector;
441     if (vector) {
442         /*
443          * The per-vCPU callback vector injected via lapic. Just
444          * deliver it as an MSI.
445          */
446         MSIMessage msg = {
447             .address = APIC_DEFAULT_ADDRESS |
448                        (X86_CPU(cs)->apic_id << MSI_ADDR_DEST_ID_SHIFT),
449             .data = vector | (1UL << MSI_DATA_LEVEL_SHIFT),
450         };
451         kvm_irqchip_send_msi(kvm_state, msg);
452         return;
453     }
454 
455     switch (type) {
456     case HVM_PARAM_CALLBACK_TYPE_VECTOR:
457         /*
458          * If the evtchn_upcall_pending field in the vcpu_info is set, then
459          * KVM will automatically deliver the vector on entering the vCPU
460          * so all we have to do is kick it out.
461          */
462         qemu_cpu_kick(cs);
463         break;
464 
465     case HVM_PARAM_CALLBACK_TYPE_GSI:
466     case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
467         if (vcpu_id == 0) {
468             xen_evtchn_set_callback_level(1);
469         }
470         break;
471     }
472 }
473 
474 /* Must always be called with xen_timers_lock held */
475 static int kvm_xen_set_vcpu_timer(CPUState *cs)
476 {
477     X86CPU *cpu = X86_CPU(cs);
478     CPUX86State *env = &cpu->env;
479 
480     struct kvm_xen_vcpu_attr va = {
481         .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
482         .u.timer.port = env->xen_virq[VIRQ_TIMER],
483         .u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
484         .u.timer.expires_ns = env->xen_singleshot_timer_ns,
485     };
486 
487     return kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_SET_ATTR, &va);
488 }
489 
490 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
491 {
492     QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
493     kvm_xen_set_vcpu_timer(cs);
494 }
495 
496 int kvm_xen_set_vcpu_virq(uint32_t vcpu_id, uint16_t virq, uint16_t port)
497 {
498     CPUState *cs = qemu_get_cpu(vcpu_id);
499 
500     if (!cs) {
501         return -ENOENT;
502     }
503 
504     /* cpu.h doesn't include the actual Xen header. */
505     qemu_build_assert(NR_VIRQS == XEN_NR_VIRQS);
506 
507     if (virq >= NR_VIRQS) {
508         return -EINVAL;
509     }
510 
511     if (port && X86_CPU(cs)->env.xen_virq[virq]) {
512         return -EEXIST;
513     }
514 
515     X86_CPU(cs)->env.xen_virq[virq] = port;
516     if (virq == VIRQ_TIMER && kvm_xen_has_cap(EVTCHN_SEND)) {
517         async_run_on_cpu(cs, do_set_vcpu_timer_virq,
518                          RUN_ON_CPU_HOST_INT(port));
519     }
520     return 0;
521 }
522 
523 static void do_set_vcpu_time_info_gpa(CPUState *cs, run_on_cpu_data data)
524 {
525     X86CPU *cpu = X86_CPU(cs);
526     CPUX86State *env = &cpu->env;
527 
528     env->xen_vcpu_time_info_gpa = data.host_ulong;
529 
530     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
531                           env->xen_vcpu_time_info_gpa);
532 }
533 
534 static void do_set_vcpu_runstate_gpa(CPUState *cs, run_on_cpu_data data)
535 {
536     X86CPU *cpu = X86_CPU(cs);
537     CPUX86State *env = &cpu->env;
538 
539     env->xen_vcpu_runstate_gpa = data.host_ulong;
540 
541     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
542                           env->xen_vcpu_runstate_gpa);
543 }
544 
545 static void do_vcpu_soft_reset(CPUState *cs, run_on_cpu_data data)
546 {
547     X86CPU *cpu = X86_CPU(cs);
548     CPUX86State *env = &cpu->env;
549 
550     env->xen_vcpu_info_gpa = INVALID_GPA;
551     env->xen_vcpu_info_default_gpa = INVALID_GPA;
552     env->xen_vcpu_time_info_gpa = INVALID_GPA;
553     env->xen_vcpu_runstate_gpa = INVALID_GPA;
554     env->xen_vcpu_callback_vector = 0;
555     memset(env->xen_virq, 0, sizeof(env->xen_virq));
556 
557     set_vcpu_info(cs, INVALID_GPA);
558     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
559                           INVALID_GPA);
560     kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
561                           INVALID_GPA);
562     if (kvm_xen_has_cap(EVTCHN_SEND)) {
563         kvm_xen_set_vcpu_callback_vector(cs);
564 
565         QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
566         env->xen_singleshot_timer_ns = 0;
567         kvm_xen_set_vcpu_timer(cs);
568     } else {
569         vcpuop_stop_singleshot_timer(cs);
570     };
571 
572 }
573 
574 static int xen_set_shared_info(uint64_t gfn)
575 {
576     uint64_t gpa = gfn << TARGET_PAGE_BITS;
577     int i, err;
578 
579     BQL_LOCK_GUARD();
580 
581     /*
582      * The xen_overlay device tells KVM about it too, since it had to
583      * do that on migration load anyway (unless we're going to jump
584      * through lots of hoops to maintain the fiction that this isn't
585      * KVM-specific.
586      */
587     err = xen_overlay_map_shinfo_page(gpa);
588     if (err) {
589             return err;
590     }
591 
592     trace_kvm_xen_set_shared_info(gfn);
593 
594     for (i = 0; i < XEN_LEGACY_MAX_VCPUS; i++) {
595         CPUState *cpu = qemu_get_cpu(i);
596         if (cpu) {
597             async_run_on_cpu(cpu, do_set_vcpu_info_default_gpa,
598                              RUN_ON_CPU_HOST_ULONG(gpa));
599         }
600         gpa += sizeof(vcpu_info_t);
601     }
602 
603     return err;
604 }
605 
606 static int add_to_physmap_one(uint32_t space, uint64_t idx, uint64_t gfn)
607 {
608     switch (space) {
609     case XENMAPSPACE_shared_info:
610         if (idx > 0) {
611             return -EINVAL;
612         }
613         return xen_set_shared_info(gfn);
614 
615     case XENMAPSPACE_grant_table:
616         return xen_gnttab_map_page(idx, gfn);
617 
618     case XENMAPSPACE_gmfn:
619     case XENMAPSPACE_gmfn_range:
620         return -ENOTSUP;
621 
622     case XENMAPSPACE_gmfn_foreign:
623     case XENMAPSPACE_dev_mmio:
624         return -EPERM;
625 
626     default:
627         return -EINVAL;
628     }
629 }
630 
631 static int do_add_to_physmap(struct kvm_xen_exit *exit, X86CPU *cpu,
632                              uint64_t arg)
633 {
634     struct xen_add_to_physmap xatp;
635     CPUState *cs = CPU(cpu);
636 
637     if (hypercall_compat32(exit->u.hcall.longmode)) {
638         struct compat_xen_add_to_physmap xatp32;
639 
640         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap) == 16);
641         if (kvm_copy_from_gva(cs, arg, &xatp32, sizeof(xatp32))) {
642             return -EFAULT;
643         }
644         xatp.domid = xatp32.domid;
645         xatp.size = xatp32.size;
646         xatp.space = xatp32.space;
647         xatp.idx = xatp32.idx;
648         xatp.gpfn = xatp32.gpfn;
649     } else {
650         if (kvm_copy_from_gva(cs, arg, &xatp, sizeof(xatp))) {
651             return -EFAULT;
652         }
653     }
654 
655     if (xatp.domid != DOMID_SELF && xatp.domid != xen_domid) {
656         return -ESRCH;
657     }
658 
659     return add_to_physmap_one(xatp.space, xatp.idx, xatp.gpfn);
660 }
661 
662 static int do_add_to_physmap_batch(struct kvm_xen_exit *exit, X86CPU *cpu,
663                                    uint64_t arg)
664 {
665     struct xen_add_to_physmap_batch xatpb;
666     unsigned long idxs_gva, gpfns_gva, errs_gva;
667     CPUState *cs = CPU(cpu);
668     size_t op_sz;
669 
670     if (hypercall_compat32(exit->u.hcall.longmode)) {
671         struct compat_xen_add_to_physmap_batch xatpb32;
672 
673         qemu_build_assert(sizeof(struct compat_xen_add_to_physmap_batch) == 20);
674         if (kvm_copy_from_gva(cs, arg, &xatpb32, sizeof(xatpb32))) {
675             return -EFAULT;
676         }
677         xatpb.domid = xatpb32.domid;
678         xatpb.space = xatpb32.space;
679         xatpb.size = xatpb32.size;
680 
681         idxs_gva = xatpb32.idxs.c;
682         gpfns_gva = xatpb32.gpfns.c;
683         errs_gva = xatpb32.errs.c;
684         op_sz = sizeof(uint32_t);
685     } else {
686         if (kvm_copy_from_gva(cs, arg, &xatpb, sizeof(xatpb))) {
687             return -EFAULT;
688         }
689         op_sz = sizeof(unsigned long);
690         idxs_gva = (unsigned long)xatpb.idxs.p;
691         gpfns_gva = (unsigned long)xatpb.gpfns.p;
692         errs_gva = (unsigned long)xatpb.errs.p;
693     }
694 
695     if (xatpb.domid != DOMID_SELF && xatpb.domid != xen_domid) {
696         return -ESRCH;
697     }
698 
699     /* Explicitly invalid for the batch op. Not that we implement it anyway. */
700     if (xatpb.space == XENMAPSPACE_gmfn_range) {
701         return -EINVAL;
702     }
703 
704     while (xatpb.size--) {
705         unsigned long idx = 0;
706         unsigned long gpfn = 0;
707         int err;
708 
709         /* For 32-bit compat this only copies the low 32 bits of each */
710         if (kvm_copy_from_gva(cs, idxs_gva, &idx, op_sz) ||
711             kvm_copy_from_gva(cs, gpfns_gva, &gpfn, op_sz)) {
712             return -EFAULT;
713         }
714         idxs_gva += op_sz;
715         gpfns_gva += op_sz;
716 
717         err = add_to_physmap_one(xatpb.space, idx, gpfn);
718 
719         if (kvm_copy_to_gva(cs, errs_gva, &err, sizeof(err))) {
720             return -EFAULT;
721         }
722         errs_gva += sizeof(err);
723     }
724     return 0;
725 }
726 
727 static bool kvm_xen_hcall_memory_op(struct kvm_xen_exit *exit, X86CPU *cpu,
728                                    int cmd, uint64_t arg)
729 {
730     int err;
731 
732     switch (cmd) {
733     case XENMEM_add_to_physmap:
734         err = do_add_to_physmap(exit, cpu, arg);
735         break;
736 
737     case XENMEM_add_to_physmap_batch:
738         err = do_add_to_physmap_batch(exit, cpu, arg);
739         break;
740 
741     default:
742         return false;
743     }
744 
745     exit->u.hcall.result = err;
746     return true;
747 }
748 
749 static bool handle_set_param(struct kvm_xen_exit *exit, X86CPU *cpu,
750                              uint64_t arg)
751 {
752     CPUState *cs = CPU(cpu);
753     struct xen_hvm_param hp;
754     int err = 0;
755 
756     /* No need for 32/64 compat handling */
757     qemu_build_assert(sizeof(hp) == 16);
758 
759     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
760         err = -EFAULT;
761         goto out;
762     }
763 
764     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
765         err = -ESRCH;
766         goto out;
767     }
768 
769     switch (hp.index) {
770     case HVM_PARAM_CALLBACK_IRQ:
771         bql_lock();
772         err = xen_evtchn_set_callback_param(hp.value);
773         bql_unlock();
774         xen_set_long_mode(exit->u.hcall.longmode);
775         break;
776     default:
777         return false;
778     }
779 
780 out:
781     exit->u.hcall.result = err;
782     return true;
783 }
784 
785 static bool handle_get_param(struct kvm_xen_exit *exit, X86CPU *cpu,
786                              uint64_t arg)
787 {
788     CPUState *cs = CPU(cpu);
789     struct xen_hvm_param hp;
790     int err = 0;
791 
792     /* No need for 32/64 compat handling */
793     qemu_build_assert(sizeof(hp) == 16);
794 
795     if (kvm_copy_from_gva(cs, arg, &hp, sizeof(hp))) {
796         err = -EFAULT;
797         goto out;
798     }
799 
800     if (hp.domid != DOMID_SELF && hp.domid != xen_domid) {
801         err = -ESRCH;
802         goto out;
803     }
804 
805     switch (hp.index) {
806     case HVM_PARAM_STORE_PFN:
807         hp.value = XEN_SPECIAL_PFN(XENSTORE);
808         break;
809     case HVM_PARAM_STORE_EVTCHN:
810         hp.value = xen_xenstore_get_port();
811         break;
812     case HVM_PARAM_CONSOLE_PFN:
813         hp.value = xen_primary_console_get_pfn();
814         if (!hp.value) {
815             err = -EINVAL;
816         }
817         break;
818     case HVM_PARAM_CONSOLE_EVTCHN:
819         hp.value = xen_primary_console_get_port();
820         if (!hp.value) {
821             err = -EINVAL;
822         }
823         break;
824     default:
825         return false;
826     }
827 
828     if (!err && kvm_copy_to_gva(cs, arg, &hp, sizeof(hp))) {
829         err = -EFAULT;
830     }
831 out:
832     exit->u.hcall.result = err;
833     return true;
834 }
835 
836 static int kvm_xen_hcall_evtchn_upcall_vector(struct kvm_xen_exit *exit,
837                                               X86CPU *cpu, uint64_t arg)
838 {
839     struct xen_hvm_evtchn_upcall_vector up;
840     CPUState *target_cs;
841 
842     /* No need for 32/64 compat handling */
843     qemu_build_assert(sizeof(up) == 8);
844 
845     if (kvm_copy_from_gva(CPU(cpu), arg, &up, sizeof(up))) {
846         return -EFAULT;
847     }
848 
849     if (up.vector < 0x10) {
850         return -EINVAL;
851     }
852 
853     target_cs = qemu_get_cpu(up.vcpu);
854     if (!target_cs) {
855         return -EINVAL;
856     }
857 
858     async_run_on_cpu(target_cs, do_set_vcpu_callback_vector,
859                      RUN_ON_CPU_HOST_INT(up.vector));
860     return 0;
861 }
862 
863 static bool kvm_xen_hcall_hvm_op(struct kvm_xen_exit *exit, X86CPU *cpu,
864                                  int cmd, uint64_t arg)
865 {
866     int ret = -ENOSYS;
867     switch (cmd) {
868     case HVMOP_set_evtchn_upcall_vector:
869         ret = kvm_xen_hcall_evtchn_upcall_vector(exit, cpu, arg);
870         break;
871 
872     case HVMOP_pagetable_dying:
873         ret = -ENOSYS;
874         break;
875 
876     case HVMOP_set_param:
877         return handle_set_param(exit, cpu, arg);
878 
879     case HVMOP_get_param:
880         return handle_get_param(exit, cpu, arg);
881 
882     default:
883         return false;
884     }
885 
886     exit->u.hcall.result = ret;
887     return true;
888 }
889 
890 static int vcpuop_register_vcpu_info(CPUState *cs, CPUState *target,
891                                      uint64_t arg)
892 {
893     struct vcpu_register_vcpu_info rvi;
894     uint64_t gpa;
895 
896     /* No need for 32/64 compat handling */
897     qemu_build_assert(sizeof(rvi) == 16);
898     qemu_build_assert(sizeof(struct vcpu_info) == 64);
899 
900     if (!target) {
901         return -ENOENT;
902     }
903 
904     if (kvm_copy_from_gva(cs, arg, &rvi, sizeof(rvi))) {
905         return -EFAULT;
906     }
907 
908     if (rvi.offset > TARGET_PAGE_SIZE - sizeof(struct vcpu_info)) {
909         return -EINVAL;
910     }
911 
912     gpa = ((rvi.mfn << TARGET_PAGE_BITS) + rvi.offset);
913     async_run_on_cpu(target, do_set_vcpu_info_gpa, RUN_ON_CPU_HOST_ULONG(gpa));
914     return 0;
915 }
916 
917 static int vcpuop_register_vcpu_time_info(CPUState *cs, CPUState *target,
918                                           uint64_t arg)
919 {
920     struct vcpu_register_time_memory_area tma;
921     uint64_t gpa;
922     size_t len;
923 
924     /* No need for 32/64 compat handling */
925     qemu_build_assert(sizeof(tma) == 8);
926     qemu_build_assert(sizeof(struct vcpu_time_info) == 32);
927 
928     if (!target) {
929         return -ENOENT;
930     }
931 
932     if (kvm_copy_from_gva(cs, arg, &tma, sizeof(tma))) {
933         return -EFAULT;
934     }
935 
936     /*
937      * Xen actually uses the GVA and does the translation through the guest
938      * page tables each time. But Linux/KVM uses the GPA, on the assumption
939      * that guests only ever use *global* addresses (kernel virtual addresses)
940      * for it. If Linux is changed to redo the GVA→GPA translation each time,
941      * it will offer a new vCPU attribute for that, and we'll use it instead.
942      */
943     if (!kvm_gva_to_gpa(cs, tma.addr.p, &gpa, &len, false) ||
944         len < sizeof(struct vcpu_time_info)) {
945         return -EFAULT;
946     }
947 
948     async_run_on_cpu(target, do_set_vcpu_time_info_gpa,
949                      RUN_ON_CPU_HOST_ULONG(gpa));
950     return 0;
951 }
952 
953 static int vcpuop_register_runstate_info(CPUState *cs, CPUState *target,
954                                          uint64_t arg)
955 {
956     struct vcpu_register_runstate_memory_area rma;
957     uint64_t gpa;
958     size_t len;
959 
960     /* No need for 32/64 compat handling */
961     qemu_build_assert(sizeof(rma) == 8);
962     /* The runstate area actually does change size, but Linux copes. */
963 
964     if (!target) {
965         return -ENOENT;
966     }
967 
968     if (kvm_copy_from_gva(cs, arg, &rma, sizeof(rma))) {
969         return -EFAULT;
970     }
971 
972     /* As with vcpu_time_info, Xen actually uses the GVA but KVM doesn't. */
973     if (!kvm_gva_to_gpa(cs, rma.addr.p, &gpa, &len, false)) {
974         return -EFAULT;
975     }
976 
977     async_run_on_cpu(target, do_set_vcpu_runstate_gpa,
978                      RUN_ON_CPU_HOST_ULONG(gpa));
979     return 0;
980 }
981 
982 static uint64_t kvm_get_current_ns(void)
983 {
984     struct kvm_clock_data data;
985     int ret;
986 
987     ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
988     if (ret < 0) {
989         fprintf(stderr, "KVM_GET_CLOCK failed: %s\n", strerror(ret));
990                 abort();
991     }
992 
993     return data.clock;
994 }
995 
996 static void xen_vcpu_singleshot_timer_event(void *opaque)
997 {
998     CPUState *cpu = opaque;
999     CPUX86State *env = &X86_CPU(cpu)->env;
1000     uint16_t port = env->xen_virq[VIRQ_TIMER];
1001 
1002     if (likely(port)) {
1003         xen_evtchn_set_port(port);
1004     }
1005 
1006     qemu_mutex_lock(&env->xen_timers_lock);
1007     env->xen_singleshot_timer_ns = 0;
1008     qemu_mutex_unlock(&env->xen_timers_lock);
1009 }
1010 
1011 static void xen_vcpu_periodic_timer_event(void *opaque)
1012 {
1013     CPUState *cpu = opaque;
1014     CPUX86State *env = &X86_CPU(cpu)->env;
1015     uint16_t port = env->xen_virq[VIRQ_TIMER];
1016     int64_t qemu_now;
1017 
1018     if (likely(port)) {
1019         xen_evtchn_set_port(port);
1020     }
1021 
1022     qemu_mutex_lock(&env->xen_timers_lock);
1023 
1024     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1025     timer_mod_ns(env->xen_periodic_timer,
1026                  qemu_now + env->xen_periodic_timer_period);
1027 
1028     qemu_mutex_unlock(&env->xen_timers_lock);
1029 }
1030 
1031 static int do_set_periodic_timer(CPUState *target, uint64_t period_ns)
1032 {
1033     CPUX86State *tenv = &X86_CPU(target)->env;
1034     int64_t qemu_now;
1035 
1036     timer_del(tenv->xen_periodic_timer);
1037 
1038     qemu_mutex_lock(&tenv->xen_timers_lock);
1039 
1040     qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1041     timer_mod_ns(tenv->xen_periodic_timer, qemu_now + period_ns);
1042     tenv->xen_periodic_timer_period = period_ns;
1043 
1044     qemu_mutex_unlock(&tenv->xen_timers_lock);
1045     return 0;
1046 }
1047 
/* Convert milliseconds / microseconds to nanoseconds (as int64_t). */
#define MILLISECS(_ms)  ((int64_t)((_ms) * 1000000ULL))
#define MICROSECS(_us)  ((int64_t)((_us) * 1000ULL))
/*
 * Largest positive 64-bit signed time value. The shift has to be done on
 * an unsigned operand: (int64_t)~0ull is -1, and shifting that right
 * yields -1 again instead of the maximum.
 */
#define STIME_MAX ((time_t)((uint64_t)~0ull >> 1))
/* Chosen so (NOW() + delta) won't overflow without an uptime of 200 years */
#define STIME_DELTA_MAX ((int64_t)((uint64_t)~0ull >> 2))
1053 
1054 static int vcpuop_set_periodic_timer(CPUState *cs, CPUState *target,
1055                                      uint64_t arg)
1056 {
1057     struct vcpu_set_periodic_timer spt;
1058 
1059     qemu_build_assert(sizeof(spt) == 8);
1060     if (kvm_copy_from_gva(cs, arg, &spt, sizeof(spt))) {
1061         return -EFAULT;
1062     }
1063 
1064     if (spt.period_ns < MILLISECS(1) || spt.period_ns > STIME_DELTA_MAX) {
1065         return -EINVAL;
1066     }
1067 
1068     return do_set_periodic_timer(target, spt.period_ns);
1069 }
1070 
1071 static int vcpuop_stop_periodic_timer(CPUState *target)
1072 {
1073     CPUX86State *tenv = &X86_CPU(target)->env;
1074 
1075     qemu_mutex_lock(&tenv->xen_timers_lock);
1076 
1077     timer_del(tenv->xen_periodic_timer);
1078     tenv->xen_periodic_timer_period = 0;
1079 
1080     qemu_mutex_unlock(&tenv->xen_timers_lock);
1081     return 0;
1082 }
1083 
1084 /*
1085  * Userspace handling of timer, for older kernels.
1086  * Must always be called with xen_timers_lock held.
1087  */
1088 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
1089                                    bool linux_wa)
1090 {
1091     CPUX86State *env = &X86_CPU(cs)->env;
1092     int64_t now = kvm_get_current_ns();
1093     int64_t qemu_now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
1094     int64_t delta = timeout_abs - now;
1095 
1096     if (linux_wa && unlikely((int64_t)timeout_abs < 0 ||
1097                              (delta > 0 && (uint32_t)(delta >> 50) != 0))) {
1098         /*
1099          * Xen has a 'Linux workaround' in do_set_timer_op() which checks
1100          * for negative absolute timeout values (caused by integer
1101          * overflow), and for values about 13 days in the future (2^50ns)
1102          * which would be caused by jiffies overflow. For those cases, it
1103          * sets the timeout 100ms in the future (not *too* soon, since if
1104          * a guest really did set a long timeout on purpose we don't want
1105          * to keep churning CPU time by waking it up).
1106          */
1107         delta = (100 * SCALE_MS);
1108         timeout_abs = now + delta;
1109     }
1110 
1111     timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
1112     env->xen_singleshot_timer_ns = now + delta;
1113     return 0;
1114 }
1115 
1116 static int vcpuop_set_singleshot_timer(CPUState *cs, uint64_t arg)
1117 {
1118     struct vcpu_set_singleshot_timer sst = { 0 };
1119 
1120     /*
1121      * The struct is a uint64_t followed by a uint32_t. On 32-bit that
1122      * makes it 12 bytes. On 64-bit it gets padded to 16. The parts
1123      * that get used are identical, and there's four bytes of padding
1124      * unused at the end. For true Xen compatibility we should attempt
1125      * to copy the full 16 bytes from 64-bit guests, and return -EFAULT
1126      * if we can't get the padding too. But that's daft. Just copy what
1127      * we need.
1128      */
1129     qemu_build_assert(offsetof(struct vcpu_set_singleshot_timer, flags) == 8);
1130     qemu_build_assert(sizeof(sst) >= 12);
1131 
1132     if (kvm_copy_from_gva(cs, arg, &sst, 12)) {
1133         return -EFAULT;
1134     }
1135 
1136     QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1137 
1138     /*
1139      * We ignore the VCPU_SSHOTTMR_future flag, just as Xen now does.
1140      * The only guest that ever used it, got it wrong.
1141      * https://xenbits.xen.org/gitweb/?p=xen.git;a=commitdiff;h=19c6cbd909
1142      */
1143     return do_set_singleshot_timer(cs, sst.timeout_abs_ns, false);
1144 }
1145 
1146 static int vcpuop_stop_singleshot_timer(CPUState *cs)
1147 {
1148     CPUX86State *env = &X86_CPU(cs)->env;
1149 
1150     qemu_mutex_lock(&env->xen_timers_lock);
1151 
1152     timer_del(env->xen_singleshot_timer);
1153     env->xen_singleshot_timer_ns = 0;
1154 
1155     qemu_mutex_unlock(&env->xen_timers_lock);
1156     return 0;
1157 }
1158 
1159 static bool kvm_xen_hcall_set_timer_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1160                                        uint64_t timeout)
1161 {
1162     int err;
1163 
1164     if (unlikely(timeout == 0)) {
1165         err = vcpuop_stop_singleshot_timer(CPU(cpu));
1166     } else {
1167         QEMU_LOCK_GUARD(&X86_CPU(cpu)->env.xen_timers_lock);
1168         err = do_set_singleshot_timer(CPU(cpu), timeout, true);
1169     }
1170     exit->u.hcall.result = err;
1171     return true;
1172 }
1173 
1174 static bool kvm_xen_hcall_vcpu_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1175                                   int cmd, int vcpu_id, uint64_t arg)
1176 {
1177     CPUState *cs = CPU(cpu);
1178     CPUState *dest = cs->cpu_index == vcpu_id ? cs : qemu_get_cpu(vcpu_id);
1179     int err;
1180 
1181     if (!dest) {
1182         err = -ENOENT;
1183         goto out;
1184     }
1185 
1186     switch (cmd) {
1187     case VCPUOP_register_runstate_memory_area:
1188         err = vcpuop_register_runstate_info(cs, dest, arg);
1189         break;
1190     case VCPUOP_register_vcpu_time_memory_area:
1191         err = vcpuop_register_vcpu_time_info(cs, dest, arg);
1192         break;
1193     case VCPUOP_register_vcpu_info:
1194         err = vcpuop_register_vcpu_info(cs, dest, arg);
1195         break;
1196     case VCPUOP_set_singleshot_timer: {
1197         if (cs->cpu_index == vcpu_id) {
1198             err = vcpuop_set_singleshot_timer(dest, arg);
1199         } else {
1200             err = -EINVAL;
1201         }
1202         break;
1203     }
1204     case VCPUOP_stop_singleshot_timer:
1205         if (cs->cpu_index == vcpu_id) {
1206             err = vcpuop_stop_singleshot_timer(dest);
1207         } else {
1208             err = -EINVAL;
1209         }
1210         break;
1211     case VCPUOP_set_periodic_timer: {
1212         err = vcpuop_set_periodic_timer(cs, dest, arg);
1213         break;
1214     }
1215     case VCPUOP_stop_periodic_timer:
1216         err = vcpuop_stop_periodic_timer(dest);
1217         break;
1218 
1219     default:
1220         return false;
1221     }
1222 
1223  out:
1224     exit->u.hcall.result = err;
1225     return true;
1226 }
1227 
1228 static bool kvm_xen_hcall_evtchn_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1229                                     int cmd, uint64_t arg)
1230 {
1231     CPUState *cs = CPU(cpu);
1232     int err = -ENOSYS;
1233 
1234     switch (cmd) {
1235     case EVTCHNOP_init_control:
1236     case EVTCHNOP_expand_array:
1237     case EVTCHNOP_set_priority:
1238         /* We do not support FIFO channels at this point */
1239         err = -ENOSYS;
1240         break;
1241 
1242     case EVTCHNOP_status: {
1243         struct evtchn_status status;
1244 
1245         qemu_build_assert(sizeof(status) == 24);
1246         if (kvm_copy_from_gva(cs, arg, &status, sizeof(status))) {
1247             err = -EFAULT;
1248             break;
1249         }
1250 
1251         err = xen_evtchn_status_op(&status);
1252         if (!err && kvm_copy_to_gva(cs, arg, &status, sizeof(status))) {
1253             err = -EFAULT;
1254         }
1255         break;
1256     }
1257     case EVTCHNOP_close: {
1258         struct evtchn_close close;
1259 
1260         qemu_build_assert(sizeof(close) == 4);
1261         if (kvm_copy_from_gva(cs, arg, &close, sizeof(close))) {
1262             err = -EFAULT;
1263             break;
1264         }
1265 
1266         err = xen_evtchn_close_op(&close);
1267         break;
1268     }
1269     case EVTCHNOP_unmask: {
1270         struct evtchn_unmask unmask;
1271 
1272         qemu_build_assert(sizeof(unmask) == 4);
1273         if (kvm_copy_from_gva(cs, arg, &unmask, sizeof(unmask))) {
1274             err = -EFAULT;
1275             break;
1276         }
1277 
1278         err = xen_evtchn_unmask_op(&unmask);
1279         break;
1280     }
1281     case EVTCHNOP_bind_virq: {
1282         struct evtchn_bind_virq virq;
1283 
1284         qemu_build_assert(sizeof(virq) == 12);
1285         if (kvm_copy_from_gva(cs, arg, &virq, sizeof(virq))) {
1286             err = -EFAULT;
1287             break;
1288         }
1289 
1290         err = xen_evtchn_bind_virq_op(&virq);
1291         if (!err && kvm_copy_to_gva(cs, arg, &virq, sizeof(virq))) {
1292             err = -EFAULT;
1293         }
1294         break;
1295     }
1296     case EVTCHNOP_bind_pirq: {
1297         struct evtchn_bind_pirq pirq;
1298 
1299         qemu_build_assert(sizeof(pirq) == 12);
1300         if (kvm_copy_from_gva(cs, arg, &pirq, sizeof(pirq))) {
1301             err = -EFAULT;
1302             break;
1303         }
1304 
1305         err = xen_evtchn_bind_pirq_op(&pirq);
1306         if (!err && kvm_copy_to_gva(cs, arg, &pirq, sizeof(pirq))) {
1307             err = -EFAULT;
1308         }
1309         break;
1310     }
1311     case EVTCHNOP_bind_ipi: {
1312         struct evtchn_bind_ipi ipi;
1313 
1314         qemu_build_assert(sizeof(ipi) == 8);
1315         if (kvm_copy_from_gva(cs, arg, &ipi, sizeof(ipi))) {
1316             err = -EFAULT;
1317             break;
1318         }
1319 
1320         err = xen_evtchn_bind_ipi_op(&ipi);
1321         if (!err && kvm_copy_to_gva(cs, arg, &ipi, sizeof(ipi))) {
1322             err = -EFAULT;
1323         }
1324         break;
1325     }
1326     case EVTCHNOP_send: {
1327         struct evtchn_send send;
1328 
1329         qemu_build_assert(sizeof(send) == 4);
1330         if (kvm_copy_from_gva(cs, arg, &send, sizeof(send))) {
1331             err = -EFAULT;
1332             break;
1333         }
1334 
1335         err = xen_evtchn_send_op(&send);
1336         break;
1337     }
1338     case EVTCHNOP_alloc_unbound: {
1339         struct evtchn_alloc_unbound alloc;
1340 
1341         qemu_build_assert(sizeof(alloc) == 8);
1342         if (kvm_copy_from_gva(cs, arg, &alloc, sizeof(alloc))) {
1343             err = -EFAULT;
1344             break;
1345         }
1346 
1347         err = xen_evtchn_alloc_unbound_op(&alloc);
1348         if (!err && kvm_copy_to_gva(cs, arg, &alloc, sizeof(alloc))) {
1349             err = -EFAULT;
1350         }
1351         break;
1352     }
1353     case EVTCHNOP_bind_interdomain: {
1354         struct evtchn_bind_interdomain interdomain;
1355 
1356         qemu_build_assert(sizeof(interdomain) == 12);
1357         if (kvm_copy_from_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1358             err = -EFAULT;
1359             break;
1360         }
1361 
1362         err = xen_evtchn_bind_interdomain_op(&interdomain);
1363         if (!err &&
1364             kvm_copy_to_gva(cs, arg, &interdomain, sizeof(interdomain))) {
1365             err = -EFAULT;
1366         }
1367         break;
1368     }
1369     case EVTCHNOP_bind_vcpu: {
1370         struct evtchn_bind_vcpu vcpu;
1371 
1372         qemu_build_assert(sizeof(vcpu) == 8);
1373         if (kvm_copy_from_gva(cs, arg, &vcpu, sizeof(vcpu))) {
1374             err = -EFAULT;
1375             break;
1376         }
1377 
1378         err = xen_evtchn_bind_vcpu_op(&vcpu);
1379         break;
1380     }
1381     case EVTCHNOP_reset: {
1382         struct evtchn_reset reset;
1383 
1384         qemu_build_assert(sizeof(reset) == 2);
1385         if (kvm_copy_from_gva(cs, arg, &reset, sizeof(reset))) {
1386             err = -EFAULT;
1387             break;
1388         }
1389 
1390         err = xen_evtchn_reset_op(&reset);
1391         break;
1392     }
1393     default:
1394         return false;
1395     }
1396 
1397     exit->u.hcall.result = err;
1398     return true;
1399 }
1400 
1401 int kvm_xen_soft_reset(void)
1402 {
1403     CPUState *cpu;
1404     int err;
1405 
1406     assert(bql_locked());
1407 
1408     trace_kvm_xen_soft_reset();
1409 
1410     err = xen_evtchn_soft_reset();
1411     if (err) {
1412         return err;
1413     }
1414 
1415     /*
1416      * Zero is the reset/startup state for HVM_PARAM_CALLBACK_IRQ. Strictly,
1417      * it maps to HVM_PARAM_CALLBACK_TYPE_GSI with GSI#0, but Xen refuses to
1418      * to deliver to the timer interrupt and treats that as 'disabled'.
1419      */
1420     err = xen_evtchn_set_callback_param(0);
1421     if (err) {
1422         return err;
1423     }
1424 
1425     CPU_FOREACH(cpu) {
1426         async_run_on_cpu(cpu, do_vcpu_soft_reset, RUN_ON_CPU_NULL);
1427     }
1428 
1429     err = xen_overlay_map_shinfo_page(INVALID_GFN);
1430     if (err) {
1431         return err;
1432     }
1433 
1434     err = xen_gnttab_reset();
1435     if (err) {
1436         return err;
1437     }
1438 
1439     err = xen_primary_console_reset();
1440     if (err) {
1441         return err;
1442     }
1443 
1444     err = xen_xenstore_reset();
1445     if (err) {
1446         return err;
1447     }
1448 
1449     return 0;
1450 }
1451 
1452 static int schedop_shutdown(CPUState *cs, uint64_t arg)
1453 {
1454     struct sched_shutdown shutdown;
1455     int ret = 0;
1456 
1457     /* No need for 32/64 compat handling */
1458     qemu_build_assert(sizeof(shutdown) == 4);
1459 
1460     if (kvm_copy_from_gva(cs, arg, &shutdown, sizeof(shutdown))) {
1461         return -EFAULT;
1462     }
1463 
1464     switch (shutdown.reason) {
1465     case SHUTDOWN_crash:
1466         cpu_dump_state(cs, stderr, CPU_DUMP_CODE);
1467         qemu_system_guest_panicked(NULL);
1468         break;
1469 
1470     case SHUTDOWN_reboot:
1471         qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1472         break;
1473 
1474     case SHUTDOWN_poweroff:
1475         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
1476         break;
1477 
1478     case SHUTDOWN_soft_reset:
1479         bql_lock();
1480         ret = kvm_xen_soft_reset();
1481         bql_unlock();
1482         break;
1483 
1484     default:
1485         ret = -EINVAL;
1486         break;
1487     }
1488 
1489     return ret;
1490 }
1491 
1492 static bool kvm_xen_hcall_sched_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1493                                    int cmd, uint64_t arg)
1494 {
1495     CPUState *cs = CPU(cpu);
1496     int err = -ENOSYS;
1497 
1498     switch (cmd) {
1499     case SCHEDOP_shutdown:
1500         err = schedop_shutdown(cs, arg);
1501         break;
1502 
1503     case SCHEDOP_poll:
1504         /*
1505          * Linux will panic if this doesn't work. Just yield; it's not
1506          * worth overthinking it because with event channel handling
1507          * in KVM, the kernel will intercept this and it will never
1508          * reach QEMU anyway. The semantics of the hypercall explicltly
1509          * permit spurious wakeups.
1510          */
1511     case SCHEDOP_yield:
1512         sched_yield();
1513         err = 0;
1514         break;
1515 
1516     default:
1517         return false;
1518     }
1519 
1520     exit->u.hcall.result = err;
1521     return true;
1522 }
1523 
1524 static bool kvm_xen_hcall_gnttab_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1525                                     int cmd, uint64_t arg, int count)
1526 {
1527     CPUState *cs = CPU(cpu);
1528     int err;
1529 
1530     switch (cmd) {
1531     case GNTTABOP_set_version: {
1532         struct gnttab_set_version set;
1533 
1534         qemu_build_assert(sizeof(set) == 4);
1535         if (kvm_copy_from_gva(cs, arg, &set, sizeof(set))) {
1536             err = -EFAULT;
1537             break;
1538         }
1539 
1540         err = xen_gnttab_set_version_op(&set);
1541         if (!err && kvm_copy_to_gva(cs, arg, &set, sizeof(set))) {
1542             err = -EFAULT;
1543         }
1544         break;
1545     }
1546     case GNTTABOP_get_version: {
1547         struct gnttab_get_version get;
1548 
1549         qemu_build_assert(sizeof(get) == 8);
1550         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1551             err = -EFAULT;
1552             break;
1553         }
1554 
1555         err = xen_gnttab_get_version_op(&get);
1556         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1557             err = -EFAULT;
1558         }
1559         break;
1560     }
1561     case GNTTABOP_query_size: {
1562         struct gnttab_query_size size;
1563 
1564         qemu_build_assert(sizeof(size) == 16);
1565         if (kvm_copy_from_gva(cs, arg, &size, sizeof(size))) {
1566             err = -EFAULT;
1567             break;
1568         }
1569 
1570         err = xen_gnttab_query_size_op(&size);
1571         if (!err && kvm_copy_to_gva(cs, arg, &size, sizeof(size))) {
1572             err = -EFAULT;
1573         }
1574         break;
1575     }
1576     case GNTTABOP_setup_table:
1577     case GNTTABOP_copy:
1578     case GNTTABOP_map_grant_ref:
1579     case GNTTABOP_unmap_grant_ref:
1580     case GNTTABOP_swap_grant_ref:
1581         return false;
1582 
1583     default:
1584         /* Xen explicitly returns -ENOSYS to HVM guests for all others */
1585         err = -ENOSYS;
1586         break;
1587     }
1588 
1589     exit->u.hcall.result = err;
1590     return true;
1591 }
1592 
1593 static bool kvm_xen_hcall_physdev_op(struct kvm_xen_exit *exit, X86CPU *cpu,
1594                                      int cmd, uint64_t arg)
1595 {
1596     CPUState *cs = CPU(cpu);
1597     int err;
1598 
1599     switch (cmd) {
1600     case PHYSDEVOP_map_pirq: {
1601         struct physdev_map_pirq map;
1602 
1603         if (hypercall_compat32(exit->u.hcall.longmode)) {
1604             struct compat_physdev_map_pirq *map32 = (void *)&map;
1605 
1606             if (kvm_copy_from_gva(cs, arg, map32, sizeof(*map32))) {
1607                 return -EFAULT;
1608             }
1609 
1610             /*
1611              * The only thing that's different is the alignment of the
1612              * uint64_t table_base at the end, which gets padding to make
1613              * it 64-bit aligned in the 64-bit version.
1614              */
1615             qemu_build_assert(sizeof(*map32) == 36);
1616             qemu_build_assert(offsetof(struct physdev_map_pirq, entry_nr) ==
1617                               offsetof(struct compat_physdev_map_pirq, entry_nr));
1618             memmove(&map.table_base, &map32->table_base, sizeof(map.table_base));
1619         } else {
1620             if (kvm_copy_from_gva(cs, arg, &map, sizeof(map))) {
1621                 err = -EFAULT;
1622                 break;
1623             }
1624         }
1625         err = xen_physdev_map_pirq(&map);
1626         /*
1627          * Since table_base is an IN parameter and won't be changed, just
1628          * copy the size of the compat structure back to the guest.
1629          */
1630         if (!err && kvm_copy_to_gva(cs, arg, &map,
1631                                     sizeof(struct compat_physdev_map_pirq))) {
1632             err = -EFAULT;
1633         }
1634         break;
1635     }
1636     case PHYSDEVOP_unmap_pirq: {
1637         struct physdev_unmap_pirq unmap;
1638 
1639         qemu_build_assert(sizeof(unmap) == 8);
1640         if (kvm_copy_from_gva(cs, arg, &unmap, sizeof(unmap))) {
1641             err = -EFAULT;
1642             break;
1643         }
1644 
1645         err = xen_physdev_unmap_pirq(&unmap);
1646         if (!err && kvm_copy_to_gva(cs, arg, &unmap, sizeof(unmap))) {
1647             err = -EFAULT;
1648         }
1649         break;
1650     }
1651     case PHYSDEVOP_eoi: {
1652         struct physdev_eoi eoi;
1653 
1654         qemu_build_assert(sizeof(eoi) == 4);
1655         if (kvm_copy_from_gva(cs, arg, &eoi, sizeof(eoi))) {
1656             err = -EFAULT;
1657             break;
1658         }
1659 
1660         err = xen_physdev_eoi_pirq(&eoi);
1661         if (!err && kvm_copy_to_gva(cs, arg, &eoi, sizeof(eoi))) {
1662             err = -EFAULT;
1663         }
1664         break;
1665     }
1666     case PHYSDEVOP_irq_status_query: {
1667         struct physdev_irq_status_query query;
1668 
1669         qemu_build_assert(sizeof(query) == 8);
1670         if (kvm_copy_from_gva(cs, arg, &query, sizeof(query))) {
1671             err = -EFAULT;
1672             break;
1673         }
1674 
1675         err = xen_physdev_query_pirq(&query);
1676         if (!err && kvm_copy_to_gva(cs, arg, &query, sizeof(query))) {
1677             err = -EFAULT;
1678         }
1679         break;
1680     }
1681     case PHYSDEVOP_get_free_pirq: {
1682         struct physdev_get_free_pirq get;
1683 
1684         qemu_build_assert(sizeof(get) == 8);
1685         if (kvm_copy_from_gva(cs, arg, &get, sizeof(get))) {
1686             err = -EFAULT;
1687             break;
1688         }
1689 
1690         err = xen_physdev_get_free_pirq(&get);
1691         if (!err && kvm_copy_to_gva(cs, arg, &get, sizeof(get))) {
1692             err = -EFAULT;
1693         }
1694         break;
1695     }
1696     case PHYSDEVOP_pirq_eoi_gmfn_v2: /* FreeBSD 13 makes this hypercall */
1697         err = -ENOSYS;
1698         break;
1699 
1700     default:
1701         return false;
1702     }
1703 
1704     exit->u.hcall.result = err;
1705     return true;
1706 }
1707 
1708 static bool do_kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1709 {
1710     uint16_t code = exit->u.hcall.input;
1711 
1712     if (exit->u.hcall.cpl > 0) {
1713         exit->u.hcall.result = -EPERM;
1714         return true;
1715     }
1716 
1717     switch (code) {
1718     case __HYPERVISOR_set_timer_op:
1719         if (exit->u.hcall.longmode) {
1720             return kvm_xen_hcall_set_timer_op(exit, cpu,
1721                                               exit->u.hcall.params[0]);
1722         } else {
1723             /* In 32-bit mode, the 64-bit timer value is in two args. */
1724             uint64_t val = ((uint64_t)exit->u.hcall.params[1]) << 32 |
1725                 (uint32_t)exit->u.hcall.params[0];
1726             return kvm_xen_hcall_set_timer_op(exit, cpu, val);
1727         }
1728     case __HYPERVISOR_grant_table_op:
1729         return kvm_xen_hcall_gnttab_op(exit, cpu, exit->u.hcall.params[0],
1730                                        exit->u.hcall.params[1],
1731                                        exit->u.hcall.params[2]);
1732     case __HYPERVISOR_sched_op:
1733         return kvm_xen_hcall_sched_op(exit, cpu, exit->u.hcall.params[0],
1734                                       exit->u.hcall.params[1]);
1735     case __HYPERVISOR_event_channel_op:
1736         return kvm_xen_hcall_evtchn_op(exit, cpu, exit->u.hcall.params[0],
1737                                        exit->u.hcall.params[1]);
1738     case __HYPERVISOR_vcpu_op:
1739         return kvm_xen_hcall_vcpu_op(exit, cpu,
1740                                      exit->u.hcall.params[0],
1741                                      exit->u.hcall.params[1],
1742                                      exit->u.hcall.params[2]);
1743     case __HYPERVISOR_hvm_op:
1744         return kvm_xen_hcall_hvm_op(exit, cpu, exit->u.hcall.params[0],
1745                                     exit->u.hcall.params[1]);
1746     case __HYPERVISOR_memory_op:
1747         return kvm_xen_hcall_memory_op(exit, cpu, exit->u.hcall.params[0],
1748                                        exit->u.hcall.params[1]);
1749     case __HYPERVISOR_physdev_op:
1750         return kvm_xen_hcall_physdev_op(exit, cpu, exit->u.hcall.params[0],
1751                                         exit->u.hcall.params[1]);
1752     case __HYPERVISOR_xen_version:
1753         return kvm_xen_hcall_xen_version(exit, cpu, exit->u.hcall.params[0],
1754                                          exit->u.hcall.params[1]);
1755     default:
1756         return false;
1757     }
1758 }
1759 
1760 int kvm_xen_handle_exit(X86CPU *cpu, struct kvm_xen_exit *exit)
1761 {
1762     if (exit->type != KVM_EXIT_XEN_HCALL) {
1763         return -1;
1764     }
1765 
1766     /*
1767      * The kernel latches the guest 32/64 mode when the MSR is used to fill
1768      * the hypercall page. So if we see a hypercall in a mode that doesn't
1769      * match our own idea of the guest mode, fetch the kernel's idea of the
1770      * "long mode" to remain in sync.
1771      */
1772     if (exit->u.hcall.longmode != xen_is_long_mode()) {
1773         xen_sync_long_mode();
1774     }
1775 
1776     if (!do_kvm_xen_handle_exit(cpu, exit)) {
1777         /*
1778          * Some hypercalls will be deliberately "implemented" by returning
1779          * -ENOSYS. This case is for hypercalls which are unexpected.
1780          */
1781         exit->u.hcall.result = -ENOSYS;
1782         qemu_log_mask(LOG_UNIMP, "Unimplemented Xen hypercall %"
1783                       PRId64 " (0x%" PRIx64 " 0x%" PRIx64 " 0x%" PRIx64 ")\n",
1784                       (uint64_t)exit->u.hcall.input,
1785                       (uint64_t)exit->u.hcall.params[0],
1786                       (uint64_t)exit->u.hcall.params[1],
1787                       (uint64_t)exit->u.hcall.params[2]);
1788     }
1789 
1790     trace_kvm_xen_hypercall(CPU(cpu)->cpu_index, exit->u.hcall.cpl,
1791                             exit->u.hcall.input, exit->u.hcall.params[0],
1792                             exit->u.hcall.params[1], exit->u.hcall.params[2],
1793                             exit->u.hcall.result);
1794     return 0;
1795 }
1796 
1797 uint16_t kvm_xen_get_gnttab_max_frames(void)
1798 {
1799     KVMState *s = KVM_STATE(current_accel());
1800     return s->xen_gnttab_max_frames;
1801 }
1802 
1803 uint16_t kvm_xen_get_evtchn_max_pirq(void)
1804 {
1805     KVMState *s = KVM_STATE(current_accel());
1806     return s->xen_evtchn_max_pirq;
1807 }
1808 
1809 int kvm_put_xen_state(CPUState *cs)
1810 {
1811     X86CPU *cpu = X86_CPU(cs);
1812     CPUX86State *env = &cpu->env;
1813     uint64_t gpa;
1814     int ret;
1815 
1816     gpa = env->xen_vcpu_info_gpa;
1817     if (gpa == INVALID_GPA) {
1818         gpa = env->xen_vcpu_info_default_gpa;
1819     }
1820 
1821     if (gpa != INVALID_GPA) {
1822         ret = set_vcpu_info(cs, gpa);
1823         if (ret < 0) {
1824             return ret;
1825         }
1826     }
1827 
1828     gpa = env->xen_vcpu_time_info_gpa;
1829     if (gpa != INVALID_GPA) {
1830         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO,
1831                                     gpa);
1832         if (ret < 0) {
1833             return ret;
1834         }
1835     }
1836 
1837     gpa = env->xen_vcpu_runstate_gpa;
1838     if (gpa != INVALID_GPA) {
1839         ret = kvm_xen_set_vcpu_attr(cs, KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR,
1840                                     gpa);
1841         if (ret < 0) {
1842             return ret;
1843         }
1844     }
1845 
1846     if (env->xen_periodic_timer_period) {
1847         ret = do_set_periodic_timer(cs, env->xen_periodic_timer_period);
1848         if (ret < 0) {
1849             return ret;
1850         }
1851     }
1852 
1853     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1854         /*
1855          * If the kernel has EVTCHN_SEND support then it handles timers too,
1856          * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
1857          */
1858         QEMU_LOCK_GUARD(&env->xen_timers_lock);
1859         if (env->xen_singleshot_timer_ns) {
1860             ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
1861                                           false);
1862             if (ret < 0) {
1863                 return ret;
1864             }
1865         }
1866         return 0;
1867     }
1868 
1869     if (env->xen_vcpu_callback_vector) {
1870         ret = kvm_xen_set_vcpu_callback_vector(cs);
1871         if (ret < 0) {
1872             return ret;
1873         }
1874     }
1875 
1876     if (env->xen_virq[VIRQ_TIMER]) {
1877         do_set_vcpu_timer_virq(cs,
1878                                RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
1879     }
1880     return 0;
1881 }
1882 
1883 int kvm_get_xen_state(CPUState *cs)
1884 {
1885     X86CPU *cpu = X86_CPU(cs);
1886     CPUX86State *env = &cpu->env;
1887     uint64_t gpa;
1888     int ret;
1889 
1890     /*
1891      * The kernel does not mark vcpu_info as dirty when it delivers interrupts
1892      * to it. It's up to userspace to *assume* that any page shared thus is
1893      * always considered dirty. The shared_info page is different since it's
1894      * an overlay and migrated separately anyway.
1895      */
1896     gpa = env->xen_vcpu_info_gpa;
1897     if (gpa == INVALID_GPA) {
1898         gpa = env->xen_vcpu_info_default_gpa;
1899     }
1900     if (gpa != INVALID_GPA) {
1901         MemoryRegionSection mrs = memory_region_find(get_system_memory(),
1902                                                      gpa,
1903                                                      sizeof(struct vcpu_info));
1904         if (mrs.mr &&
1905             !int128_lt(mrs.size, int128_make64(sizeof(struct vcpu_info)))) {
1906             memory_region_set_dirty(mrs.mr, mrs.offset_within_region,
1907                                     sizeof(struct vcpu_info));
1908         }
1909     }
1910 
1911     if (!kvm_xen_has_cap(EVTCHN_SEND)) {
1912         return 0;
1913     }
1914 
1915     /*
1916      * If the kernel is accelerating timers, read out the current value of the
1917      * singleshot timer deadline.
1918      */
1919     if (env->xen_virq[VIRQ_TIMER]) {
1920         struct kvm_xen_vcpu_attr va = {
1921             .type = KVM_XEN_VCPU_ATTR_TYPE_TIMER,
1922         };
1923         ret = kvm_vcpu_ioctl(cs, KVM_XEN_VCPU_GET_ATTR, &va);
1924         if (ret < 0) {
1925             return ret;
1926         }
1927 
1928         /*
1929          * This locking is fairly pointless, and is here to appease Coverity.
1930          * There is an unavoidable race condition if a different vCPU sets a
1931          * timer for this vCPU after the value has been read out. But that's
1932          * OK in practice because *all* the vCPUs need to be stopped before
1933          * we set about migrating their state.
1934          */
1935         QEMU_LOCK_GUARD(&X86_CPU(cs)->env.xen_timers_lock);
1936         env->xen_singleshot_timer_ns = va.u.timer.expires_ns;
1937     }
1938 
1939     return 0;
1940 }
1941